fenguhao commited on
Commit
b56ad99
·
verified ·
1 Parent(s): 3b96b39

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,9 @@
2
  license: apache-2.0
3
  base_model: alignment-handbook/zephyr-7b-sft-full
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - dpo
9
  - generated_from_trainer
10
- datasets:
11
- - HuggingFaceH4/ultrafeedback_binarized
12
  model-index:
13
  - name: zephyr-7b-dpo-full
14
  results: []
@@ -19,17 +15,17 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # zephyr-7b-dpo-full
21
 
22
- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the HuggingFaceH4/ultrafeedback_binarized dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 0.4993
25
- - Rewards/chosen: -1.3296
26
- - Rewards/rejected: -2.3308
27
- - Rewards/accuracies: 0.7718
28
- - Rewards/margins: 1.0012
29
- - Logps/rejected: -494.8592
30
- - Logps/chosen: -417.0712
31
- - Logits/rejected: 2.8442
32
- - Logits/chosen: 2.1731
33
 
34
  ## Model description
35
 
@@ -66,15 +62,15 @@ The following hyperparameters were used during training:
66
 
67
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
68
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
69
- | 0.6359 | 0.1 | 100 | 0.6497 | -0.5228 | -0.6565 | 0.6766 | 0.1337 | -327.4267 | -336.3859 | -2.5311 | -2.5647 |
70
- | 0.5609 | 0.21 | 200 | 0.5503 | -1.0030 | -1.6845 | 0.7599 | 0.6814 | -430.2245 | -384.4107 | 0.3543 | -0.0049 |
71
- | 0.515 | 0.31 | 300 | 0.5301 | -1.0052 | -1.8726 | 0.7659 | 0.8674 | -449.0401 | -384.6346 | 0.9044 | 0.2913 |
72
- | 0.49 | 0.42 | 400 | 0.5220 | -1.2561 | -2.1216 | 0.7599 | 0.8655 | -473.9429 | -409.7225 | 2.0848 | 1.5415 |
73
- | 0.513 | 0.52 | 500 | 0.5144 | -1.2211 | -2.1313 | 0.7599 | 0.9101 | -474.9064 | -406.2240 | 2.7724 | 2.2683 |
74
- | 0.491 | 0.63 | 600 | 0.5091 | -1.2471 | -2.2323 | 0.7698 | 0.9852 | -485.0119 | -408.8233 | 3.0663 | 2.4025 |
75
- | 0.4633 | 0.73 | 700 | 0.5028 | -1.3279 | -2.2883 | 0.7798 | 0.9605 | -490.6107 | -416.8968 | 2.7686 | 2.1855 |
76
- | 0.4676 | 0.84 | 800 | 0.5004 | -1.4612 | -2.4850 | 0.7679 | 1.0239 | -510.2817 | -430.2271 | 3.0074 | 2.3628 |
77
- | 0.4959 | 0.94 | 900 | 0.4994 | -1.3270 | -2.3263 | 0.7738 | 0.9992 | -494.4040 | -416.8122 | 2.8412 | 2.1702 |
78
 
79
 
80
  ### Framework versions
 
2
  license: apache-2.0
3
  base_model: alignment-handbook/zephyr-7b-sft-full
4
  tags:
 
 
5
  - trl
6
  - dpo
7
  - generated_from_trainer
 
 
8
  model-index:
9
  - name: zephyr-7b-dpo-full
10
  results: []
 
15
 
16
  # zephyr-7b-dpo-full
17
 
18
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.6712
21
+ - Rewards/chosen: -2.0287
22
+ - Rewards/rejected: -3.3245
23
+ - Rewards/accuracies: 0.7639
24
+ - Rewards/margins: 1.2958
25
+ - Logps/rejected: -594.2247
26
+ - Logps/chosen: -486.9804
27
+ - Logits/rejected: 3.7376
28
+ - Logits/chosen: 2.4533
29
 
30
  ## Model description
31
 
 
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
65
+ | 0.8957 | 0.1 | 100 | 0.9028 | -0.5210 | -0.6849 | 0.6905 | 0.1639 | -330.2668 | -336.2060 | -2.5107 | -2.5460 |
66
+ | 0.7658 | 0.21 | 200 | 0.7650 | -0.9414 | -1.6932 | 0.7460 | 0.7519 | -431.1015 | -378.2476 | 0.3347 | -0.1529 |
67
+ | 0.7079 | 0.31 | 300 | 0.7289 | -1.3837 | -2.4868 | 0.7560 | 1.1031 | -510.4591 | -422.4754 | 1.8370 | 0.8744 |
68
+ | 0.6806 | 0.42 | 400 | 0.7040 | -1.3285 | -2.4190 | 0.7698 | 1.0904 | -503.6740 | -416.9630 | 1.2713 | 0.0992 |
69
+ | 0.7129 | 0.52 | 500 | 0.6980 | -1.4621 | -2.5268 | 0.7440 | 1.0648 | -514.4609 | -430.3167 | 2.3343 | 1.4091 |
70
+ | 0.6636 | 0.63 | 600 | 0.6877 | -1.3328 | -2.5188 | 0.75 | 1.1861 | -513.6627 | -417.3850 | 2.2082 | 0.7470 |
71
+ | 0.6217 | 0.73 | 700 | 0.6762 | -1.8908 | -3.1786 | 0.7698 | 1.2878 | -579.6354 | -473.1887 | 3.8163 | 2.5932 |
72
+ | 0.6418 | 0.84 | 800 | 0.6712 | -2.0993 | -3.4028 | 0.7679 | 1.3035 | -602.0607 | -494.0422 | 3.8655 | 2.6092 |
73
+ | 0.6678 | 0.94 | 900 | 0.6716 | -2.0307 | -3.3233 | 0.7639 | 1.2926 | -594.1103 | -487.1844 | 3.7332 | 2.4518 |
74
 
75
 
76
  ### Framework versions
all_results.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_logits/chosen": 2.1731138229370117,
4
- "eval_logits/rejected": 2.8442113399505615,
5
- "eval_logps/chosen": -417.0711669921875,
6
- "eval_logps/rejected": -494.8591613769531,
7
- "eval_loss": 0.4992651343345642,
8
- "eval_rewards/accuracies": 0.77182537317276,
9
- "eval_rewards/chosen": -1.329614520072937,
10
- "eval_rewards/margins": 1.00119149684906,
11
- "eval_rewards/rejected": -2.330806255340576,
12
- "eval_runtime": 243.9609,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 8.198,
15
  "eval_steps_per_second": 0.258,
16
- "train_loss": 0.5295458661324066,
17
- "train_runtime": 21408.4408,
18
  "train_samples": 61135,
19
- "train_samples_per_second": 2.856,
20
- "train_steps_per_second": 0.045
21
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_logits/chosen": 2.4533472061157227,
4
+ "eval_logits/rejected": 3.7376418113708496,
5
+ "eval_logps/chosen": -486.9803771972656,
6
+ "eval_logps/rejected": -594.2247314453125,
7
+ "eval_loss": 0.6712061166763306,
8
+ "eval_rewards/accuracies": 0.7638888955116272,
9
+ "eval_rewards/chosen": -2.0287060737609863,
10
+ "eval_rewards/margins": 1.2957550287246704,
11
+ "eval_rewards/rejected": -3.324460983276367,
12
+ "eval_runtime": 244.3063,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 8.186,
15
  "eval_steps_per_second": 0.258,
16
+ "train_loss": 0.7263204834224042,
17
+ "train_runtime": 20734.7169,
18
  "train_samples": 61135,
19
+ "train_samples_per_second": 2.948,
20
+ "train_steps_per_second": 0.046
21
  }
config.json CHANGED
@@ -21,6 +21,6 @@
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
  "transformers_version": "4.36.2",
24
- "use_cache": true,
25
  "vocab_size": 32000
26
  }
 
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
  "transformers_version": "4.36.2",
24
+ "use_cache": false,
25
  "vocab_size": 32000
26
  }
eval_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_logits/chosen": 2.1731138229370117,
4
- "eval_logits/rejected": 2.8442113399505615,
5
- "eval_logps/chosen": -417.0711669921875,
6
- "eval_logps/rejected": -494.8591613769531,
7
- "eval_loss": 0.4992651343345642,
8
- "eval_rewards/accuracies": 0.77182537317276,
9
- "eval_rewards/chosen": -1.329614520072937,
10
- "eval_rewards/margins": 1.00119149684906,
11
- "eval_rewards/rejected": -2.330806255340576,
12
- "eval_runtime": 243.9609,
13
  "eval_samples": 2000,
14
- "eval_samples_per_second": 8.198,
15
  "eval_steps_per_second": 0.258
16
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_logits/chosen": 2.4533472061157227,
4
+ "eval_logits/rejected": 3.7376418113708496,
5
+ "eval_logps/chosen": -486.9803771972656,
6
+ "eval_logps/rejected": -594.2247314453125,
7
+ "eval_loss": 0.6712061166763306,
8
+ "eval_rewards/accuracies": 0.7638888955116272,
9
+ "eval_rewards/chosen": -2.0287060737609863,
10
+ "eval_rewards/margins": 1.2957550287246704,
11
+ "eval_rewards/rejected": -3.324460983276367,
12
+ "eval_runtime": 244.3063,
13
  "eval_samples": 2000,
14
+ "eval_samples_per_second": 8.186,
15
  "eval_steps_per_second": 0.258
16
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a892e6108f6000fe3167ac0ebb202bc51c830b6d89320780ddfc3ef54029e44
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b1e9e9688ce643625583017cfcba65fa8ff37865db15e78443ca69585dd2607
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e6d783a1238baa5b5f683c0908c62994fdb9f3dda2a74bdcf60d8399fb4f084
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c520ae489cc180f688441279cef32fac2764d4f58b2900b8450d180cbd2d1e3
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9cab617c906e0c3ff6b4420793df6ed06ae053d27f7de8293d12732e306cbeb7
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7e6172e143dd652d2e901dbedaaa58e54a6eb5247664e6c4669fa1dfb45b15f
3
  size 4540516344
runs/Jun24_18-16-23_facf8d095d45/events.out.tfevents.1719253006.facf8d095d45.359765.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86428da338c601f6b6affd0dcfe571ba509fb0cd7cee84ad60687dc019f8af40
3
+ size 72293
runs/Jun24_18-16-23_facf8d095d45/events.out.tfevents.1719273985.facf8d095d45.359765.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fac01b95a1acb80341bf4c6df1e45b03a848e7e418301c9e1ed5aa85b5589458
3
+ size 828
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.5295458661324066,
4
- "train_runtime": 21408.4408,
5
  "train_samples": 61135,
6
- "train_samples_per_second": 2.856,
7
- "train_steps_per_second": 0.045
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.7263204834224042,
4
+ "train_runtime": 20734.7169,
5
  "train_samples": 61135,
6
+ "train_samples_per_second": 2.948,
7
+ "train_steps_per_second": 0.046
8
  }
trainer_state.json CHANGED
@@ -15,7 +15,7 @@
15
  "logits/rejected": -2.686896800994873,
16
  "logps/chosen": -229.94229125976562,
17
  "logps/rejected": -214.70114135742188,
18
- "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -25,1492 +25,1492 @@
25
  {
26
  "epoch": 0.01,
27
  "learning_rate": 5.208333333333333e-08,
28
- "logits/chosen": -2.6808853149414062,
29
- "logits/rejected": -2.7088348865509033,
30
- "logps/chosen": -295.84722900390625,
31
- "logps/rejected": -250.60598754882812,
32
- "loss": 0.6932,
33
- "rewards/accuracies": 0.4583333432674408,
34
- "rewards/chosen": 2.2828255168860778e-05,
35
- "rewards/margins": -5.2194358431734145e-05,
36
- "rewards/rejected": 7.502263906644657e-05,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.02,
41
  "learning_rate": 1.0416666666666667e-07,
42
- "logits/chosen": -2.6196653842926025,
43
- "logits/rejected": -2.6258511543273926,
44
- "logps/chosen": -271.2942810058594,
45
- "logps/rejected": -246.940185546875,
46
- "loss": 0.6929,
47
- "rewards/accuracies": 0.5,
48
- "rewards/chosen": 0.0005789586575701833,
49
- "rewards/margins": 0.0010592096950858831,
50
- "rewards/rejected": -0.0004802510084118694,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.03,
55
  "learning_rate": 1.5624999999999999e-07,
56
- "logits/chosen": -2.7037882804870605,
57
- "logits/rejected": -2.666579484939575,
58
- "logps/chosen": -278.3568115234375,
59
- "logps/rejected": -254.5241241455078,
60
- "loss": 0.6926,
61
- "rewards/accuracies": 0.550000011920929,
62
- "rewards/chosen": 0.0008040089160203934,
63
- "rewards/margins": 0.0013185159768909216,
64
- "rewards/rejected": -0.0005145071190781891,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.04,
69
  "learning_rate": 2.0833333333333333e-07,
70
- "logits/chosen": -2.6501824855804443,
71
- "logits/rejected": -2.637791395187378,
72
- "logps/chosen": -273.8215637207031,
73
- "logps/rejected": -237.7809600830078,
74
- "loss": 0.6907,
75
- "rewards/accuracies": 0.6875,
76
- "rewards/chosen": 0.004025370813906193,
77
- "rewards/margins": 0.007183588109910488,
78
- "rewards/rejected": -0.0031582186929881573,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.05,
83
  "learning_rate": 2.604166666666667e-07,
84
- "logits/chosen": -2.6747143268585205,
85
- "logits/rejected": -2.63855242729187,
86
- "logps/chosen": -296.03778076171875,
87
- "logps/rejected": -274.6916809082031,
88
- "loss": 0.6864,
89
- "rewards/accuracies": 0.65625,
90
- "rewards/chosen": 0.00938174407929182,
91
- "rewards/margins": 0.01614220067858696,
92
- "rewards/rejected": -0.00676045473664999,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.06,
97
  "learning_rate": 3.1249999999999997e-07,
98
- "logits/chosen": -2.6307215690612793,
99
- "logits/rejected": -2.6333343982696533,
100
- "logps/chosen": -285.3677062988281,
101
- "logps/rejected": -274.3479309082031,
102
- "loss": 0.6804,
103
- "rewards/accuracies": 0.675000011920929,
104
- "rewards/chosen": 0.02419787459075451,
105
- "rewards/margins": 0.02778133749961853,
106
- "rewards/rejected": -0.0035834647715091705,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.07,
111
  "learning_rate": 3.645833333333333e-07,
112
- "logits/chosen": -2.6513328552246094,
113
- "logits/rejected": -2.686645984649658,
114
- "logps/chosen": -310.8687438964844,
115
- "logps/rejected": -290.38031005859375,
116
- "loss": 0.6672,
117
- "rewards/accuracies": 0.699999988079071,
118
- "rewards/chosen": 0.03635421395301819,
119
- "rewards/margins": 0.05565086752176285,
120
- "rewards/rejected": -0.01929665170609951,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.08,
125
  "learning_rate": 4.1666666666666667e-07,
126
- "logits/chosen": -2.5258519649505615,
127
- "logits/rejected": -2.468043327331543,
128
- "logps/chosen": -304.2803649902344,
129
- "logps/rejected": -281.13604736328125,
130
- "loss": 0.644,
131
- "rewards/accuracies": 0.762499988079071,
132
- "rewards/chosen": -0.03719509392976761,
133
- "rewards/margins": 0.11543086916208267,
134
- "rewards/rejected": -0.15262596309185028,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.09,
139
  "learning_rate": 4.6874999999999996e-07,
140
- "logits/chosen": -2.531845808029175,
141
- "logits/rejected": -2.5026631355285645,
142
- "logps/chosen": -291.836181640625,
143
- "logps/rejected": -283.42706298828125,
144
- "loss": 0.6348,
145
- "rewards/accuracies": 0.699999988079071,
146
- "rewards/chosen": 0.0002045964793069288,
147
- "rewards/margins": 0.17069891095161438,
148
- "rewards/rejected": -0.17049431800842285,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.1,
153
  "learning_rate": 4.999732492681437e-07,
154
- "logits/chosen": -2.497889757156372,
155
- "logits/rejected": -2.487199306488037,
156
- "logps/chosen": -337.8491516113281,
157
- "logps/rejected": -340.85809326171875,
158
- "loss": 0.6359,
159
  "rewards/accuracies": 0.7124999761581421,
160
- "rewards/chosen": -0.32423847913742065,
161
- "rewards/margins": 0.16256344318389893,
162
- "rewards/rejected": -0.4868019223213196,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.1,
167
- "eval_logits/chosen": -2.564718008041382,
168
- "eval_logits/rejected": -2.53108549118042,
169
- "eval_logps/chosen": -336.38592529296875,
170
- "eval_logps/rejected": -327.4267272949219,
171
- "eval_loss": 0.6497124433517456,
172
- "eval_rewards/accuracies": 0.6765872836112976,
173
- "eval_rewards/chosen": -0.5227616429328918,
174
- "eval_rewards/margins": 0.13372045755386353,
175
- "eval_rewards/rejected": -0.6564821600914001,
176
- "eval_runtime": 243.9804,
177
- "eval_samples_per_second": 8.197,
178
- "eval_steps_per_second": 0.258,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.12,
183
  "learning_rate": 4.996723692767926e-07,
184
- "logits/chosen": -2.459810733795166,
185
- "logits/rejected": -2.393411636352539,
186
- "logps/chosen": -339.1122741699219,
187
- "logps/rejected": -311.82769775390625,
188
- "loss": 0.6391,
189
- "rewards/accuracies": 0.6875,
190
- "rewards/chosen": -0.7878889441490173,
191
- "rewards/margins": 0.16021695733070374,
192
- "rewards/rejected": -0.9481059312820435,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.13,
197
  "learning_rate": 4.990375746213598e-07,
198
- "logits/chosen": -2.3250021934509277,
199
- "logits/rejected": -2.2410531044006348,
200
- "logps/chosen": -377.79180908203125,
201
- "logps/rejected": -354.20977783203125,
202
- "loss": 0.6137,
203
- "rewards/accuracies": 0.7124999761581421,
204
- "rewards/chosen": -0.7221530675888062,
205
- "rewards/margins": 0.2862502932548523,
206
- "rewards/rejected": -1.0084033012390137,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.14,
211
  "learning_rate": 4.980697142834314e-07,
212
- "logits/chosen": -2.118708848953247,
213
- "logits/rejected": -1.9641555547714233,
214
- "logps/chosen": -409.3984680175781,
215
- "logps/rejected": -392.3823547363281,
216
- "loss": 0.6,
217
- "rewards/accuracies": 0.71875,
218
- "rewards/chosen": -0.891018271446228,
219
- "rewards/margins": 0.33814504742622375,
220
- "rewards/rejected": -1.2291632890701294,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.15,
225
  "learning_rate": 4.967700826904229e-07,
226
- "logits/chosen": -1.0956144332885742,
227
- "logits/rejected": -1.0446064472198486,
228
- "logps/chosen": -306.23236083984375,
229
- "logps/rejected": -339.3392639160156,
230
- "loss": 0.5511,
231
- "rewards/accuracies": 0.706250011920929,
232
- "rewards/chosen": -0.6644871830940247,
233
- "rewards/margins": 0.4738723337650299,
234
- "rewards/rejected": -1.1383594274520874,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.16,
239
  "learning_rate": 4.951404179843962e-07,
240
- "logits/chosen": -1.1295273303985596,
241
- "logits/rejected": -1.1123173236846924,
242
- "logps/chosen": -344.6251220703125,
243
- "logps/rejected": -387.8667297363281,
244
- "loss": 0.6012,
245
- "rewards/accuracies": 0.65625,
246
- "rewards/chosen": -0.6758090853691101,
247
- "rewards/margins": 0.35659271478652954,
248
- "rewards/rejected": -1.0324018001556396,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.17,
253
  "learning_rate": 4.931828996974498e-07,
254
- "logits/chosen": -1.5214803218841553,
255
- "logits/rejected": -1.17283034324646,
256
- "logps/chosen": -371.93804931640625,
257
- "logps/rejected": -402.75628662109375,
258
- "loss": 0.5542,
259
  "rewards/accuracies": 0.6937500238418579,
260
- "rewards/chosen": -0.7613044381141663,
261
- "rewards/margins": 0.3605394959449768,
262
- "rewards/rejected": -1.1218438148498535,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.18,
267
  "learning_rate": 4.909001458367866e-07,
268
- "logits/chosen": 0.024106794968247414,
269
- "logits/rejected": -0.007686579134315252,
270
- "logps/chosen": -420.8289489746094,
271
- "logps/rejected": -456.450439453125,
272
- "loss": 0.5631,
273
- "rewards/accuracies": 0.699999988079071,
274
- "rewards/chosen": -1.4776414632797241,
275
- "rewards/margins": 0.614984393119812,
276
- "rewards/rejected": -2.092625856399536,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.19,
281
  "learning_rate": 4.882952093833627e-07,
282
- "logits/chosen": 0.17635126411914825,
283
- "logits/rejected": 0.3050743639469147,
284
- "logps/chosen": -359.55633544921875,
285
- "logps/rejected": -410.02191162109375,
286
- "loss": 0.5361,
287
- "rewards/accuracies": 0.731249988079071,
288
- "rewards/chosen": -1.1035659313201904,
289
- "rewards/margins": 0.6763278245925903,
290
- "rewards/rejected": -1.7798936367034912,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.2,
295
  "learning_rate": 4.853715742087946e-07,
296
- "logits/chosen": 0.14620204269886017,
297
- "logits/rejected": 0.6025527715682983,
298
- "logps/chosen": -390.93023681640625,
299
- "logps/rejected": -438.7168884277344,
300
- "loss": 0.533,
301
- "rewards/accuracies": 0.7749999761581421,
302
- "rewards/chosen": -1.120954155921936,
303
- "rewards/margins": 0.7051862478256226,
304
- "rewards/rejected": -1.8261404037475586,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.21,
309
  "learning_rate": 4.821331504159906e-07,
310
- "logits/chosen": 0.39229562878608704,
311
- "logits/rejected": 0.8097684979438782,
312
- "logps/chosen": -403.7792053222656,
313
- "logps/rejected": -453.61492919921875,
314
- "loss": 0.5609,
315
- "rewards/accuracies": 0.731249988079071,
316
- "rewards/chosen": -1.1711080074310303,
317
- "rewards/margins": 0.6101128458976746,
318
- "rewards/rejected": -1.7812210321426392,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.21,
323
- "eval_logits/chosen": -0.004928875248879194,
324
- "eval_logits/rejected": 0.35432979464530945,
325
- "eval_logps/chosen": -384.41070556640625,
326
- "eval_logps/rejected": -430.2244873046875,
327
- "eval_loss": 0.550298810005188,
328
- "eval_rewards/accuracies": 0.7599206566810608,
329
- "eval_rewards/chosen": -1.0030099153518677,
330
- "eval_rewards/margins": 0.681449294090271,
331
- "eval_rewards/rejected": -1.6844590902328491,
332
- "eval_runtime": 244.4844,
333
- "eval_samples_per_second": 8.18,
334
- "eval_steps_per_second": 0.258,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.22,
339
  "learning_rate": 4.785842691097342e-07,
340
- "logits/chosen": -0.22027039527893066,
341
- "logits/rejected": 0.11937667429447174,
342
- "logps/chosen": -391.0158996582031,
343
- "logps/rejected": -396.7218322753906,
344
- "loss": 0.5501,
345
  "rewards/accuracies": 0.7250000238418579,
346
- "rewards/chosen": -0.857402503490448,
347
- "rewards/margins": 0.570809006690979,
348
- "rewards/rejected": -1.4282116889953613,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.23,
353
  "learning_rate": 4.7472967660421603e-07,
354
- "logits/chosen": -0.046684689819812775,
355
- "logits/rejected": 0.19581058621406555,
356
- "logps/chosen": -383.0498962402344,
357
- "logps/rejected": -437.441650390625,
358
- "loss": 0.5486,
359
- "rewards/accuracies": 0.706250011920929,
360
- "rewards/chosen": -0.9351595044136047,
361
- "rewards/margins": 0.6760483384132385,
362
- "rewards/rejected": -1.6112079620361328,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.24,
367
  "learning_rate": 4.705745280752585e-07,
368
- "logits/chosen": -0.4610627293586731,
369
- "logits/rejected": -0.08335449546575546,
370
- "logps/chosen": -404.11041259765625,
371
- "logps/rejected": -432.98236083984375,
372
- "loss": 0.5472,
373
- "rewards/accuracies": 0.7749999761581421,
374
- "rewards/chosen": -1.103994607925415,
375
- "rewards/margins": 0.7393767833709717,
376
- "rewards/rejected": -1.8433713912963867,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.25,
381
  "learning_rate": 4.6612438066572555e-07,
382
- "logits/chosen": -0.34058791399002075,
383
- "logits/rejected": 0.255656898021698,
384
- "logps/chosen": -370.15185546875,
385
- "logps/rejected": -384.154541015625,
386
- "loss": 0.5233,
387
- "rewards/accuracies": 0.706250011920929,
388
- "rewards/chosen": -0.9289584159851074,
389
- "rewards/margins": 0.6521793603897095,
390
- "rewards/rejected": -1.5811378955841064,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.26,
395
  "learning_rate": 4.6138518605333664e-07,
396
- "logits/chosen": 0.10690119117498398,
397
- "logits/rejected": 0.25003090500831604,
398
- "logps/chosen": -375.705810546875,
399
- "logps/rejected": -448.9117126464844,
400
- "loss": 0.5455,
401
- "rewards/accuracies": 0.699999988079071,
402
- "rewards/chosen": -1.0811748504638672,
403
- "rewards/margins": 0.5937505960464478,
404
- "rewards/rejected": -1.674925446510315,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.27,
409
  "learning_rate": 4.5636328249082514e-07,
410
- "logits/chosen": -0.6711705327033997,
411
- "logits/rejected": -0.24552011489868164,
412
- "logps/chosen": -369.1128845214844,
413
- "logps/rejected": -404.0663146972656,
414
- "loss": 0.5359,
415
- "rewards/accuracies": 0.75,
416
- "rewards/chosen": -0.8505151867866516,
417
- "rewards/margins": 0.611486554145813,
418
- "rewards/rejected": -1.4620015621185303,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.28,
423
  "learning_rate": 4.510653863290871e-07,
424
- "logits/chosen": -0.5455327033996582,
425
- "logits/rejected": -0.040910232812166214,
426
- "logps/chosen": -405.8040771484375,
427
- "logps/rejected": -431.89044189453125,
428
- "loss": 0.5219,
429
- "rewards/accuracies": 0.793749988079071,
430
- "rewards/chosen": -1.0395045280456543,
431
- "rewards/margins": 0.7771880030632019,
432
- "rewards/rejected": -1.816692590713501,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.29,
437
  "learning_rate": 4.4549858303465737e-07,
438
- "logits/chosen": -0.3963968753814697,
439
- "logits/rejected": 0.058730434626340866,
440
- "logps/chosen": -409.90142822265625,
441
- "logps/rejected": -475.1370544433594,
442
- "loss": 0.5194,
443
- "rewards/accuracies": 0.7749999761581421,
444
- "rewards/chosen": -1.1871297359466553,
445
- "rewards/margins": 0.7304742932319641,
446
- "rewards/rejected": -1.9176040887832642,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.3,
451
  "learning_rate": 4.396703177135261e-07,
452
- "logits/chosen": -0.1521393358707428,
453
- "logits/rejected": 0.2662551999092102,
454
- "logps/chosen": -398.43560791015625,
455
- "logps/rejected": -436.71746826171875,
456
- "loss": 0.5199,
457
- "rewards/accuracies": 0.7250000238418579,
458
- "rewards/chosen": -1.1340547800064087,
459
- "rewards/margins": 0.7097210884094238,
460
- "rewards/rejected": -1.843775987625122,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.31,
465
  "learning_rate": 4.335883851539693e-07,
466
- "logits/chosen": 0.4186009466648102,
467
- "logits/rejected": 0.993320107460022,
468
- "logps/chosen": -386.4576110839844,
469
- "logps/rejected": -449.5475158691406,
470
- "loss": 0.515,
471
- "rewards/accuracies": 0.762499988079071,
472
- "rewards/chosen": -1.0302515029907227,
473
- "rewards/margins": 0.9557968378067017,
474
- "rewards/rejected": -1.9860484600067139,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.31,
479
- "eval_logits/chosen": 0.29125669598579407,
480
- "eval_logits/rejected": 0.9044150114059448,
481
- "eval_logps/chosen": -384.6346435546875,
482
- "eval_logps/rejected": -449.0400695800781,
483
- "eval_loss": 0.5301220417022705,
484
- "eval_rewards/accuracies": 0.7658730149269104,
485
- "eval_rewards/chosen": -1.0052489042282104,
486
- "eval_rewards/margins": 0.8673661351203918,
487
- "eval_rewards/rejected": -1.872615098953247,
488
- "eval_runtime": 242.7199,
489
- "eval_samples_per_second": 8.24,
490
  "eval_steps_per_second": 0.26,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.32,
495
  "learning_rate": 4.272609194017105e-07,
496
- "logits/chosen": 0.7322943210601807,
497
- "logits/rejected": 1.3608242273330688,
498
- "logps/chosen": -382.03228759765625,
499
- "logps/rejected": -473.8851623535156,
500
- "loss": 0.471,
501
- "rewards/accuracies": 0.793749988079071,
502
- "rewards/chosen": -0.9981803894042969,
503
- "rewards/margins": 1.0494532585144043,
504
- "rewards/rejected": -2.047633647918701,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.33,
509
  "learning_rate": 4.2069638288135547e-07,
510
- "logits/chosen": 1.0567584037780762,
511
- "logits/rejected": 1.544245719909668,
512
- "logps/chosen": -416.4466857910156,
513
- "logps/rejected": -475.4097595214844,
514
- "loss": 0.5231,
515
- "rewards/accuracies": 0.699999988079071,
516
- "rewards/chosen": -1.3479177951812744,
517
- "rewards/margins": 0.7124323844909668,
518
- "rewards/rejected": -2.060349941253662,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.35,
523
  "learning_rate": 4.139035550786494e-07,
524
- "logits/chosen": 0.6867846250534058,
525
- "logits/rejected": 1.0369207859039307,
526
- "logps/chosen": -385.1805725097656,
527
- "logps/rejected": -399.9478759765625,
528
- "loss": 0.55,
529
  "rewards/accuracies": 0.7437499761581421,
530
- "rewards/chosen": -1.0560498237609863,
531
- "rewards/margins": 0.6113256216049194,
532
- "rewards/rejected": -1.6673755645751953,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.36,
537
  "learning_rate": 4.0689152079869306e-07,
538
- "logits/chosen": 0.10372958332300186,
539
- "logits/rejected": 0.788400411605835,
540
- "logps/chosen": -329.7562561035156,
541
- "logps/rejected": -365.8982849121094,
542
- "loss": 0.5614,
543
- "rewards/accuracies": 0.6625000238418579,
544
- "rewards/chosen": -0.8400253057479858,
545
- "rewards/margins": 0.576715350151062,
546
- "rewards/rejected": -1.4167406558990479,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.37,
551
  "learning_rate": 3.99669658015821e-07,
552
- "logits/chosen": 0.8255292773246765,
553
- "logits/rejected": 0.8684799075126648,
554
- "logps/chosen": -378.26593017578125,
555
- "logps/rejected": -455.6053161621094,
556
- "loss": 0.5377,
557
- "rewards/accuracies": 0.71875,
558
- "rewards/chosen": -1.0613641738891602,
559
- "rewards/margins": 0.7045412659645081,
560
- "rewards/rejected": -1.7659053802490234,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.38,
565
  "learning_rate": 3.92247625331392e-07,
566
- "logits/chosen": 1.5494980812072754,
567
- "logits/rejected": 1.8511909246444702,
568
- "logps/chosen": -408.739501953125,
569
- "logps/rejected": -443.5703125,
570
- "loss": 0.5162,
571
- "rewards/accuracies": 0.7437499761581421,
572
- "rewards/chosen": -1.2600252628326416,
573
- "rewards/margins": 0.748504638671875,
574
- "rewards/rejected": -2.0085299015045166,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.39,
579
  "learning_rate": 3.846353490562664e-07,
580
- "logits/chosen": 1.9055626392364502,
581
- "logits/rejected": 1.9892339706420898,
582
- "logps/chosen": -359.62933349609375,
583
- "logps/rejected": -462.4502868652344,
584
- "loss": 0.489,
585
  "rewards/accuracies": 0.7749999761581421,
586
- "rewards/chosen": -1.19487726688385,
587
- "rewards/margins": 0.8898428082466125,
588
- "rewards/rejected": -2.0847198963165283,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.4,
593
  "learning_rate": 3.768430099352445e-07,
594
- "logits/chosen": 0.5472872257232666,
595
- "logits/rejected": 1.5718077421188354,
596
- "logps/chosen": -404.38726806640625,
597
- "logps/rejected": -468.7950134277344,
598
- "loss": 0.4967,
599
- "rewards/accuracies": 0.7749999761581421,
600
- "rewards/chosen": -1.0777933597564697,
601
- "rewards/margins": 0.98560631275177,
602
- "rewards/rejected": -2.0634000301361084,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.41,
607
  "learning_rate": 3.6888102953122304e-07,
608
- "logits/chosen": 1.2810680866241455,
609
- "logits/rejected": 1.808607816696167,
610
- "logps/chosen": -383.5371398925781,
611
- "logps/rejected": -442.24725341796875,
612
- "loss": 0.5202,
613
- "rewards/accuracies": 0.7875000238418579,
614
- "rewards/chosen": -1.077757716178894,
615
- "rewards/margins": 0.9933692812919617,
616
- "rewards/rejected": -2.07112717628479,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.42,
621
  "learning_rate": 3.607600562872785e-07,
622
- "logits/chosen": 1.4377835988998413,
623
- "logits/rejected": 2.076244831085205,
624
- "logps/chosen": -447.5244140625,
625
- "logps/rejected": -489.15460205078125,
626
- "loss": 0.49,
627
- "rewards/accuracies": 0.706250011920929,
628
- "rewards/chosen": -1.4314160346984863,
629
- "rewards/margins": 0.8051286935806274,
630
- "rewards/rejected": -2.2365448474884033,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.42,
635
- "eval_logits/chosen": 1.5414855480194092,
636
- "eval_logits/rejected": 2.0847771167755127,
637
- "eval_logps/chosen": -409.72247314453125,
638
- "eval_logps/rejected": -473.94293212890625,
639
- "eval_loss": 0.5220484733581543,
640
- "eval_rewards/accuracies": 0.7599206566810608,
641
- "eval_rewards/chosen": -1.256127119064331,
642
- "eval_rewards/margins": 0.8655170202255249,
643
- "eval_rewards/rejected": -2.1216440200805664,
644
- "eval_runtime": 243.1768,
645
- "eval_samples_per_second": 8.224,
646
- "eval_steps_per_second": 0.259,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 0.43,
651
  "learning_rate": 3.5249095128531856e-07,
652
- "logits/chosen": 0.9375486373901367,
653
- "logits/rejected": 1.7120797634124756,
654
- "logps/chosen": -432.73822021484375,
655
- "logps/rejected": -490.5960388183594,
656
- "loss": 0.5179,
657
- "rewards/accuracies": 0.7437499761581421,
658
- "rewards/chosen": -1.1888015270233154,
659
- "rewards/margins": 0.8070329427719116,
660
- "rewards/rejected": -1.9958345890045166,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.44,
665
  "learning_rate": 3.4408477372034736e-07,
666
- "logits/chosen": 1.4186906814575195,
667
- "logits/rejected": 2.430680274963379,
668
- "logps/chosen": -378.02410888671875,
669
- "logps/rejected": -412.11212158203125,
670
- "loss": 0.5195,
671
- "rewards/accuracies": 0.706250011920929,
672
- "rewards/chosen": -1.1391704082489014,
673
- "rewards/margins": 0.682558000087738,
674
- "rewards/rejected": -1.8217283487319946,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.45,
679
  "learning_rate": 3.3555276610977276e-07,
680
- "logits/chosen": 1.6119515895843506,
681
- "logits/rejected": 2.5337882041931152,
682
- "logps/chosen": -370.18365478515625,
683
- "logps/rejected": -415.2383728027344,
684
- "loss": 0.5362,
685
- "rewards/accuracies": 0.737500011920929,
686
- "rewards/chosen": -1.0798513889312744,
687
- "rewards/margins": 0.6784455180168152,
688
- "rewards/rejected": -1.7582969665527344,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.46,
693
  "learning_rate": 3.269063392575352e-07,
694
- "logits/chosen": 2.2136051654815674,
695
- "logits/rejected": 1.8652782440185547,
696
- "logps/chosen": -380.4514465332031,
697
- "logps/rejected": -446.1416015625,
698
- "loss": 0.5086,
699
- "rewards/accuracies": 0.706250011920929,
700
- "rewards/chosen": -1.2145936489105225,
701
- "rewards/margins": 0.7581223845481873,
702
- "rewards/rejected": -1.9727160930633545,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.47,
707
  "learning_rate": 3.1815705699316964e-07,
708
- "logits/chosen": 2.1395087242126465,
709
- "logits/rejected": 2.30427885055542,
710
- "logps/chosen": -377.0716247558594,
711
- "logps/rejected": -443.8544006347656,
712
- "loss": 0.5254,
713
  "rewards/accuracies": 0.762499988079071,
714
- "rewards/chosen": -1.1291664838790894,
715
- "rewards/margins": 0.8725690841674805,
716
- "rewards/rejected": -2.0017354488372803,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.48,
721
  "learning_rate": 3.0931662070620794e-07,
722
- "logits/chosen": 1.6911855936050415,
723
- "logits/rejected": 2.427851438522339,
724
- "logps/chosen": -370.693603515625,
725
- "logps/rejected": -447.58087158203125,
726
- "loss": 0.5114,
727
- "rewards/accuracies": 0.7250000238418579,
728
- "rewards/chosen": -1.056607961654663,
729
- "rewards/margins": 0.851728618144989,
730
- "rewards/rejected": -1.9083364009857178,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.49,
735
  "learning_rate": 3.003968536966078e-07,
736
- "logits/chosen": 1.9878408908843994,
737
- "logits/rejected": 2.358523368835449,
738
- "logps/chosen": -415.61767578125,
739
- "logps/rejected": -470.3460388183594,
740
- "loss": 0.4998,
741
- "rewards/accuracies": 0.8125,
742
- "rewards/chosen": -1.1741617918014526,
743
- "rewards/margins": 0.9244480133056641,
744
- "rewards/rejected": -2.0986099243164062,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 0.5,
749
  "learning_rate": 2.9140968536213693e-07,
750
- "logits/chosen": 2.476728916168213,
751
- "logits/rejected": 3.153770923614502,
752
- "logps/chosen": -372.2202453613281,
753
- "logps/rejected": -451.8748474121094,
754
- "loss": 0.536,
755
- "rewards/accuracies": 0.7250000238418579,
756
- "rewards/chosen": -1.3948185443878174,
757
- "rewards/margins": 0.835403323173523,
758
- "rewards/rejected": -2.230221748352051,
759
  "step": 480
760
  },
761
  {
762
  "epoch": 0.51,
763
  "learning_rate": 2.823671352438608e-07,
764
- "logits/chosen": 2.6041836738586426,
765
- "logits/rejected": 2.8176186084747314,
766
- "logps/chosen": -410.2490234375,
767
- "logps/rejected": -458.1270446777344,
768
- "loss": 0.5011,
769
- "rewards/accuracies": 0.7250000238418579,
770
- "rewards/chosen": -1.3454747200012207,
771
- "rewards/margins": 0.7420424818992615,
772
- "rewards/rejected": -2.087517261505127,
773
  "step": 490
774
  },
775
  {
776
  "epoch": 0.52,
777
  "learning_rate": 2.73281296951072e-07,
778
- "logits/chosen": 2.672900676727295,
779
- "logits/rejected": 2.895838975906372,
780
- "logps/chosen": -403.9568176269531,
781
- "logps/rejected": -479.48419189453125,
782
- "loss": 0.513,
783
- "rewards/accuracies": 0.75,
784
- "rewards/chosen": -1.3237491846084595,
785
- "rewards/margins": 0.9759273529052734,
786
- "rewards/rejected": -2.2996764183044434,
787
  "step": 500
788
  },
789
  {
790
  "epoch": 0.52,
791
- "eval_logits/chosen": 2.26833438873291,
792
- "eval_logits/rejected": 2.77244234085083,
793
- "eval_logps/chosen": -406.2240295410156,
794
- "eval_logps/rejected": -474.9063720703125,
795
- "eval_loss": 0.514352023601532,
796
- "eval_rewards/accuracies": 0.7599206566810608,
797
- "eval_rewards/chosen": -1.2211425304412842,
798
- "eval_rewards/margins": 0.9101356863975525,
799
- "eval_rewards/rejected": -2.1312780380249023,
800
- "eval_runtime": 243.5018,
801
- "eval_samples_per_second": 8.213,
802
  "eval_steps_per_second": 0.259,
803
  "step": 500
804
  },
805
  {
806
  "epoch": 0.53,
807
  "learning_rate": 2.641643219871597e-07,
808
- "logits/chosen": 2.788573741912842,
809
- "logits/rejected": 3.020610809326172,
810
- "logps/chosen": -422.2259826660156,
811
- "logps/rejected": -471.6866760253906,
812
- "loss": 0.4786,
813
- "rewards/accuracies": 0.768750011920929,
814
- "rewards/chosen": -1.2089948654174805,
815
- "rewards/margins": 0.9920150637626648,
816
- "rewards/rejected": -2.201010227203369,
817
  "step": 510
818
  },
819
  {
820
  "epoch": 0.54,
821
  "learning_rate": 2.550284034980507e-07,
822
- "logits/chosen": 2.527721405029297,
823
- "logits/rejected": 3.2578673362731934,
824
- "logps/chosen": -401.86859130859375,
825
- "logps/rejected": -482.77459716796875,
826
- "loss": 0.4948,
827
- "rewards/accuracies": 0.7562500238418579,
828
- "rewards/chosen": -1.4271701574325562,
829
- "rewards/margins": 0.8836954832077026,
830
- "rewards/rejected": -2.310865879058838,
831
  "step": 520
832
  },
833
  {
834
  "epoch": 0.55,
835
  "learning_rate": 2.4588575996495794e-07,
836
- "logits/chosen": 2.8696699142456055,
837
- "logits/rejected": 3.047682285308838,
838
- "logps/chosen": -423.1412658691406,
839
- "logps/rejected": -492.03521728515625,
840
- "loss": 0.4951,
841
- "rewards/accuracies": 0.75,
842
- "rewards/chosen": -1.3899348974227905,
843
- "rewards/margins": 0.9606618881225586,
844
- "rewards/rejected": -2.3505969047546387,
845
  "step": 530
846
  },
847
  {
848
  "epoch": 0.57,
849
  "learning_rate": 2.367486188632446e-07,
850
- "logits/chosen": 2.335068464279175,
851
- "logits/rejected": 2.592519760131836,
852
- "logps/chosen": -424.984619140625,
853
- "logps/rejected": -532.7901611328125,
854
- "loss": 0.4963,
855
- "rewards/accuracies": 0.7562500238418579,
856
- "rewards/chosen": -1.3082420825958252,
857
- "rewards/margins": 1.017060399055481,
858
- "rewards/rejected": -2.3253026008605957,
859
  "step": 540
860
  },
861
  {
862
  "epoch": 0.58,
863
  "learning_rate": 2.276292003092593e-07,
864
- "logits/chosen": 2.321938991546631,
865
- "logits/rejected": 2.639091968536377,
866
- "logps/chosen": -382.3548889160156,
867
- "logps/rejected": -459.89447021484375,
868
- "loss": 0.5233,
869
  "rewards/accuracies": 0.75,
870
- "rewards/chosen": -1.1262718439102173,
871
- "rewards/margins": 0.9929410815238953,
872
- "rewards/rejected": -2.119213104248047,
873
  "step": 550
874
  },
875
  {
876
  "epoch": 0.59,
877
  "learning_rate": 2.185397007170141e-07,
878
- "logits/chosen": 2.198639154434204,
879
- "logits/rejected": 2.4941821098327637,
880
- "logps/chosen": -371.4447326660156,
881
- "logps/rejected": -420.7642517089844,
882
- "loss": 0.5109,
883
  "rewards/accuracies": 0.731249988079071,
884
- "rewards/chosen": -1.0467898845672607,
885
- "rewards/margins": 0.8263680338859558,
886
- "rewards/rejected": -1.8731578588485718,
887
  "step": 560
888
  },
889
  {
890
  "epoch": 0.6,
891
  "learning_rate": 2.094922764865619e-07,
892
- "logits/chosen": 2.5213406085968018,
893
- "logits/rejected": 3.0919876098632812,
894
- "logps/chosen": -412.0347595214844,
895
- "logps/rejected": -482.768310546875,
896
- "loss": 0.5151,
897
- "rewards/accuracies": 0.7437499761581421,
898
- "rewards/chosen": -1.3589495420455933,
899
- "rewards/margins": 0.8796421885490417,
900
- "rewards/rejected": -2.2385916709899902,
901
  "step": 570
902
  },
903
  {
904
  "epoch": 0.61,
905
  "learning_rate": 2.0049902774588797e-07,
906
- "logits/chosen": 2.797121047973633,
907
- "logits/rejected": 3.2302756309509277,
908
- "logps/chosen": -426.5211486816406,
909
- "logps/rejected": -481.0498046875,
910
- "loss": 0.5055,
911
- "rewards/accuracies": 0.731249988079071,
912
- "rewards/chosen": -1.5260117053985596,
913
- "rewards/margins": 0.9057762026786804,
914
- "rewards/rejected": -2.4317879676818848,
915
  "step": 580
916
  },
917
  {
918
  "epoch": 0.62,
919
  "learning_rate": 1.9157198216806238e-07,
920
- "logits/chosen": 2.2456047534942627,
921
- "logits/rejected": 3.1839499473571777,
922
- "logps/chosen": -393.4072265625,
923
- "logps/rejected": -469.48699951171875,
924
- "loss": 0.4999,
925
- "rewards/accuracies": 0.699999988079071,
926
- "rewards/chosen": -1.2036212682724,
927
- "rewards/margins": 0.6981213092803955,
928
- "rewards/rejected": -1.9017425775527954,
929
  "step": 590
930
  },
931
  {
932
  "epoch": 0.63,
933
  "learning_rate": 1.8272307888529274e-07,
934
- "logits/chosen": 2.4101009368896484,
935
- "logits/rejected": 3.006639242172241,
936
- "logps/chosen": -441.27850341796875,
937
- "logps/rejected": -519.7681884765625,
938
- "loss": 0.491,
939
- "rewards/accuracies": 0.7437499761581421,
940
- "rewards/chosen": -1.1530258655548096,
941
- "rewards/margins": 0.9237845540046692,
942
- "rewards/rejected": -2.076810359954834,
943
  "step": 600
944
  },
945
  {
946
  "epoch": 0.63,
947
- "eval_logits/chosen": 2.4024722576141357,
948
- "eval_logits/rejected": 3.066293239593506,
949
- "eval_logps/chosen": -408.8233337402344,
950
- "eval_logps/rejected": -485.0119323730469,
951
- "eval_loss": 0.5090602040290833,
952
- "eval_rewards/accuracies": 0.7698412537574768,
953
- "eval_rewards/chosen": -1.2471359968185425,
954
- "eval_rewards/margins": 0.985197901725769,
955
- "eval_rewards/rejected": -2.2323336601257324,
956
- "eval_runtime": 243.8184,
957
- "eval_samples_per_second": 8.203,
958
  "eval_steps_per_second": 0.258,
959
  "step": 600
960
  },
961
  {
962
  "epoch": 0.64,
963
  "learning_rate": 1.7396415252139288e-07,
964
- "logits/chosen": 2.587667465209961,
965
- "logits/rejected": 3.437826633453369,
966
- "logps/chosen": -402.96368408203125,
967
- "logps/rejected": -446.577392578125,
968
- "loss": 0.4763,
969
- "rewards/accuracies": 0.7437499761581421,
970
- "rewards/chosen": -1.2794382572174072,
971
- "rewards/margins": 0.9419866800308228,
972
- "rewards/rejected": -2.2214248180389404,
973
  "step": 610
974
  },
975
  {
976
  "epoch": 0.65,
977
  "learning_rate": 1.6530691736402316e-07,
978
- "logits/chosen": 2.719007968902588,
979
- "logits/rejected": 3.1902639865875244,
980
- "logps/chosen": -423.01019287109375,
981
- "logps/rejected": -469.666015625,
982
- "loss": 0.5028,
983
- "rewards/accuracies": 0.762499988079071,
984
- "rewards/chosen": -1.5098216533660889,
985
- "rewards/margins": 0.8855170011520386,
986
- "rewards/rejected": -2.395338773727417,
987
  "step": 620
988
  },
989
  {
990
  "epoch": 0.66,
991
  "learning_rate": 1.5676295169786864e-07,
992
- "logits/chosen": 3.1411049365997314,
993
- "logits/rejected": 3.1890902519226074,
994
- "logps/chosen": -398.02264404296875,
995
- "logps/rejected": -505.4256896972656,
996
- "loss": 0.4926,
997
- "rewards/accuracies": 0.762499988079071,
998
- "rewards/chosen": -1.3643443584442139,
999
- "rewards/margins": 1.135912299156189,
1000
- "rewards/rejected": -2.5002567768096924,
1001
  "step": 630
1002
  },
1003
  {
1004
  "epoch": 0.67,
1005
  "learning_rate": 1.483436823197092e-07,
1006
- "logits/chosen": 2.83595609664917,
1007
- "logits/rejected": 3.003196954727173,
1008
- "logps/chosen": -411.4205017089844,
1009
- "logps/rejected": -483.62841796875,
1010
- "loss": 0.5041,
1011
  "rewards/accuracies": 0.800000011920929,
1012
- "rewards/chosen": -1.267811894416809,
1013
- "rewards/margins": 0.9961770176887512,
1014
- "rewards/rejected": -2.263988971710205,
1015
  "step": 640
1016
  },
1017
  {
1018
  "epoch": 0.68,
1019
  "learning_rate": 1.4006036925609243e-07,
1020
- "logits/chosen": 2.1980767250061035,
1021
- "logits/rejected": 2.824580669403076,
1022
- "logps/chosen": -456.74945068359375,
1023
- "logps/rejected": -491.14300537109375,
1024
- "loss": 0.5051,
1025
- "rewards/accuracies": 0.75,
1026
- "rewards/chosen": -1.5043448209762573,
1027
- "rewards/margins": 0.7788913249969482,
1028
- "rewards/rejected": -2.283236503601074,
1029
  "step": 650
1030
  },
1031
  {
1032
  "epoch": 0.69,
1033
  "learning_rate": 1.319240907040458e-07,
1034
- "logits/chosen": 2.287083148956299,
1035
- "logits/rejected": 2.1161584854125977,
1036
- "logps/chosen": -436.58349609375,
1037
- "logps/rejected": -491.6561584472656,
1038
- "loss": 0.5061,
1039
- "rewards/accuracies": 0.762499988079071,
1040
- "rewards/chosen": -1.3502460718154907,
1041
- "rewards/margins": 0.8958118557929993,
1042
- "rewards/rejected": -2.2460577487945557,
1043
  "step": 660
1044
  },
1045
  {
1046
  "epoch": 0.7,
1047
  "learning_rate": 1.239457282149695e-07,
1048
- "logits/chosen": 2.343427896499634,
1049
- "logits/rejected": 3.018118381500244,
1050
- "logps/chosen": -414.76080322265625,
1051
- "logps/rejected": -491.9996032714844,
1052
- "loss": 0.495,
1053
- "rewards/accuracies": 0.75,
1054
- "rewards/chosen": -1.3160669803619385,
1055
- "rewards/margins": 0.9166922569274902,
1056
- "rewards/rejected": -2.2327592372894287,
1057
  "step": 670
1058
  },
1059
  {
1060
  "epoch": 0.71,
1061
  "learning_rate": 1.1613595214152711e-07,
1062
- "logits/chosen": 2.526611328125,
1063
- "logits/rejected": 2.8271474838256836,
1064
- "logps/chosen": -384.0827941894531,
1065
- "logps/rejected": -420.9378967285156,
1066
- "loss": 0.5301,
1067
- "rewards/accuracies": 0.6499999761581421,
1068
- "rewards/chosen": -1.226110577583313,
1069
- "rewards/margins": 0.6014381647109985,
1070
- "rewards/rejected": -1.827548623085022,
1071
  "step": 680
1072
  },
1073
  {
1074
  "epoch": 0.72,
1075
  "learning_rate": 1.0850520736699362e-07,
1076
- "logits/chosen": 2.0355257987976074,
1077
- "logits/rejected": 2.788853168487549,
1078
- "logps/chosen": -371.2467346191406,
1079
- "logps/rejected": -446.3160095214844,
1080
- "loss": 0.4871,
1081
- "rewards/accuracies": 0.7749999761581421,
1082
- "rewards/chosen": -1.0979232788085938,
1083
- "rewards/margins": 0.9463413953781128,
1084
- "rewards/rejected": -2.044264554977417,
1085
  "step": 690
1086
  },
1087
  {
1088
  "epoch": 0.73,
1089
  "learning_rate": 1.0106369933615042e-07,
1090
- "logits/chosen": 2.477355480194092,
1091
- "logits/rejected": 3.3466193675994873,
1092
- "logps/chosen": -396.9360046386719,
1093
- "logps/rejected": -487.60125732421875,
1094
- "loss": 0.4633,
1095
- "rewards/accuracies": 0.768750011920929,
1096
- "rewards/chosen": -1.4273808002471924,
1097
- "rewards/margins": 1.0007905960083008,
1098
- "rewards/rejected": -2.428171157836914,
1099
  "step": 700
1100
  },
1101
  {
1102
  "epoch": 0.73,
1103
- "eval_logits/chosen": 2.1854751110076904,
1104
- "eval_logits/rejected": 2.7685647010803223,
1105
- "eval_logps/chosen": -416.8968200683594,
1106
- "eval_logps/rejected": -490.6107482910156,
1107
- "eval_loss": 0.5028179883956909,
1108
- "eval_rewards/accuracies": 0.7797619104385376,
1109
- "eval_rewards/chosen": -1.3278706073760986,
1110
- "eval_rewards/margins": 0.9604514837265015,
1111
- "eval_rewards/rejected": -2.2883219718933105,
1112
- "eval_runtime": 242.2153,
1113
- "eval_samples_per_second": 8.257,
1114
- "eval_steps_per_second": 0.26,
1115
  "step": 700
1116
  },
1117
  {
1118
  "epoch": 0.74,
1119
  "learning_rate": 9.382138040640714e-08,
1120
- "logits/chosen": 2.2439451217651367,
1121
- "logits/rejected": 2.8265466690063477,
1122
- "logps/chosen": -438.583984375,
1123
- "logps/rejected": -455.88671875,
1124
- "loss": 0.4812,
1125
- "rewards/accuracies": 0.731249988079071,
1126
- "rewards/chosen": -1.3366320133209229,
1127
- "rewards/margins": 0.816901683807373,
1128
- "rewards/rejected": -2.153533697128296,
1129
  "step": 710
1130
  },
1131
  {
1132
  "epoch": 0.75,
1133
  "learning_rate": 8.678793653740632e-08,
1134
- "logits/chosen": 2.6949150562286377,
1135
- "logits/rejected": 2.825409412384033,
1136
- "logps/chosen": -425.30352783203125,
1137
- "logps/rejected": -497.6205139160156,
1138
- "loss": 0.4781,
1139
- "rewards/accuracies": 0.768750011920929,
1140
- "rewards/chosen": -1.3388516902923584,
1141
- "rewards/margins": 0.9720403552055359,
1142
- "rewards/rejected": -2.310892105102539,
1143
  "step": 720
1144
  },
1145
  {
1146
  "epoch": 0.76,
1147
  "learning_rate": 7.997277433690983e-08,
1148
- "logits/chosen": 2.578017473220825,
1149
- "logits/rejected": 2.9378750324249268,
1150
- "logps/chosen": -436.74761962890625,
1151
- "logps/rejected": -476.82110595703125,
1152
- "loss": 0.4849,
1153
- "rewards/accuracies": 0.7875000238418579,
1154
- "rewards/chosen": -1.3285281658172607,
1155
- "rewards/margins": 0.9200865626335144,
1156
- "rewards/rejected": -2.24861478805542,
1157
  "step": 730
1158
  },
1159
  {
1160
  "epoch": 0.77,
1161
  "learning_rate": 7.338500848029602e-08,
1162
- "logits/chosen": 2.7751357555389404,
1163
- "logits/rejected": 3.201183319091797,
1164
- "logps/chosen": -449.5452575683594,
1165
- "logps/rejected": -511.21063232421875,
1166
- "loss": 0.4977,
1167
- "rewards/accuracies": 0.824999988079071,
1168
- "rewards/chosen": -1.3524633646011353,
1169
- "rewards/margins": 1.1687639951705933,
1170
- "rewards/rejected": -2.5212273597717285,
1171
  "step": 740
1172
  },
1173
  {
1174
  "epoch": 0.78,
1175
  "learning_rate": 6.70334495204884e-08,
1176
- "logits/chosen": 2.4888412952423096,
1177
- "logits/rejected": 2.952457904815674,
1178
- "logps/chosen": -420.580078125,
1179
- "logps/rejected": -510.59136962890625,
1180
- "loss": 0.4833,
1181
- "rewards/accuracies": 0.7437499761581421,
1182
- "rewards/chosen": -1.372300386428833,
1183
- "rewards/margins": 0.9483728408813477,
1184
- "rewards/rejected": -2.3206734657287598,
1185
  "step": 750
1186
  },
1187
  {
1188
  "epoch": 0.8,
1189
  "learning_rate": 6.092659210462231e-08,
1190
- "logits/chosen": 2.788936138153076,
1191
- "logits/rejected": 2.778594970703125,
1192
- "logps/chosen": -423.2615661621094,
1193
- "logps/rejected": -482.78741455078125,
1194
- "loss": 0.467,
1195
- "rewards/accuracies": 0.7437499761581421,
1196
- "rewards/chosen": -1.4515461921691895,
1197
- "rewards/margins": 0.8695562481880188,
1198
- "rewards/rejected": -2.3211026191711426,
1199
  "step": 760
1200
  },
1201
  {
1202
  "epoch": 0.81,
1203
  "learning_rate": 5.507260361320737e-08,
1204
- "logits/chosen": 2.7476916313171387,
1205
- "logits/rejected": 3.142958164215088,
1206
- "logps/chosen": -450.19818115234375,
1207
- "logps/rejected": -545.2734985351562,
1208
- "loss": 0.4993,
1209
- "rewards/accuracies": 0.7124999761581421,
1210
- "rewards/chosen": -1.5620007514953613,
1211
- "rewards/margins": 0.8958643674850464,
1212
- "rewards/rejected": -2.4578652381896973,
1213
  "step": 770
1214
  },
1215
  {
1216
  "epoch": 0.82,
1217
  "learning_rate": 4.947931323697982e-08,
1218
- "logits/chosen": 3.1158547401428223,
1219
- "logits/rejected": 3.3449020385742188,
1220
- "logps/chosen": -406.75811767578125,
1221
- "logps/rejected": -469.87725830078125,
1222
- "loss": 0.502,
1223
  "rewards/accuracies": 0.706250011920929,
1224
- "rewards/chosen": -1.5055805444717407,
1225
- "rewards/margins": 0.8289793133735657,
1226
- "rewards/rejected": -2.334559917449951,
1227
  "step": 780
1228
  },
1229
  {
1230
  "epoch": 0.83,
1231
  "learning_rate": 4.415420150605398e-08,
1232
- "logits/chosen": 2.6201043128967285,
1233
- "logits/rejected": 3.0066559314727783,
1234
- "logps/chosen": -463.11016845703125,
1235
- "logps/rejected": -560.42041015625,
1236
- "loss": 0.496,
1237
- "rewards/accuracies": 0.768750011920929,
1238
- "rewards/chosen": -1.627664566040039,
1239
- "rewards/margins": 1.1310782432556152,
1240
- "rewards/rejected": -2.7587428092956543,
1241
  "step": 790
1242
  },
1243
  {
1244
  "epoch": 0.84,
1245
  "learning_rate": 3.9104390285376374e-08,
1246
- "logits/chosen": 2.2220206260681152,
1247
- "logits/rejected": 3.276045560836792,
1248
- "logps/chosen": -484.488525390625,
1249
- "logps/rejected": -536.6212158203125,
1250
- "loss": 0.4676,
1251
- "rewards/accuracies": 0.737500011920929,
1252
- "rewards/chosen": -1.432835340499878,
1253
- "rewards/margins": 1.1149303913116455,
1254
- "rewards/rejected": -2.5477657318115234,
1255
  "step": 800
1256
  },
1257
  {
1258
  "epoch": 0.84,
1259
- "eval_logits/chosen": 2.362839698791504,
1260
- "eval_logits/rejected": 3.007388114929199,
1261
- "eval_logps/chosen": -430.22711181640625,
1262
- "eval_logps/rejected": -510.2817077636719,
1263
- "eval_loss": 0.5003817677497864,
1264
  "eval_rewards/accuracies": 0.7678571343421936,
1265
- "eval_rewards/chosen": -1.4611738920211792,
1266
- "eval_rewards/margins": 1.0238579511642456,
1267
- "eval_rewards/rejected": -2.485031843185425,
1268
- "eval_runtime": 243.2983,
1269
- "eval_samples_per_second": 8.22,
1270
  "eval_steps_per_second": 0.259,
1271
  "step": 800
1272
  },
1273
  {
1274
  "epoch": 0.85,
1275
  "learning_rate": 3.433663324986208e-08,
1276
- "logits/chosen": 2.7204556465148926,
1277
- "logits/rejected": 3.516869306564331,
1278
- "logps/chosen": -434.813232421875,
1279
- "logps/rejected": -473.43084716796875,
1280
- "loss": 0.4944,
1281
- "rewards/accuracies": 0.7562500238418579,
1282
- "rewards/chosen": -1.553430199623108,
1283
- "rewards/margins": 0.7905280590057373,
1284
- "rewards/rejected": -2.3439581394195557,
1285
  "step": 810
1286
  },
1287
  {
1288
  "epoch": 0.86,
1289
  "learning_rate": 2.9857306851953897e-08,
1290
- "logits/chosen": 3.327028274536133,
1291
- "logits/rejected": 3.1481454372406006,
1292
- "logps/chosen": -385.9121398925781,
1293
- "logps/rejected": -464.67919921875,
1294
- "loss": 0.5097,
1295
  "rewards/accuracies": 0.762499988079071,
1296
- "rewards/chosen": -1.3324156999588013,
1297
- "rewards/margins": 0.9625638723373413,
1298
- "rewards/rejected": -2.2949795722961426,
1299
  "step": 820
1300
  },
1301
  {
1302
  "epoch": 0.87,
1303
  "learning_rate": 2.567240179368185e-08,
1304
- "logits/chosen": 2.4686942100524902,
1305
- "logits/rejected": 3.241407871246338,
1306
- "logps/chosen": -389.41162109375,
1307
- "logps/rejected": -479.04034423828125,
1308
- "loss": 0.484,
1309
- "rewards/accuracies": 0.7749999761581421,
1310
- "rewards/chosen": -1.4054991006851196,
1311
- "rewards/margins": 0.9627164602279663,
1312
- "rewards/rejected": -2.368215560913086,
1313
  "step": 830
1314
  },
1315
  {
1316
  "epoch": 0.88,
1317
  "learning_rate": 2.1787515014630357e-08,
1318
- "logits/chosen": 2.906862258911133,
1319
- "logits/rejected": 2.6694931983947754,
1320
- "logps/chosen": -454.332763671875,
1321
- "logps/rejected": -501.1805725097656,
1322
- "loss": 0.519,
1323
- "rewards/accuracies": 0.7124999761581421,
1324
- "rewards/chosen": -1.4119014739990234,
1325
- "rewards/margins": 0.7597278356552124,
1326
- "rewards/rejected": -2.1716294288635254,
1327
  "step": 840
1328
  },
1329
  {
1330
  "epoch": 0.89,
1331
  "learning_rate": 1.820784220652766e-08,
1332
- "logits/chosen": 2.4606220722198486,
1333
- "logits/rejected": 3.116736888885498,
1334
- "logps/chosen": -424.92657470703125,
1335
- "logps/rejected": -452.7645568847656,
1336
- "loss": 0.4959,
1337
  "rewards/accuracies": 0.7437499761581421,
1338
- "rewards/chosen": -1.2464890480041504,
1339
- "rewards/margins": 0.8311346769332886,
1340
- "rewards/rejected": -2.0776238441467285,
1341
  "step": 850
1342
  },
1343
  {
1344
  "epoch": 0.9,
1345
  "learning_rate": 1.4938170864468636e-08,
1346
- "logits/chosen": 2.591265916824341,
1347
- "logits/rejected": 3.241671085357666,
1348
- "logps/chosen": -417.1761779785156,
1349
- "logps/rejected": -489.3394470214844,
1350
- "loss": 0.4976,
1351
- "rewards/accuracies": 0.7562500238418579,
1352
- "rewards/chosen": -1.387465476989746,
1353
- "rewards/margins": 0.9968598484992981,
1354
- "rewards/rejected": -2.3843250274658203,
1355
  "step": 860
1356
  },
1357
  {
1358
  "epoch": 0.91,
1359
  "learning_rate": 1.1982873884064465e-08,
1360
- "logits/chosen": 2.5627589225769043,
1361
- "logits/rejected": 2.842689037322998,
1362
- "logps/chosen": -358.8465270996094,
1363
- "logps/rejected": -472.7076110839844,
1364
- "loss": 0.4834,
1365
- "rewards/accuracies": 0.824999988079071,
1366
- "rewards/chosen": -1.1185928583145142,
1367
- "rewards/margins": 1.1456910371780396,
1368
- "rewards/rejected": -2.2642838954925537,
1369
  "step": 870
1370
  },
1371
  {
1372
  "epoch": 0.92,
1373
  "learning_rate": 9.345903713082304e-09,
1374
- "logits/chosen": 2.062072992324829,
1375
- "logits/rejected": 2.8666763305664062,
1376
- "logps/chosen": -416.20916748046875,
1377
- "logps/rejected": -484.88873291015625,
1378
- "loss": 0.4878,
1379
- "rewards/accuracies": 0.7562500238418579,
1380
- "rewards/chosen": -1.4209827184677124,
1381
- "rewards/margins": 0.8945733904838562,
1382
- "rewards/rejected": -2.315556287765503,
1383
  "step": 880
1384
  },
1385
  {
1386
  "epoch": 0.93,
1387
  "learning_rate": 7.030787065396865e-09,
1388
- "logits/chosen": 2.154999256134033,
1389
- "logits/rejected": 3.1474082469940186,
1390
- "logps/chosen": -400.4173278808594,
1391
- "logps/rejected": -488.27923583984375,
1392
- "loss": 0.5078,
1393
- "rewards/accuracies": 0.6937500238418579,
1394
- "rewards/chosen": -1.3393958806991577,
1395
- "rewards/margins": 0.8886721730232239,
1396
- "rewards/rejected": -2.2280678749084473,
1397
  "step": 890
1398
  },
1399
  {
1400
  "epoch": 0.94,
1401
  "learning_rate": 5.04062020432286e-09,
1402
- "logits/chosen": 2.4136557579040527,
1403
- "logits/rejected": 3.1030116081237793,
1404
- "logps/chosen": -433.02392578125,
1405
- "logps/rejected": -505.378662109375,
1406
- "loss": 0.4959,
1407
  "rewards/accuracies": 0.71875,
1408
- "rewards/chosen": -1.3464595079421997,
1409
- "rewards/margins": 0.8647556304931641,
1410
- "rewards/rejected": -2.211215019226074,
1411
  "step": 900
1412
  },
1413
  {
1414
  "epoch": 0.94,
1415
- "eval_logits/chosen": 2.1701812744140625,
1416
- "eval_logits/rejected": 2.841163396835327,
1417
- "eval_logps/chosen": -416.8122253417969,
1418
- "eval_logps/rejected": -494.4040222167969,
1419
- "eval_loss": 0.4994244873523712,
1420
- "eval_rewards/accuracies": 0.773809552192688,
1421
- "eval_rewards/chosen": -1.3270248174667358,
1422
- "eval_rewards/margins": 0.9992297887802124,
1423
- "eval_rewards/rejected": -2.3262546062469482,
1424
- "eval_runtime": 243.1566,
1425
- "eval_samples_per_second": 8.225,
1426
- "eval_steps_per_second": 0.259,
1427
  "step": 900
1428
  },
1429
  {
1430
  "epoch": 0.95,
1431
  "learning_rate": 3.3780648016376866e-09,
1432
- "logits/chosen": 2.659358501434326,
1433
- "logits/rejected": 3.147043466567993,
1434
- "logps/chosen": -385.5265197753906,
1435
- "logps/rejected": -475.1536560058594,
1436
- "loss": 0.4939,
1437
- "rewards/accuracies": 0.78125,
1438
- "rewards/chosen": -1.3798553943634033,
1439
- "rewards/margins": 0.9567114114761353,
1440
- "rewards/rejected": -2.33656644821167,
1441
  "step": 910
1442
  },
1443
  {
1444
  "epoch": 0.96,
1445
  "learning_rate": 2.0453443778310766e-09,
1446
- "logits/chosen": 2.4830732345581055,
1447
- "logits/rejected": 2.9779207706451416,
1448
- "logps/chosen": -431.20941162109375,
1449
- "logps/rejected": -497.5403747558594,
1450
- "loss": 0.4729,
1451
  "rewards/accuracies": 0.8187500238418579,
1452
- "rewards/chosen": -1.3552982807159424,
1453
- "rewards/margins": 1.039147973060608,
1454
- "rewards/rejected": -2.39444637298584,
1455
  "step": 920
1456
  },
1457
  {
1458
  "epoch": 0.97,
1459
  "learning_rate": 1.0442413283435758e-09,
1460
- "logits/chosen": 2.1105334758758545,
1461
- "logits/rejected": 2.6144089698791504,
1462
- "logps/chosen": -407.96148681640625,
1463
- "logps/rejected": -485.81719970703125,
1464
- "loss": 0.4675,
1465
- "rewards/accuracies": 0.831250011920929,
1466
- "rewards/chosen": -1.3519207239151,
1467
- "rewards/margins": 1.0662956237792969,
1468
- "rewards/rejected": -2.4182167053222656,
1469
  "step": 930
1470
  },
1471
  {
1472
  "epoch": 0.98,
1473
  "learning_rate": 3.760945397705828e-10,
1474
- "logits/chosen": 2.271446704864502,
1475
- "logits/rejected": 2.7835686206817627,
1476
- "logps/chosen": -468.17633056640625,
1477
- "logps/rejected": -529.5701904296875,
1478
- "loss": 0.4775,
1479
- "rewards/accuracies": 0.7875000238418579,
1480
- "rewards/chosen": -1.3844900131225586,
1481
- "rewards/margins": 0.9976957440376282,
1482
- "rewards/rejected": -2.382185697555542,
1483
  "step": 940
1484
  },
1485
  {
1486
  "epoch": 0.99,
1487
  "learning_rate": 4.17975992204056e-11,
1488
- "logits/chosen": 2.0716397762298584,
1489
- "logits/rejected": 3.089191198348999,
1490
- "logps/chosen": -422.05474853515625,
1491
- "logps/rejected": -507.30828857421875,
1492
- "loss": 0.5028,
1493
- "rewards/accuracies": 0.7875000238418579,
1494
- "rewards/chosen": -1.3753665685653687,
1495
- "rewards/margins": 1.0656555891036987,
1496
- "rewards/rejected": -2.4410219192504883,
1497
  "step": 950
1498
  },
1499
  {
1500
  "epoch": 1.0,
1501
  "step": 955,
1502
  "total_flos": 0.0,
1503
- "train_loss": 0.5295458661324066,
1504
- "train_runtime": 21408.4408,
1505
- "train_samples_per_second": 2.856,
1506
- "train_steps_per_second": 0.045
1507
  }
1508
  ],
1509
  "logging_steps": 10,
1510
  "max_steps": 955,
1511
  "num_input_tokens_seen": 0,
1512
  "num_train_epochs": 1,
1513
- "save_steps": 100,
1514
  "total_flos": 0.0,
1515
  "train_batch_size": 8,
1516
  "trial_name": null,
 
15
  "logits/rejected": -2.686896800994873,
16
  "logps/chosen": -229.94229125976562,
17
  "logps/rejected": -214.70114135742188,
18
+ "loss": 0.9741,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
25
  {
26
  "epoch": 0.01,
27
  "learning_rate": 5.208333333333333e-08,
28
+ "logits/chosen": -2.6810548305511475,
29
+ "logits/rejected": -2.709120035171509,
30
+ "logps/chosen": -295.81451416015625,
31
+ "logps/rejected": -250.5977325439453,
32
+ "loss": 0.9742,
33
+ "rewards/accuracies": 0.4375,
34
+ "rewards/chosen": 0.0003499284212011844,
35
+ "rewards/margins": 0.0001925795222632587,
36
+ "rewards/rejected": 0.00015734886983409524,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.02,
41
  "learning_rate": 1.0416666666666667e-07,
42
+ "logits/chosen": -2.6195099353790283,
43
+ "logits/rejected": -2.625662088394165,
44
+ "logps/chosen": -271.3158264160156,
45
+ "logps/rejected": -246.94711303710938,
46
+ "loss": 0.9738,
47
+ "rewards/accuracies": 0.5249999761581421,
48
+ "rewards/chosen": 0.00036363088293001056,
49
+ "rewards/margins": 0.0009133815765380859,
50
+ "rewards/rejected": -0.0005497508682310581,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.03,
55
  "learning_rate": 1.5624999999999999e-07,
56
+ "logits/chosen": -2.7032079696655273,
57
+ "logits/rejected": -2.666191577911377,
58
+ "logps/chosen": -278.3299865722656,
59
+ "logps/rejected": -254.5498809814453,
60
+ "loss": 0.9731,
61
+ "rewards/accuracies": 0.6187499761581421,
62
+ "rewards/chosen": 0.0010723542654886842,
63
+ "rewards/margins": 0.0018442096188664436,
64
+ "rewards/rejected": -0.0007718555280007422,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.04,
69
  "learning_rate": 2.0833333333333333e-07,
70
+ "logits/chosen": -2.6499533653259277,
71
+ "logits/rejected": -2.6374642848968506,
72
+ "logps/chosen": -273.9149475097656,
73
+ "logps/rejected": -237.7373504638672,
74
+ "loss": 0.9713,
75
+ "rewards/accuracies": 0.675000011920929,
76
+ "rewards/chosen": 0.003091720398515463,
77
+ "rewards/margins": 0.005813647527247667,
78
+ "rewards/rejected": -0.002721927361562848,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.05,
83
  "learning_rate": 2.604166666666667e-07,
84
+ "logits/chosen": -2.6743884086608887,
85
+ "logits/rejected": -2.6382362842559814,
86
+ "logps/chosen": -296.06744384765625,
87
+ "logps/rejected": -274.7203063964844,
88
+ "loss": 0.9657,
89
+ "rewards/accuracies": 0.7124999761581421,
90
+ "rewards/chosen": 0.009085027500987053,
91
+ "rewards/margins": 0.016131814569234848,
92
+ "rewards/rejected": -0.007046787533909082,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.06,
97
  "learning_rate": 3.1249999999999997e-07,
98
+ "logits/chosen": -2.630288600921631,
99
+ "logits/rejected": -2.6329100131988525,
100
+ "logps/chosen": -285.3112487792969,
101
+ "logps/rejected": -274.3540954589844,
102
+ "loss": 0.9577,
103
+ "rewards/accuracies": 0.668749988079071,
104
+ "rewards/chosen": 0.02476242184638977,
105
+ "rewards/margins": 0.0284078661352396,
106
+ "rewards/rejected": -0.0036454431246966124,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.07,
111
  "learning_rate": 3.645833333333333e-07,
112
+ "logits/chosen": -2.6492714881896973,
113
+ "logits/rejected": -2.6846468448638916,
114
+ "logps/chosen": -310.99761962890625,
115
+ "logps/rejected": -290.5529479980469,
116
+ "loss": 0.9406,
117
+ "rewards/accuracies": 0.6875,
118
+ "rewards/chosen": 0.03506559133529663,
119
+ "rewards/margins": 0.05608881637454033,
120
+ "rewards/rejected": -0.021023228764533997,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.08,
125
  "learning_rate": 4.1666666666666667e-07,
126
+ "logits/chosen": -2.523487091064453,
127
+ "logits/rejected": -2.464901924133301,
128
+ "logps/chosen": -304.66845703125,
129
+ "logps/rejected": -281.81732177734375,
130
+ "loss": 0.9095,
131
+ "rewards/accuracies": 0.7437499761581421,
132
+ "rewards/chosen": -0.04107608273625374,
133
+ "rewards/margins": 0.11836276948451996,
134
+ "rewards/rejected": -0.1594388782978058,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.09,
139
  "learning_rate": 4.6874999999999996e-07,
140
+ "logits/chosen": -2.5307559967041016,
141
+ "logits/rejected": -2.5012693405151367,
142
+ "logps/chosen": -292.33392333984375,
143
+ "logps/rejected": -284.6061096191406,
144
+ "loss": 0.8948,
145
+ "rewards/accuracies": 0.706250011920929,
146
+ "rewards/chosen": -0.00477323355153203,
147
+ "rewards/margins": 0.17751149833202362,
148
+ "rewards/rejected": -0.18228471279144287,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.1,
153
  "learning_rate": 4.999732492681437e-07,
154
+ "logits/chosen": -2.501075267791748,
155
+ "logits/rejected": -2.491670846939087,
156
+ "logps/chosen": -340.92401123046875,
157
+ "logps/rejected": -345.025390625,
158
+ "loss": 0.8957,
159
  "rewards/accuracies": 0.7124999761581421,
160
+ "rewards/chosen": -0.35498708486557007,
161
+ "rewards/margins": 0.17348773777484894,
162
+ "rewards/rejected": -0.5284748077392578,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.1,
167
+ "eval_logits/chosen": -2.54604434967041,
168
+ "eval_logits/rejected": -2.5107295513153076,
169
+ "eval_logps/chosen": -336.2060241699219,
170
+ "eval_logps/rejected": -330.2667541503906,
171
+ "eval_loss": 0.9028440117835999,
172
+ "eval_rewards/accuracies": 0.6904761791229248,
173
+ "eval_rewards/chosen": -0.5209627151489258,
174
+ "eval_rewards/margins": 0.1639193296432495,
175
+ "eval_rewards/rejected": -0.6848820447921753,
176
+ "eval_runtime": 245.63,
177
+ "eval_samples_per_second": 8.142,
178
+ "eval_steps_per_second": 0.256,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.12,
183
  "learning_rate": 4.996723692767926e-07,
184
+ "logits/chosen": -2.258904218673706,
185
+ "logits/rejected": -2.185375452041626,
186
+ "logps/chosen": -319.14654541015625,
187
+ "logps/rejected": -304.89739990234375,
188
+ "loss": 0.8703,
189
+ "rewards/accuracies": 0.668749988079071,
190
+ "rewards/chosen": -0.5882319211959839,
191
+ "rewards/margins": 0.2905711531639099,
192
+ "rewards/rejected": -0.8788030743598938,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.13,
197
  "learning_rate": 4.990375746213598e-07,
198
+ "logits/chosen": -1.46225106716156,
199
+ "logits/rejected": -1.342179775238037,
200
+ "logps/chosen": -348.0282897949219,
201
+ "logps/rejected": -337.330078125,
202
+ "loss": 0.8274,
203
+ "rewards/accuracies": 0.7437499761581421,
204
+ "rewards/chosen": -0.42451825737953186,
205
+ "rewards/margins": 0.41508832573890686,
206
+ "rewards/rejected": -0.8396065831184387,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.14,
211
  "learning_rate": 4.980697142834314e-07,
212
+ "logits/chosen": -1.1638177633285522,
213
+ "logits/rejected": -0.918566107749939,
214
+ "logps/chosen": -371.7701721191406,
215
+ "logps/rejected": -366.410400390625,
216
+ "loss": 0.8256,
217
+ "rewards/accuracies": 0.706250011920929,
218
+ "rewards/chosen": -0.5147355198860168,
219
+ "rewards/margins": 0.4547084867954254,
220
+ "rewards/rejected": -0.9694439172744751,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.15,
225
  "learning_rate": 4.967700826904229e-07,
226
+ "logits/chosen": 0.09236583858728409,
227
+ "logits/rejected": 0.07060788571834564,
228
+ "logps/chosen": -294.2372741699219,
229
+ "logps/rejected": -336.0912170410156,
230
+ "loss": 0.771,
231
+ "rewards/accuracies": 0.7124999761581421,
232
+ "rewards/chosen": -0.5445362329483032,
233
+ "rewards/margins": 0.5613424181938171,
234
+ "rewards/rejected": -1.1058785915374756,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.16,
239
  "learning_rate": 4.951404179843962e-07,
240
+ "logits/chosen": 0.7083513140678406,
241
+ "logits/rejected": 0.5464950203895569,
242
+ "logps/chosen": -364.0424499511719,
243
+ "logps/rejected": -411.08209228515625,
244
+ "loss": 0.8494,
245
+ "rewards/accuracies": 0.637499988079071,
246
+ "rewards/chosen": -0.8699826002120972,
247
+ "rewards/margins": 0.39457255601882935,
248
+ "rewards/rejected": -1.2645552158355713,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.17,
253
  "learning_rate": 4.931828996974498e-07,
254
+ "logits/chosen": 0.4938809871673584,
255
+ "logits/rejected": 0.9410598874092102,
256
+ "logps/chosen": -424.72430419921875,
257
+ "logps/rejected": -468.349609375,
258
+ "loss": 0.7631,
259
  "rewards/accuracies": 0.6937500238418579,
260
+ "rewards/chosen": -1.2891677618026733,
261
+ "rewards/margins": 0.488609254360199,
262
+ "rewards/rejected": -1.777777075767517,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.18,
267
  "learning_rate": 4.909001458367866e-07,
268
+ "logits/chosen": 0.14566074311733246,
269
+ "logits/rejected": 0.20485401153564453,
270
+ "logps/chosen": -388.87158203125,
271
+ "logps/rejected": -427.6263732910156,
272
+ "loss": 0.7772,
273
+ "rewards/accuracies": 0.6937500238418579,
274
+ "rewards/chosen": -1.1580675840377808,
275
+ "rewards/margins": 0.6463179588317871,
276
+ "rewards/rejected": -1.8043855428695679,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.19,
281
  "learning_rate": 4.882952093833627e-07,
282
+ "logits/chosen": 0.1968054324388504,
283
+ "logits/rejected": 0.5141702890396118,
284
+ "logps/chosen": -351.22821044921875,
285
+ "logps/rejected": -409.0249938964844,
286
+ "loss": 0.7395,
287
+ "rewards/accuracies": 0.7124999761581421,
288
+ "rewards/chosen": -1.02028489112854,
289
+ "rewards/margins": 0.7496393322944641,
290
+ "rewards/rejected": -1.7699241638183594,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.2,
295
  "learning_rate": 4.853715742087946e-07,
296
+ "logits/chosen": 0.310161292552948,
297
+ "logits/rejected": 0.9174222946166992,
298
+ "logps/chosen": -406.7144775390625,
299
+ "logps/rejected": -464.5382385253906,
300
+ "loss": 0.7373,
301
+ "rewards/accuracies": 0.762499988079071,
302
+ "rewards/chosen": -1.2787965536117554,
303
+ "rewards/margins": 0.8055577278137207,
304
+ "rewards/rejected": -2.0843544006347656,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.21,
309
  "learning_rate": 4.821331504159906e-07,
310
+ "logits/chosen": 0.49894601106643677,
311
+ "logits/rejected": 1.0194227695465088,
312
+ "logps/chosen": -405.57330322265625,
313
+ "logps/rejected": -467.26641845703125,
314
+ "loss": 0.7658,
315
+ "rewards/accuracies": 0.7250000238418579,
316
+ "rewards/chosen": -1.1890487670898438,
317
+ "rewards/margins": 0.7286871671676636,
318
+ "rewards/rejected": -1.9177358150482178,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.21,
323
+ "eval_logits/chosen": -0.152938574552536,
324
+ "eval_logits/rejected": 0.33469492197036743,
325
+ "eval_logps/chosen": -378.24761962890625,
326
+ "eval_logps/rejected": -431.1014709472656,
327
+ "eval_loss": 0.7649896144866943,
328
+ "eval_rewards/accuracies": 0.7460317611694336,
329
+ "eval_rewards/chosen": -0.941378653049469,
330
+ "eval_rewards/margins": 0.7518512010574341,
331
+ "eval_rewards/rejected": -1.6932299137115479,
332
+ "eval_runtime": 243.5152,
333
+ "eval_samples_per_second": 8.213,
334
+ "eval_steps_per_second": 0.259,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.22,
339
  "learning_rate": 4.785842691097342e-07,
340
+ "logits/chosen": -0.3292608857154846,
341
+ "logits/rejected": 0.1720762550830841,
342
+ "logps/chosen": -389.8594665527344,
343
+ "logps/rejected": -401.64581298828125,
344
+ "loss": 0.7634,
345
  "rewards/accuracies": 0.7250000238418579,
346
+ "rewards/chosen": -0.8458383679389954,
347
+ "rewards/margins": 0.6316131353378296,
348
+ "rewards/rejected": -1.4774516820907593,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.23,
353
  "learning_rate": 4.7472967660421603e-07,
354
+ "logits/chosen": 0.5375509858131409,
355
+ "logits/rejected": 0.9775497317314148,
356
+ "logps/chosen": -387.09014892578125,
357
+ "logps/rejected": -446.6785583496094,
358
+ "loss": 0.7559,
359
+ "rewards/accuracies": 0.7437499761581421,
360
+ "rewards/chosen": -0.9755613207817078,
361
+ "rewards/margins": 0.7280157208442688,
362
+ "rewards/rejected": -1.7035770416259766,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.24,
367
  "learning_rate": 4.705745280752585e-07,
368
+ "logits/chosen": 0.5464267134666443,
369
+ "logits/rejected": 1.0262590646743774,
370
+ "logps/chosen": -444.06072998046875,
371
+ "logps/rejected": -483.55926513671875,
372
+ "loss": 0.7417,
373
+ "rewards/accuracies": 0.7437499761581421,
374
+ "rewards/chosen": -1.5034980773925781,
375
+ "rewards/margins": 0.8456419706344604,
376
+ "rewards/rejected": -2.349139928817749,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.25,
381
  "learning_rate": 4.6612438066572555e-07,
382
+ "logits/chosen": 0.6961275935173035,
383
+ "logits/rejected": 1.6300386190414429,
384
+ "logps/chosen": -418.9384765625,
385
+ "logps/rejected": -454.033935546875,
386
+ "loss": 0.6919,
387
+ "rewards/accuracies": 0.7124999761581421,
388
+ "rewards/chosen": -1.4168249368667603,
389
+ "rewards/margins": 0.863106369972229,
390
+ "rewards/rejected": -2.2799313068389893,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.26,
395
  "learning_rate": 4.6138518605333664e-07,
396
+ "logits/chosen": -0.6623570919036865,
397
+ "logits/rejected": -0.1930474489927292,
398
+ "logps/chosen": -347.79315185546875,
399
+ "logps/rejected": -429.45465087890625,
400
+ "loss": 0.7378,
401
+ "rewards/accuracies": 0.6875,
402
+ "rewards/chosen": -0.8020486831665039,
403
+ "rewards/margins": 0.6783057451248169,
404
+ "rewards/rejected": -1.4803544282913208,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.27,
409
  "learning_rate": 4.5636328249082514e-07,
410
+ "logits/chosen": -1.4540965557098389,
411
+ "logits/rejected": -0.7168424725532532,
412
+ "logps/chosen": -342.86102294921875,
413
+ "logps/rejected": -387.8767395019531,
414
+ "loss": 0.7447,
415
+ "rewards/accuracies": 0.78125,
416
+ "rewards/chosen": -0.5879959464073181,
417
+ "rewards/margins": 0.7121099233627319,
418
+ "rewards/rejected": -1.3001058101654053,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.28,
423
  "learning_rate": 4.510653863290871e-07,
424
+ "logits/chosen": -0.18550051748752594,
425
+ "logits/rejected": 0.8060259819030762,
426
+ "logps/chosen": -402.1506042480469,
427
+ "logps/rejected": -452.3603515625,
428
+ "loss": 0.7185,
429
+ "rewards/accuracies": 0.768750011920929,
430
+ "rewards/chosen": -1.002969741821289,
431
+ "rewards/margins": 1.018422245979309,
432
+ "rewards/rejected": -2.0213921070098877,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.29,
437
  "learning_rate": 4.4549858303465737e-07,
438
+ "logits/chosen": 0.21997830271720886,
439
+ "logits/rejected": 0.8937602043151855,
440
+ "logps/chosen": -420.73773193359375,
441
+ "logps/rejected": -503.6924743652344,
442
+ "loss": 0.6991,
443
+ "rewards/accuracies": 0.78125,
444
+ "rewards/chosen": -1.295493721961975,
445
+ "rewards/margins": 0.9076651334762573,
446
+ "rewards/rejected": -2.2031588554382324,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.3,
451
  "learning_rate": 4.396703177135261e-07,
452
+ "logits/chosen": 0.11293928325176239,
453
+ "logits/rejected": 0.5330738425254822,
454
+ "logps/chosen": -414.304931640625,
455
+ "logps/rejected": -458.75946044921875,
456
+ "loss": 0.7064,
457
+ "rewards/accuracies": 0.737500011920929,
458
+ "rewards/chosen": -1.2927477359771729,
459
+ "rewards/margins": 0.7714477777481079,
460
+ "rewards/rejected": -2.0641958713531494,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.31,
465
  "learning_rate": 4.335883851539693e-07,
466
+ "logits/chosen": 0.7407528162002563,
467
+ "logits/rejected": 1.589734435081482,
468
+ "logps/chosen": -405.79022216796875,
469
+ "logps/rejected": -488.19036865234375,
470
+ "loss": 0.7079,
471
+ "rewards/accuracies": 0.7749999761581421,
472
+ "rewards/chosen": -1.223577857017517,
473
+ "rewards/margins": 1.1488986015319824,
474
+ "rewards/rejected": -2.372476816177368,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.31,
479
+ "eval_logits/chosen": 0.87442946434021,
480
+ "eval_logits/rejected": 1.8370469808578491,
481
+ "eval_logps/chosen": -422.4754333496094,
482
+ "eval_logps/rejected": -510.4591064453125,
483
+ "eval_loss": 0.7289105653762817,
484
+ "eval_rewards/accuracies": 0.7559523582458496,
485
+ "eval_rewards/chosen": -1.383657455444336,
486
+ "eval_rewards/margins": 1.1031482219696045,
487
+ "eval_rewards/rejected": -2.4868052005767822,
488
+ "eval_runtime": 242.4112,
489
+ "eval_samples_per_second": 8.25,
490
  "eval_steps_per_second": 0.26,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.32,
495
  "learning_rate": 4.272609194017105e-07,
496
+ "logits/chosen": 0.7091141939163208,
497
+ "logits/rejected": 1.7715873718261719,
498
+ "logps/chosen": -404.2896728515625,
499
+ "logps/rejected": -514.3563842773438,
500
+ "loss": 0.6597,
501
+ "rewards/accuracies": 0.7749999761581421,
502
+ "rewards/chosen": -1.2207536697387695,
503
+ "rewards/margins": 1.231592059135437,
504
+ "rewards/rejected": -2.452345848083496,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.33,
509
  "learning_rate": 4.2069638288135547e-07,
510
+ "logits/chosen": 0.6078277826309204,
511
+ "logits/rejected": 1.4416134357452393,
512
+ "logps/chosen": -417.4878845214844,
513
+ "logps/rejected": -491.2012634277344,
514
+ "loss": 0.7219,
515
+ "rewards/accuracies": 0.7124999761581421,
516
+ "rewards/chosen": -1.358330249786377,
517
+ "rewards/margins": 0.8599346280097961,
518
+ "rewards/rejected": -2.2182650566101074,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.35,
523
  "learning_rate": 4.139035550786494e-07,
524
+ "logits/chosen": 0.2845739424228668,
525
+ "logits/rejected": 0.876905620098114,
526
+ "logps/chosen": -394.2958068847656,
527
+ "logps/rejected": -421.72393798828125,
528
+ "loss": 0.7567,
529
  "rewards/accuracies": 0.7437499761581421,
530
+ "rewards/chosen": -1.147202730178833,
531
+ "rewards/margins": 0.7379333972930908,
532
+ "rewards/rejected": -1.8851358890533447,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.36,
537
  "learning_rate": 4.0689152079869306e-07,
538
+ "logits/chosen": -0.5365289449691772,
539
+ "logits/rejected": 0.34862279891967773,
540
+ "logps/chosen": -342.41485595703125,
541
+ "logps/rejected": -388.524658203125,
542
+ "loss": 0.776,
543
+ "rewards/accuracies": 0.6499999761581421,
544
+ "rewards/chosen": -0.9666112065315247,
545
+ "rewards/margins": 0.6763932108879089,
546
+ "rewards/rejected": -1.6430044174194336,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.37,
551
  "learning_rate": 3.99669658015821e-07,
552
+ "logits/chosen": 0.45323339104652405,
553
+ "logits/rejected": 0.7332956194877625,
554
+ "logps/chosen": -399.76519775390625,
555
+ "logps/rejected": -506.49658203125,
556
+ "loss": 0.7213,
557
+ "rewards/accuracies": 0.7124999761581421,
558
+ "rewards/chosen": -1.27635657787323,
559
+ "rewards/margins": 0.998461127281189,
560
+ "rewards/rejected": -2.2748172283172607,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.38,
565
  "learning_rate": 3.92247625331392e-07,
566
+ "logits/chosen": 0.6511309742927551,
567
+ "logits/rejected": 1.2110098600387573,
568
+ "logps/chosen": -422.53778076171875,
569
+ "logps/rejected": -472.4964294433594,
570
+ "loss": 0.7003,
571
+ "rewards/accuracies": 0.7562500238418579,
572
+ "rewards/chosen": -1.3980082273483276,
573
+ "rewards/margins": 0.8997832536697388,
574
+ "rewards/rejected": -2.2977914810180664,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.39,
579
  "learning_rate": 3.846353490562664e-07,
580
+ "logits/chosen": 0.30174368619918823,
581
+ "logits/rejected": 0.9621099233627319,
582
+ "logps/chosen": -372.94635009765625,
583
+ "logps/rejected": -497.25714111328125,
584
+ "loss": 0.6659,
585
  "rewards/accuracies": 0.7749999761581421,
586
+ "rewards/chosen": -1.3280470371246338,
587
+ "rewards/margins": 1.1047414541244507,
588
+ "rewards/rejected": -2.432788372039795,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.4,
593
  "learning_rate": 3.768430099352445e-07,
594
+ "logits/chosen": -0.1737074851989746,
595
+ "logits/rejected": 1.1676225662231445,
596
+ "logps/chosen": -460.0953063964844,
597
+ "logps/rejected": -537.3989868164062,
598
+ "loss": 0.6788,
599
+ "rewards/accuracies": 0.75,
600
+ "rewards/chosen": -1.6348745822906494,
601
+ "rewards/margins": 1.1145647764205933,
602
+ "rewards/rejected": -2.7494394779205322,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 0.41,
607
  "learning_rate": 3.6888102953122304e-07,
608
+ "logits/chosen": -0.2348991185426712,
609
+ "logits/rejected": 0.7535260915756226,
610
+ "logps/chosen": -421.128173828125,
611
+ "logps/rejected": -482.41656494140625,
612
+ "loss": 0.7027,
613
+ "rewards/accuracies": 0.762499988079071,
614
+ "rewards/chosen": -1.4536683559417725,
615
+ "rewards/margins": 1.019152045249939,
616
+ "rewards/rejected": -2.47282075881958,
617
  "step": 390
618
  },
619
  {
620
  "epoch": 0.42,
621
  "learning_rate": 3.607600562872785e-07,
622
+ "logits/chosen": -0.24078145623207092,
623
+ "logits/rejected": 0.8144145011901855,
624
+ "logps/chosen": -454.4244689941406,
625
+ "logps/rejected": -500.9457092285156,
626
+ "loss": 0.6806,
627
+ "rewards/accuracies": 0.699999988079071,
628
+ "rewards/chosen": -1.5004165172576904,
629
+ "rewards/margins": 0.854039192199707,
630
+ "rewards/rejected": -2.3544554710388184,
631
  "step": 400
632
  },
633
  {
634
  "epoch": 0.42,
635
+ "eval_logits/chosen": 0.09922664612531662,
636
+ "eval_logits/rejected": 1.2713433504104614,
637
+ "eval_logps/chosen": -416.9630432128906,
638
+ "eval_logps/rejected": -503.67401123046875,
639
+ "eval_loss": 0.7040213346481323,
640
+ "eval_rewards/accuracies": 0.7698412537574768,
641
+ "eval_rewards/chosen": -1.3285325765609741,
642
+ "eval_rewards/margins": 1.0904221534729004,
643
+ "eval_rewards/rejected": -2.418954849243164,
644
+ "eval_runtime": 242.7344,
645
+ "eval_samples_per_second": 8.239,
646
+ "eval_steps_per_second": 0.26,
647
  "step": 400
648
  },
649
  {
650
  "epoch": 0.43,
651
  "learning_rate": 3.5249095128531856e-07,
652
+ "logits/chosen": -0.06878291815519333,
653
+ "logits/rejected": 1.236665964126587,
654
+ "logps/chosen": -443.3794860839844,
655
+ "logps/rejected": -522.2283325195312,
656
+ "loss": 0.7085,
657
+ "rewards/accuracies": 0.75,
658
+ "rewards/chosen": -1.2952146530151367,
659
+ "rewards/margins": 1.0169426202774048,
660
+ "rewards/rejected": -2.312157154083252,
661
  "step": 410
662
  },
663
  {
664
  "epoch": 0.44,
665
  "learning_rate": 3.4408477372034736e-07,
666
+ "logits/chosen": 0.041589152067899704,
667
+ "logits/rejected": 1.3509619235992432,
668
+ "logps/chosen": -390.40008544921875,
669
+ "logps/rejected": -440.0779724121094,
670
+ "loss": 0.7102,
671
+ "rewards/accuracies": 0.6875,
672
+ "rewards/chosen": -1.2629300355911255,
673
+ "rewards/margins": 0.8384572863578796,
674
+ "rewards/rejected": -2.1013875007629395,
675
  "step": 420
676
  },
677
  {
678
  "epoch": 0.45,
679
  "learning_rate": 3.3555276610977276e-07,
680
+ "logits/chosen": 0.06685711443424225,
681
+ "logits/rejected": 1.2138116359710693,
682
+ "logps/chosen": -372.45196533203125,
683
+ "logps/rejected": -431.705078125,
684
+ "loss": 0.7376,
685
+ "rewards/accuracies": 0.7562500238418579,
686
+ "rewards/chosen": -1.1025346517562866,
687
+ "rewards/margins": 0.8204299211502075,
688
+ "rewards/rejected": -1.9229644536972046,
689
  "step": 430
690
  },
691
  {
692
  "epoch": 0.46,
693
  "learning_rate": 3.269063392575352e-07,
694
+ "logits/chosen": 0.9198445081710815,
695
+ "logits/rejected": 0.9934859275817871,
696
+ "logps/chosen": -396.35968017578125,
697
+ "logps/rejected": -476.847900390625,
698
+ "loss": 0.6966,
699
+ "rewards/accuracies": 0.7250000238418579,
700
+ "rewards/chosen": -1.3736763000488281,
701
+ "rewards/margins": 0.9061026573181152,
702
+ "rewards/rejected": -2.2797789573669434,
703
  "step": 440
704
  },
705
  {
706
  "epoch": 0.47,
707
  "learning_rate": 3.1815705699316964e-07,
708
+ "logits/chosen": 0.9360873103141785,
709
+ "logits/rejected": 1.5897537469863892,
710
+ "logps/chosen": -401.81207275390625,
711
+ "logps/rejected": -485.36041259765625,
712
+ "loss": 0.715,
713
  "rewards/accuracies": 0.762499988079071,
714
+ "rewards/chosen": -1.3765710592269897,
715
+ "rewards/margins": 1.040223479270935,
716
+ "rewards/rejected": -2.416795015335083,
717
  "step": 450
718
  },
719
  {
720
  "epoch": 0.48,
721
  "learning_rate": 3.0931662070620794e-07,
722
+ "logits/chosen": 0.9600452184677124,
723
+ "logits/rejected": 2.169283866882324,
724
+ "logps/chosen": -413.7256774902344,
725
+ "logps/rejected": -508.83892822265625,
726
+ "loss": 0.6982,
727
+ "rewards/accuracies": 0.75,
728
+ "rewards/chosen": -1.4869290590286255,
729
+ "rewards/margins": 1.0339877605438232,
730
+ "rewards/rejected": -2.5209171772003174,
731
  "step": 460
732
  },
733
  {
734
  "epoch": 0.49,
735
  "learning_rate": 3.003968536966078e-07,
736
+ "logits/chosen": 1.2197582721710205,
737
+ "logits/rejected": 2.1600019931793213,
738
+ "logps/chosen": -441.291748046875,
739
+ "logps/rejected": -515.36083984375,
740
+ "loss": 0.6864,
741
+ "rewards/accuracies": 0.793749988079071,
742
+ "rewards/chosen": -1.4309029579162598,
743
+ "rewards/margins": 1.1178550720214844,
744
+ "rewards/rejected": -2.548758029937744,
745
  "step": 470
746
  },
747
  {
748
  "epoch": 0.5,
749
  "learning_rate": 2.9140968536213693e-07,
750
+ "logits/chosen": 1.2262110710144043,
751
+ "logits/rejected": 2.440544843673706,
752
+ "logps/chosen": -372.9617919921875,
753
+ "logps/rejected": -461.28765869140625,
754
+ "loss": 0.7197,
755
+ "rewards/accuracies": 0.731249988079071,
756
+ "rewards/chosen": -1.4022343158721924,
757
+ "rewards/margins": 0.9221154451370239,
758
+ "rewards/rejected": -2.324349880218506,
759
  "step": 480
760
  },
761
  {
762
  "epoch": 0.51,
763
  "learning_rate": 2.823671352438608e-07,
764
+ "logits/chosen": 1.2201405763626099,
765
+ "logits/rejected": 2.058537721633911,
766
+ "logps/chosen": -409.15814208984375,
767
+ "logps/rejected": -467.56573486328125,
768
+ "loss": 0.6978,
769
+ "rewards/accuracies": 0.706250011920929,
770
+ "rewards/chosen": -1.334566354751587,
771
+ "rewards/margins": 0.8473381996154785,
772
+ "rewards/rejected": -2.1819043159484863,
773
  "step": 490
774
  },
775
  {
776
  "epoch": 0.52,
777
  "learning_rate": 2.73281296951072e-07,
778
+ "logits/chosen": 1.904309868812561,
779
+ "logits/rejected": 2.613346576690674,
780
+ "logps/chosen": -433.7508850097656,
781
+ "logps/rejected": -524.71044921875,
782
+ "loss": 0.7129,
783
+ "rewards/accuracies": 0.768750011920929,
784
+ "rewards/chosen": -1.621690034866333,
785
+ "rewards/margins": 1.130249261856079,
786
+ "rewards/rejected": -2.751939058303833,
787
  "step": 500
788
  },
789
  {
790
  "epoch": 0.52,
791
+ "eval_logits/chosen": 1.4090828895568848,
792
+ "eval_logits/rejected": 2.334313154220581,
793
+ "eval_logps/chosen": -430.316650390625,
794
+ "eval_logps/rejected": -514.4609375,
795
+ "eval_loss": 0.6979612112045288,
796
+ "eval_rewards/accuracies": 0.7440476417541504,
797
+ "eval_rewards/chosen": -1.4620689153671265,
798
+ "eval_rewards/margins": 1.0647554397583008,
799
+ "eval_rewards/rejected": -2.5268242359161377,
800
+ "eval_runtime": 243.0213,
801
+ "eval_samples_per_second": 8.23,
802
  "eval_steps_per_second": 0.259,
803
  "step": 500
804
  },
805
  {
806
  "epoch": 0.53,
807
  "learning_rate": 2.641643219871597e-07,
808
+ "logits/chosen": 1.5503180027008057,
809
+ "logits/rejected": 2.431549072265625,
810
+ "logps/chosen": -441.70721435546875,
811
+ "logps/rejected": -510.1366271972656,
812
+ "loss": 0.6483,
813
+ "rewards/accuracies": 0.8187500238418579,
814
+ "rewards/chosen": -1.403807282447815,
815
+ "rewards/margins": 1.1817026138305664,
816
+ "rewards/rejected": -2.585510015487671,
817
  "step": 510
818
  },
819
  {
820
  "epoch": 0.54,
821
  "learning_rate": 2.550284034980507e-07,
822
+ "logits/chosen": 1.6724971532821655,
823
+ "logits/rejected": 2.7851836681365967,
824
+ "logps/chosen": -426.255859375,
825
+ "logps/rejected": -528.8610229492188,
826
+ "loss": 0.701,
827
+ "rewards/accuracies": 0.7437499761581421,
828
+ "rewards/chosen": -1.6710426807403564,
829
+ "rewards/margins": 1.100687861442566,
830
+ "rewards/rejected": -2.771730661392212,
831
  "step": 520
832
  },
833
  {
834
  "epoch": 0.55,
835
  "learning_rate": 2.4588575996495794e-07,
836
+ "logits/chosen": 1.8039824962615967,
837
+ "logits/rejected": 2.613548755645752,
838
+ "logps/chosen": -449.572265625,
839
+ "logps/rejected": -537.4893798828125,
840
+ "loss": 0.6894,
841
+ "rewards/accuracies": 0.737500011920929,
842
+ "rewards/chosen": -1.654245376586914,
843
+ "rewards/margins": 1.1508926153182983,
844
+ "rewards/rejected": -2.805138111114502,
845
  "step": 530
846
  },
847
  {
848
  "epoch": 0.57,
849
  "learning_rate": 2.367486188632446e-07,
850
+ "logits/chosen": 1.2713382244110107,
851
+ "logits/rejected": 2.1325278282165527,
852
+ "logps/chosen": -456.769775390625,
853
+ "logps/rejected": -589.4976806640625,
854
+ "loss": 0.6831,
855
+ "rewards/accuracies": 0.75,
856
+ "rewards/chosen": -1.6260936260223389,
857
+ "rewards/margins": 1.2662837505340576,
858
+ "rewards/rejected": -2.8923773765563965,
859
  "step": 540
860
  },
861
  {
862
  "epoch": 0.58,
863
  "learning_rate": 2.276292003092593e-07,
864
+ "logits/chosen": 1.5338995456695557,
865
+ "logits/rejected": 2.471559524536133,
866
+ "logps/chosen": -422.6114807128906,
867
+ "logps/rejected": -517.1149291992188,
868
+ "loss": 0.7116,
869
  "rewards/accuracies": 0.75,
870
+ "rewards/chosen": -1.5288379192352295,
871
+ "rewards/margins": 1.1625785827636719,
872
+ "rewards/rejected": -2.6914165019989014,
873
  "step": 550
874
  },
875
  {
876
  "epoch": 0.59,
877
  "learning_rate": 2.185397007170141e-07,
878
+ "logits/chosen": 1.3305785655975342,
879
+ "logits/rejected": 1.9697010517120361,
880
+ "logps/chosen": -409.57635498046875,
881
+ "logps/rejected": -467.7806091308594,
882
+ "loss": 0.7032,
883
  "rewards/accuracies": 0.731249988079071,
884
+ "rewards/chosen": -1.4281054735183716,
885
+ "rewards/margins": 0.915216326713562,
886
+ "rewards/rejected": -2.3433218002319336,
887
  "step": 560
888
  },
889
  {
890
  "epoch": 0.6,
891
  "learning_rate": 2.094922764865619e-07,
892
+ "logits/chosen": 1.1964863538742065,
893
+ "logits/rejected": 2.188833236694336,
894
+ "logps/chosen": -427.9024353027344,
895
+ "logps/rejected": -497.96917724609375,
896
+ "loss": 0.7036,
897
+ "rewards/accuracies": 0.737500011920929,
898
+ "rewards/chosen": -1.5176267623901367,
899
+ "rewards/margins": 0.8729730844497681,
900
+ "rewards/rejected": -2.3905997276306152,
901
  "step": 570
902
  },
903
  {
904
  "epoch": 0.61,
905
  "learning_rate": 2.0049902774588797e-07,
906
+ "logits/chosen": 1.2527071237564087,
907
+ "logits/rejected": 2.2398314476013184,
908
+ "logps/chosen": -436.9583435058594,
909
+ "logps/rejected": -504.8207092285156,
910
+ "loss": 0.6924,
911
+ "rewards/accuracies": 0.768750011920929,
912
+ "rewards/chosen": -1.6303842067718506,
913
+ "rewards/margins": 1.0391124486923218,
914
+ "rewards/rejected": -2.669496774673462,
915
  "step": 580
916
  },
917
  {
918
  "epoch": 0.62,
919
  "learning_rate": 1.9157198216806238e-07,
920
+ "logits/chosen": 0.6701461672782898,
921
+ "logits/rejected": 1.8555580377578735,
922
+ "logps/chosen": -405.43902587890625,
923
+ "logps/rejected": -496.9740295410156,
924
+ "loss": 0.6922,
925
+ "rewards/accuracies": 0.706250011920929,
926
+ "rewards/chosen": -1.323939561843872,
927
+ "rewards/margins": 0.8526731729507446,
928
+ "rewards/rejected": -2.176612615585327,
929
  "step": 590
930
  },
931
  {
932
  "epoch": 0.63,
933
  "learning_rate": 1.8272307888529274e-07,
934
+ "logits/chosen": 0.462153822183609,
935
+ "logits/rejected": 1.8794240951538086,
936
+ "logps/chosen": -451.150634765625,
937
+ "logps/rejected": -545.4595947265625,
938
+ "loss": 0.6636,
939
+ "rewards/accuracies": 0.7562500238418579,
940
+ "rewards/chosen": -1.2517478466033936,
941
+ "rewards/margins": 1.0819759368896484,
942
+ "rewards/rejected": -2.333723783493042,
943
  "step": 600
944
  },
945
  {
946
  "epoch": 0.63,
947
+ "eval_logits/chosen": 0.7470372319221497,
948
+ "eval_logits/rejected": 2.2081830501556396,
949
+ "eval_logps/chosen": -417.3849792480469,
950
+ "eval_logps/rejected": -513.6627197265625,
951
+ "eval_loss": 0.6876600980758667,
952
+ "eval_rewards/accuracies": 0.75,
953
+ "eval_rewards/chosen": -1.3327523469924927,
954
+ "eval_rewards/margins": 1.1860896348953247,
955
+ "eval_rewards/rejected": -2.5188419818878174,
956
+ "eval_runtime": 244.0904,
957
+ "eval_samples_per_second": 8.194,
958
  "eval_steps_per_second": 0.258,
959
  "step": 600
960
  },
961
  {
962
  "epoch": 0.64,
963
  "learning_rate": 1.7396415252139288e-07,
964
+ "logits/chosen": 1.345348834991455,
965
+ "logits/rejected": 2.903435707092285,
966
+ "logps/chosen": -418.60797119140625,
967
+ "logps/rejected": -482.93035888671875,
968
+ "loss": 0.6686,
969
+ "rewards/accuracies": 0.75,
970
+ "rewards/chosen": -1.4358808994293213,
971
+ "rewards/margins": 1.1490730047225952,
972
+ "rewards/rejected": -2.584954261779785,
973
  "step": 610
974
  },
975
  {
976
  "epoch": 0.65,
977
  "learning_rate": 1.6530691736402316e-07,
978
+ "logits/chosen": 2.0353918075561523,
979
+ "logits/rejected": 3.255995512008667,
980
+ "logps/chosen": -455.14959716796875,
981
+ "logps/rejected": -525.127685546875,
982
+ "loss": 0.6894,
983
+ "rewards/accuracies": 0.7250000238418579,
984
+ "rewards/chosen": -1.831215500831604,
985
+ "rewards/margins": 1.118740200996399,
986
+ "rewards/rejected": -2.949955701828003,
987
  "step": 620
988
  },
989
  {
990
  "epoch": 0.66,
991
  "learning_rate": 1.5676295169786864e-07,
992
+ "logits/chosen": 2.8849244117736816,
993
+ "logits/rejected": 3.68190336227417,
994
+ "logps/chosen": -441.6712951660156,
995
+ "logps/rejected": -579.8939819335938,
996
+ "loss": 0.6633,
997
+ "rewards/accuracies": 0.7875000238418579,
998
+ "rewards/chosen": -1.800830602645874,
999
+ "rewards/margins": 1.4441092014312744,
1000
+ "rewards/rejected": -3.2449398040771484,
1001
  "step": 630
1002
  },
1003
  {
1004
  "epoch": 0.67,
1005
  "learning_rate": 1.483436823197092e-07,
1006
+ "logits/chosen": 1.9812686443328857,
1007
+ "logits/rejected": 2.8578484058380127,
1008
+ "logps/chosen": -427.90460205078125,
1009
+ "logps/rejected": -523.0911865234375,
1010
+ "loss": 0.6953,
1011
  "rewards/accuracies": 0.800000011920929,
1012
+ "rewards/chosen": -1.432652235031128,
1013
+ "rewards/margins": 1.2259643077850342,
1014
+ "rewards/rejected": -2.658616542816162,
1015
  "step": 640
1016
  },
1017
  {
1018
  "epoch": 0.68,
1019
  "learning_rate": 1.4006036925609243e-07,
1020
+ "logits/chosen": 1.2402979135513306,
1021
+ "logits/rejected": 2.491854190826416,
1022
+ "logps/chosen": -458.84344482421875,
1023
+ "logps/rejected": -508.12847900390625,
1024
+ "loss": 0.6884,
1025
+ "rewards/accuracies": 0.768750011920929,
1026
+ "rewards/chosen": -1.5252853631973267,
1027
+ "rewards/margins": 0.9278051257133484,
1028
+ "rewards/rejected": -2.4530904293060303,
1029
  "step": 650
1030
  },
1031
  {
1032
  "epoch": 0.69,
1033
  "learning_rate": 1.319240907040458e-07,
1034
+ "logits/chosen": 1.5803894996643066,
1035
+ "logits/rejected": 2.081526756286621,
1036
+ "logps/chosen": -454.36309814453125,
1037
+ "logps/rejected": -526.6981201171875,
1038
+ "loss": 0.6824,
1039
+ "rewards/accuracies": 0.75,
1040
+ "rewards/chosen": -1.5280418395996094,
1041
+ "rewards/margins": 1.0684349536895752,
1042
+ "rewards/rejected": -2.5964770317077637,
1043
  "step": 660
1044
  },
1045
  {
1046
  "epoch": 0.7,
1047
  "learning_rate": 1.239457282149695e-07,
1048
+ "logits/chosen": 1.9639816284179688,
1049
+ "logits/rejected": 3.2135062217712402,
1050
+ "logps/chosen": -435.85888671875,
1051
+ "logps/rejected": -534.4019775390625,
1052
+ "loss": 0.6736,
1053
+ "rewards/accuracies": 0.737500011920929,
1054
+ "rewards/chosen": -1.5270483493804932,
1055
+ "rewards/margins": 1.1297345161437988,
1056
+ "rewards/rejected": -2.656782627105713,
1057
  "step": 670
1058
  },
1059
  {
1060
  "epoch": 0.71,
1061
  "learning_rate": 1.1613595214152711e-07,
1062
+ "logits/chosen": 2.091609477996826,
1063
+ "logits/rejected": 2.95405912399292,
1064
+ "logps/chosen": -404.2757873535156,
1065
+ "logps/rejected": -456.81732177734375,
1066
+ "loss": 0.7108,
1067
+ "rewards/accuracies": 0.6312500238418579,
1068
+ "rewards/chosen": -1.4280402660369873,
1069
+ "rewards/margins": 0.7583026885986328,
1070
+ "rewards/rejected": -2.186342716217041,
1071
  "step": 680
1072
  },
1073
  {
1074
  "epoch": 0.72,
1075
  "learning_rate": 1.0850520736699362e-07,
1076
+ "logits/chosen": 1.712774634361267,
1077
+ "logits/rejected": 3.230499267578125,
1078
+ "logps/chosen": -395.8936767578125,
1079
+ "logps/rejected": -499.32965087890625,
1080
+ "loss": 0.655,
1081
+ "rewards/accuracies": 0.7875000238418579,
1082
+ "rewards/chosen": -1.3443926572799683,
1083
+ "rewards/margins": 1.2300078868865967,
1084
+ "rewards/rejected": -2.5744004249572754,
1085
  "step": 690
1086
  },
1087
  {
1088
  "epoch": 0.73,
1089
  "learning_rate": 1.0106369933615042e-07,
1090
+ "logits/chosen": 2.6943306922912598,
1091
+ "logits/rejected": 4.300943851470947,
1092
+ "logps/chosen": -449.5069274902344,
1093
+ "logps/rejected": -574.8884887695312,
1094
+ "loss": 0.6217,
1095
+ "rewards/accuracies": 0.7875000238418579,
1096
+ "rewards/chosen": -1.9530900716781616,
1097
+ "rewards/margins": 1.3479530811309814,
1098
+ "rewards/rejected": -3.3010432720184326,
1099
  "step": 700
1100
  },
1101
  {
1102
  "epoch": 0.73,
1103
+ "eval_logits/chosen": 2.593170166015625,
1104
+ "eval_logits/rejected": 3.816297769546509,
1105
+ "eval_logps/chosen": -473.18865966796875,
1106
+ "eval_logps/rejected": -579.6353759765625,
1107
+ "eval_loss": 0.6762357354164124,
1108
+ "eval_rewards/accuracies": 0.7698412537574768,
1109
+ "eval_rewards/chosen": -1.8907891511917114,
1110
+ "eval_rewards/margins": 1.2877792119979858,
1111
+ "eval_rewards/rejected": -3.1785686016082764,
1112
+ "eval_runtime": 244.2095,
1113
+ "eval_samples_per_second": 8.19,
1114
+ "eval_steps_per_second": 0.258,
1115
  "step": 700
1116
  },
1117
  {
1118
  "epoch": 0.74,
1119
  "learning_rate": 9.382138040640714e-08,
1120
+ "logits/chosen": 2.803864002227783,
1121
+ "logits/rejected": 3.797267198562622,
1122
+ "logps/chosen": -505.75677490234375,
1123
+ "logps/rejected": -552.4486083984375,
1124
+ "loss": 0.6429,
1125
+ "rewards/accuracies": 0.7437499761581421,
1126
+ "rewards/chosen": -2.0083601474761963,
1127
+ "rewards/margins": 1.1107932329177856,
1128
+ "rewards/rejected": -3.1191532611846924,
1129
  "step": 710
1130
  },
1131
  {
1132
  "epoch": 0.75,
1133
  "learning_rate": 8.678793653740632e-08,
1134
+ "logits/chosen": 3.4752883911132812,
1135
+ "logits/rejected": 4.102308750152588,
1136
+ "logps/chosen": -513.325439453125,
1137
+ "logps/rejected": -616.204345703125,
1138
+ "loss": 0.6616,
1139
+ "rewards/accuracies": 0.7562500238418579,
1140
+ "rewards/chosen": -2.2190704345703125,
1141
+ "rewards/margins": 1.2776607275009155,
1142
+ "rewards/rejected": -3.4967312812805176,
1143
  "step": 720
1144
  },
1145
  {
1146
  "epoch": 0.76,
1147
  "learning_rate": 7.997277433690983e-08,
1148
+ "logits/chosen": 3.118082046508789,
1149
+ "logits/rejected": 4.068647861480713,
1150
+ "logps/chosen": -515.4818725585938,
1151
+ "logps/rejected": -578.458984375,
1152
+ "loss": 0.6692,
1153
+ "rewards/accuracies": 0.78125,
1154
+ "rewards/chosen": -2.115870475769043,
1155
+ "rewards/margins": 1.1491236686706543,
1156
+ "rewards/rejected": -3.2649941444396973,
1157
  "step": 730
1158
  },
1159
  {
1160
  "epoch": 0.77,
1161
  "learning_rate": 7.338500848029602e-08,
1162
+ "logits/chosen": 3.267651319503784,
1163
+ "logits/rejected": 4.356374263763428,
1164
+ "logps/chosen": -517.941650390625,
1165
+ "logps/rejected": -612.4710693359375,
1166
+ "loss": 0.6626,
1167
+ "rewards/accuracies": 0.831250011920929,
1168
+ "rewards/chosen": -2.0364277362823486,
1169
+ "rewards/margins": 1.4974032640457153,
1170
+ "rewards/rejected": -3.5338311195373535,
1171
  "step": 740
1172
  },
1173
  {
1174
  "epoch": 0.78,
1175
  "learning_rate": 6.70334495204884e-08,
1176
+ "logits/chosen": 2.9188966751098633,
1177
+ "logits/rejected": 3.9505248069763184,
1178
+ "logps/chosen": -495.83734130859375,
1179
+ "logps/rejected": -612.5488891601562,
1180
+ "loss": 0.6476,
1181
+ "rewards/accuracies": 0.762499988079071,
1182
+ "rewards/chosen": -2.124872922897339,
1183
+ "rewards/margins": 1.2153751850128174,
1184
+ "rewards/rejected": -3.3402485847473145,
1185
  "step": 750
1186
  },
1187
  {
1188
  "epoch": 0.8,
1189
  "learning_rate": 6.092659210462231e-08,
1190
+ "logits/chosen": 3.145782709121704,
1191
+ "logits/rejected": 3.7394192218780518,
1192
+ "logps/chosen": -498.886474609375,
1193
+ "logps/rejected": -587.1813354492188,
1194
+ "loss": 0.6186,
1195
+ "rewards/accuracies": 0.7562500238418579,
1196
+ "rewards/chosen": -2.2077949047088623,
1197
+ "rewards/margins": 1.1572462320327759,
1198
+ "rewards/rejected": -3.3650412559509277,
1199
  "step": 760
1200
  },
1201
  {
1202
  "epoch": 0.81,
1203
  "learning_rate": 5.507260361320737e-08,
1204
+ "logits/chosen": 3.364577531814575,
1205
+ "logits/rejected": 4.243520259857178,
1206
+ "logps/chosen": -541.8561401367188,
1207
+ "logps/rejected": -659.7272338867188,
1208
+ "loss": 0.6917,
1209
+ "rewards/accuracies": 0.731249988079071,
1210
+ "rewards/chosen": -2.4785799980163574,
1211
+ "rewards/margins": 1.123822569847107,
1212
+ "rewards/rejected": -3.602402448654175,
1213
  "step": 770
1214
  },
1215
  {
1216
  "epoch": 0.82,
1217
  "learning_rate": 4.947931323697982e-08,
1218
+ "logits/chosen": 3.392416477203369,
1219
+ "logits/rejected": 4.268471717834473,
1220
+ "logps/chosen": -480.16455078125,
1221
+ "logps/rejected": -572.0571899414062,
1222
+ "loss": 0.6843,
1223
  "rewards/accuracies": 0.706250011920929,
1224
+ "rewards/chosen": -2.2396440505981445,
1225
+ "rewards/margins": 1.116714596748352,
1226
+ "rewards/rejected": -3.356358766555786,
1227
  "step": 780
1228
  },
1229
  {
1230
  "epoch": 0.83,
1231
  "learning_rate": 4.415420150605398e-08,
1232
+ "logits/chosen": 2.9198615550994873,
1233
+ "logits/rejected": 3.857909679412842,
1234
+ "logps/chosen": -536.2203369140625,
1235
+ "logps/rejected": -663.2730102539062,
1236
+ "loss": 0.6706,
1237
+ "rewards/accuracies": 0.8062499761581421,
1238
+ "rewards/chosen": -2.358766794204712,
1239
+ "rewards/margins": 1.428501844406128,
1240
+ "rewards/rejected": -3.787268877029419,
1241
  "step": 790
1242
  },
1243
  {
1244
  "epoch": 0.84,
1245
  "learning_rate": 3.9104390285376374e-08,
1246
+ "logits/chosen": 2.381801128387451,
1247
+ "logits/rejected": 4.157925605773926,
1248
+ "logps/chosen": -550.6214599609375,
1249
+ "logps/rejected": -629.8056030273438,
1250
+ "loss": 0.6418,
1251
+ "rewards/accuracies": 0.7437499761581421,
1252
+ "rewards/chosen": -2.0941646099090576,
1253
+ "rewards/margins": 1.3854446411132812,
1254
+ "rewards/rejected": -3.4796090126037598,
1255
  "step": 800
1256
  },
1257
  {
1258
  "epoch": 0.84,
1259
+ "eval_logits/chosen": 2.6092491149902344,
1260
+ "eval_logits/rejected": 3.865464448928833,
1261
+ "eval_logps/chosen": -494.04217529296875,
1262
+ "eval_logps/rejected": -602.0606689453125,
1263
+ "eval_loss": 0.6711603403091431,
1264
  "eval_rewards/accuracies": 0.7678571343421936,
1265
+ "eval_rewards/chosen": -2.0993239879608154,
1266
+ "eval_rewards/margins": 1.3034968376159668,
1267
+ "eval_rewards/rejected": -3.4028208255767822,
1268
+ "eval_runtime": 243.1214,
1269
+ "eval_samples_per_second": 8.226,
1270
  "eval_steps_per_second": 0.259,
1271
  "step": 800
1272
  },
1273
  {
1274
  "epoch": 0.85,
1275
  "learning_rate": 3.433663324986208e-08,
1276
+ "logits/chosen": 3.123400926589966,
1277
+ "logits/rejected": 4.341358184814453,
1278
+ "logps/chosen": -507.13848876953125,
1279
+ "logps/rejected": -566.4569091796875,
1280
+ "loss": 0.6669,
1281
+ "rewards/accuracies": 0.7437499761581421,
1282
+ "rewards/chosen": -2.2766828536987305,
1283
+ "rewards/margins": 0.9975360035896301,
1284
+ "rewards/rejected": -3.274219036102295,
1285
  "step": 810
1286
  },
1287
  {
1288
  "epoch": 0.86,
1289
  "learning_rate": 2.9857306851953897e-08,
1290
+ "logits/chosen": 3.36864972114563,
1291
+ "logits/rejected": 3.826308488845825,
1292
+ "logps/chosen": -454.4519958496094,
1293
+ "logps/rejected": -553.3275756835938,
1294
+ "loss": 0.7048,
1295
  "rewards/accuracies": 0.762499988079071,
1296
+ "rewards/chosen": -2.0178139209747314,
1297
+ "rewards/margins": 1.1636488437652588,
1298
+ "rewards/rejected": -3.1814627647399902,
1299
  "step": 820
1300
  },
1301
  {
1302
  "epoch": 0.87,
1303
  "learning_rate": 2.567240179368185e-08,
1304
+ "logits/chosen": 2.7587532997131348,
1305
+ "logits/rejected": 3.9622387886047363,
1306
+ "logps/chosen": -461.407470703125,
1307
+ "logps/rejected": -577.1260986328125,
1308
+ "loss": 0.6538,
1309
+ "rewards/accuracies": 0.78125,
1310
+ "rewards/chosen": -2.125457286834717,
1311
+ "rewards/margins": 1.223615050315857,
1312
+ "rewards/rejected": -3.3490726947784424,
1313
  "step": 830
1314
  },
1315
  {
1316
  "epoch": 0.88,
1317
  "learning_rate": 2.1787515014630357e-08,
1318
+ "logits/chosen": 3.194129228591919,
1319
+ "logits/rejected": 3.398770809173584,
1320
+ "logps/chosen": -526.1636962890625,
1321
+ "logps/rejected": -602.7716064453125,
1322
+ "loss": 0.7025,
1323
+ "rewards/accuracies": 0.7250000238418579,
1324
+ "rewards/chosen": -2.1302103996276855,
1325
+ "rewards/margins": 1.0573285818099976,
1326
+ "rewards/rejected": -3.1875391006469727,
1327
  "step": 840
1328
  },
1329
  {
1330
  "epoch": 0.89,
1331
  "learning_rate": 1.820784220652766e-08,
1332
+ "logits/chosen": 2.8333277702331543,
1333
+ "logits/rejected": 3.949988842010498,
1334
+ "logps/chosen": -497.5380859375,
1335
+ "logps/rejected": -547.4320068359375,
1336
+ "loss": 0.6689,
1337
  "rewards/accuracies": 0.7437499761581421,
1338
+ "rewards/chosen": -1.9726041555404663,
1339
+ "rewards/margins": 1.0516941547393799,
1340
+ "rewards/rejected": -3.0242981910705566,
1341
  "step": 850
1342
  },
1343
  {
1344
  "epoch": 0.9,
1345
  "learning_rate": 1.4938170864468636e-08,
1346
+ "logits/chosen": 2.8082051277160645,
1347
+ "logits/rejected": 4.054238796234131,
1348
+ "logps/chosen": -488.4153747558594,
1349
+ "logps/rejected": -592.6768798828125,
1350
+ "loss": 0.6703,
1351
+ "rewards/accuracies": 0.731249988079071,
1352
+ "rewards/chosen": -2.0998573303222656,
1353
+ "rewards/margins": 1.3178421258926392,
1354
+ "rewards/rejected": -3.4176993370056152,
1355
  "step": 860
1356
  },
1357
  {
1358
  "epoch": 0.91,
1359
  "learning_rate": 1.1982873884064465e-08,
1360
+ "logits/chosen": 2.500764846801758,
1361
+ "logits/rejected": 3.581740140914917,
1362
+ "logps/chosen": -425.3038024902344,
1363
+ "logps/rejected": -571.4251708984375,
1364
+ "loss": 0.6579,
1365
+ "rewards/accuracies": 0.8187500238418579,
1366
+ "rewards/chosen": -1.7831649780273438,
1367
+ "rewards/margins": 1.468294382095337,
1368
+ "rewards/rejected": -3.2514591217041016,
1369
  "step": 870
1370
  },
1371
  {
1372
  "epoch": 0.92,
1373
  "learning_rate": 9.345903713082304e-09,
1374
+ "logits/chosen": 2.309072971343994,
1375
+ "logits/rejected": 3.692427158355713,
1376
+ "logps/chosen": -488.32049560546875,
1377
+ "logps/rejected": -586.3157958984375,
1378
+ "loss": 0.6445,
1379
+ "rewards/accuracies": 0.737500011920929,
1380
+ "rewards/chosen": -2.1420960426330566,
1381
+ "rewards/margins": 1.187731146812439,
1382
+ "rewards/rejected": -3.329827070236206,
1383
  "step": 880
1384
  },
1385
  {
1386
  "epoch": 0.93,
1387
  "learning_rate": 7.030787065396865e-09,
1388
+ "logits/chosen": 2.455681800842285,
1389
+ "logits/rejected": 3.950096607208252,
1390
+ "logps/chosen": -487.016357421875,
1391
+ "logps/rejected": -593.2111206054688,
1392
+ "loss": 0.701,
1393
+ "rewards/accuracies": 0.7250000238418579,
1394
+ "rewards/chosen": -2.2053864002227783,
1395
+ "rewards/margins": 1.07200026512146,
1396
+ "rewards/rejected": -3.277386426925659,
1397
  "step": 890
1398
  },
1399
  {
1400
  "epoch": 0.94,
1401
  "learning_rate": 5.04062020432286e-09,
1402
+ "logits/chosen": 2.607274293899536,
1403
+ "logits/rejected": 3.9918441772460938,
1404
+ "logps/chosen": -505.3465881347656,
1405
+ "logps/rejected": -603.4442138671875,
1406
+ "loss": 0.6678,
1407
  "rewards/accuracies": 0.71875,
1408
+ "rewards/chosen": -2.0696864128112793,
1409
+ "rewards/margins": 1.1221836805343628,
1410
+ "rewards/rejected": -3.1918704509735107,
1411
  "step": 900
1412
  },
1413
  {
1414
  "epoch": 0.94,
1415
+ "eval_logits/chosen": 2.4517972469329834,
1416
+ "eval_logits/rejected": 3.733224630355835,
1417
+ "eval_logps/chosen": -487.1844482421875,
1418
+ "eval_logps/rejected": -594.1102905273438,
1419
+ "eval_loss": 0.6716480851173401,
1420
+ "eval_rewards/accuracies": 0.7638888955116272,
1421
+ "eval_rewards/chosen": -2.030747175216675,
1422
+ "eval_rewards/margins": 1.2925708293914795,
1423
+ "eval_rewards/rejected": -3.323317766189575,
1424
+ "eval_runtime": 244.2155,
1425
+ "eval_samples_per_second": 8.189,
1426
+ "eval_steps_per_second": 0.258,
1427
  "step": 900
1428
  },
1429
  {
1430
  "epoch": 0.95,
1431
  "learning_rate": 3.3780648016376866e-09,
1432
+ "logits/chosen": 2.913886308670044,
1433
+ "logits/rejected": 4.131613731384277,
1434
+ "logps/chosen": -459.2386779785156,
1435
+ "logps/rejected": -578.1640014648438,
1436
+ "loss": 0.6622,
1437
+ "rewards/accuracies": 0.793749988079071,
1438
+ "rewards/chosen": -2.1169772148132324,
1439
+ "rewards/margins": 1.2496932744979858,
1440
+ "rewards/rejected": -3.3666698932647705,
1441
  "step": 910
1442
  },
1443
  {
1444
  "epoch": 0.96,
1445
  "learning_rate": 2.0453443778310766e-09,
1446
+ "logits/chosen": 2.6672825813293457,
1447
+ "logits/rejected": 3.8567919731140137,
1448
+ "logps/chosen": -503.61395263671875,
1449
+ "logps/rejected": -597.9525756835938,
1450
+ "loss": 0.6413,
1451
  "rewards/accuracies": 0.8187500238418579,
1452
+ "rewards/chosen": -2.079343557357788,
1453
+ "rewards/margins": 1.3192239999771118,
1454
+ "rewards/rejected": -3.3985676765441895,
1455
  "step": 920
1456
  },
1457
  {
1458
  "epoch": 0.97,
1459
  "learning_rate": 1.0442413283435758e-09,
1460
+ "logits/chosen": 2.472501754760742,
1461
+ "logits/rejected": 3.5483956336975098,
1462
+ "logps/chosen": -481.59515380859375,
1463
+ "logps/rejected": -590.560791015625,
1464
+ "loss": 0.6391,
1465
+ "rewards/accuracies": 0.8125,
1466
+ "rewards/chosen": -2.0882575511932373,
1467
+ "rewards/margins": 1.377394676208496,
1468
+ "rewards/rejected": -3.4656529426574707,
1469
  "step": 930
1470
  },
1471
  {
1472
  "epoch": 0.98,
1473
  "learning_rate": 3.760945397705828e-10,
1474
+ "logits/chosen": 2.6073191165924072,
1475
+ "logits/rejected": 3.743886947631836,
1476
+ "logps/chosen": -536.0969848632812,
1477
+ "logps/rejected": -624.2562255859375,
1478
+ "loss": 0.6442,
1479
+ "rewards/accuracies": 0.7562500238418579,
1480
+ "rewards/chosen": -2.063697338104248,
1481
+ "rewards/margins": 1.2653493881225586,
1482
+ "rewards/rejected": -3.3290467262268066,
1483
  "step": 940
1484
  },
1485
  {
1486
  "epoch": 0.99,
1487
  "learning_rate": 4.17975992204056e-11,
1488
+ "logits/chosen": 2.321948766708374,
1489
+ "logits/rejected": 3.977466583251953,
1490
+ "logps/chosen": -496.20025634765625,
1491
+ "logps/rejected": -609.7675170898438,
1492
+ "loss": 0.6933,
1493
+ "rewards/accuracies": 0.768750011920929,
1494
+ "rewards/chosen": -2.116821527481079,
1495
+ "rewards/margins": 1.348793387413025,
1496
+ "rewards/rejected": -3.4656143188476562,
1497
  "step": 950
1498
  },
1499
  {
1500
  "epoch": 1.0,
1501
  "step": 955,
1502
  "total_flos": 0.0,
1503
+ "train_loss": 0.7263204834224042,
1504
+ "train_runtime": 20734.7169,
1505
+ "train_samples_per_second": 2.948,
1506
+ "train_steps_per_second": 0.046
1507
  }
1508
  ],
1509
  "logging_steps": 10,
1510
  "max_steps": 955,
1511
  "num_input_tokens_seen": 0,
1512
  "num_train_epochs": 1,
1513
+ "save_steps": 100000000,
1514
  "total_flos": 0.0,
1515
  "train_batch_size": 8,
1516
  "trial_name": null,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1bd9cb35476a01b10da6b3209784859e726bf3cc59b947897695e5d5ffe6b13
3
- size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96176aeb701ce092111002b23ff886e7082dc5b7b988ccce9913225c1f4e42a5
3
+ size 6008