AmberYifan committed on
Commit cb37968 · verified · 1 Parent(s): f5961af

Model save
README.md ADDED
@@ -0,0 +1,68 @@
---
base_model: AmberYifan/llama2-7b-sft-ultrachat-safeRLHF
library_name: transformers
model_name: Llama-2-7b-relevant-clean-filtered
tags:
- generated_from_trainer
- trl
- dpo
license: license
---

# Model Card for Llama-2-7b-relevant-clean-filtered

This model is a fine-tuned version of [AmberYifan/llama2-7b-sft-ultrachat-safeRLHF](https://huggingface.co/AmberYifan/llama2-7b-sft-ultrachat-safeRLHF).
It has been trained using [TRL](https://github.com/huggingface/trl).

## Quick start

```python
from transformers import pipeline

question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
# The pipeline applies the model's chat template to the message list; drop device="cuda" to run on CPU.
generator = pipeline("text-generation", model="AmberYifan/Llama-2-7b-relevant-clean-filtered", device="cuda")
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
print(output["generated_text"])
```
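
The same generation can be done without the `pipeline` wrapper by applying the chat template by hand. This is a minimal sketch, not part of the original card; it assumes the repo ships a chat template and that `accelerate` is installed for `device_map="auto"`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "AmberYifan/Llama-2-7b-relevant-clean-filtered"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

# Build the prompt from chat messages, then generate and decode only the new tokens.
messages = [{"role": "user", "content": "Which would you choose: past or future?"}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True))
```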

## Training procedure

[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yifanwang/huggingface/runs/s6afx75w)

This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
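As a rough illustration of how such a run is set up with TRL's `DPOTrainer` (this sketch is not from the card: the preference-data file and the learning rate are placeholders; only the base model, the 3 epochs, and the per-device batch size of 8 appear in the accompanying training files):

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

base = "AmberYifan/llama2-7b-sft-ultrachat-safeRLHF"
model = AutoModelForCausalLM.from_pretrained(base)
tokenizer = AutoTokenizer.from_pretrained(base)

# Hypothetical preference data with "prompt", "chosen", "rejected" columns;
# the dataset actually used for this run is not part of the commit.
train_dataset = load_dataset("json", data_files="preferences.jsonl", split="train")

args = DPOConfig(
    output_dir="Llama-2-7b-relevant-clean-filtered",
    num_train_epochs=3,              # matches trainer_state.json
    per_device_train_batch_size=8,   # matches train_batch_size in trainer_state.json
    learning_rate=5e-7,              # placeholder; the logs show a peak LR near 4.9e-7
)
trainer = DPOTrainer(model=model, args=args, train_dataset=train_dataset, processing_class=tokenizer)
trainer.train()
```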
### Framework versions

- TRL: 0.12.2
- Transformers: 4.46.3
- PyTorch: 2.5.1+cu118
- Datasets: 3.2.0
- Tokenizers: 0.20.3

## Citations

Cite DPO as:

```bibtex
@inproceedings{rafailov2023direct,
    title     = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}},
    author    = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn},
    year      = 2023,
    booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023},
    url       = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html},
    editor    = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine}
}
```

Cite TRL as:

```bibtex
@misc{vonwerra2022trl,
    title        = {{TRL: Transformer Reinforcement Learning}},
    author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
    year         = 2020,
    journal      = {GitHub repository},
    publisher    = {GitHub},
    howpublished = {\url{https://github.com/huggingface/trl}}
}
```
all_results.json ADDED
@@ -0,0 +1,9 @@
{
  "epoch": 3.0,
  "total_flos": 0.0,
  "train_loss": 0.051078493730762065,
  "train_runtime": 2008.9755,
  "train_samples": 3321,
  "train_samples_per_second": 4.959,
  "train_steps_per_second": 0.155
}
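As a quick sanity check (not part of the commit), the throughput figures above are mutually consistent and imply an effective batch size of about 32, i.e. the per-device batch of 8 recorded in `trainer_state.json` times a factor of 4 from multiple devices and/or gradient accumulation (which of the two is not recorded here):

```python
# Recompute the reported rates from the raw numbers in all_results.json.
train_samples, epochs, runtime_s, steps = 3321, 3.0, 2008.9755, 312

samples_per_second = train_samples * epochs / runtime_s  # ≈ 4.959, as reported
steps_per_second = steps / runtime_s                     # ≈ 0.155, as reported
effective_batch = samples_per_second / steps_per_second  # ≈ 31.9, i.e. ~32
print(samples_per_second, steps_per_second, effective_batch)
```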
generation_config.json ADDED
@@ -0,0 +1,10 @@
{
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9,
  "transformers_version": "4.46.3"
}
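These decoding defaults (sampling at temperature 0.6, top-p 0.9, Llama-2 special-token ids) are picked up automatically by `generate`; as a small sketch, they can also be loaded and tweaked explicitly via `GenerationConfig`:

```python
from transformers import GenerationConfig

# Fetch the generation_config.json shipped with the model.
gen_config = GenerationConfig.from_pretrained("AmberYifan/Llama-2-7b-relevant-clean-filtered")
print(gen_config.temperature, gen_config.top_p, gen_config.do_sample)  # 0.6 0.9 True

# Individual fields can be overridden per call, e.g.
# model.generate(**inputs, generation_config=gen_config, max_new_tokens=64)
gen_config.temperature = 0.8
```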
train_results.json ADDED
@@ -0,0 +1,9 @@
{
  "epoch": 3.0,
  "total_flos": 0.0,
  "train_loss": 0.051078493730762065,
  "train_runtime": 2008.9755,
  "train_samples": 3321,
  "train_samples_per_second": 4.959,
  "train_steps_per_second": 0.155
}
trainer_state.json ADDED
@@ -0,0 +1,570 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 312,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009615384615384616,
      "grad_norm": 28.194065437278258,
      "learning_rate": 1.5625e-08,
      "logits/chosen": -1.609375,
      "logits/rejected": -1.6484375,
      "logps/chosen": -152.0,
      "logps/rejected": -127.5,
      "loss": 0.6914,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "epoch": 0.09615384615384616,
      "grad_norm": 26.916436077863562,
      "learning_rate": 1.5624999999999999e-07,
      "logits/chosen": -1.6015625,
      "logits/rejected": -1.484375,
      "logps/chosen": -149.0,
      "logps/rejected": -128.0,
      "loss": 0.6868,
      "rewards/accuracies": 0.3194444477558136,
      "rewards/chosen": -0.0045166015625,
      "rewards/margins": 0.01080322265625,
      "rewards/rejected": -0.0152587890625,
      "step": 10
    },
    {
      "epoch": 0.19230769230769232,
      "grad_norm": 20.740344035629946,
      "learning_rate": 3.1249999999999997e-07,
      "logits/chosen": -1.59375,
      "logits/rejected": -1.4765625,
      "logps/chosen": -160.0,
      "logps/rejected": -133.0,
      "loss": 0.5799,
      "rewards/accuracies": 0.9624999761581421,
      "rewards/chosen": 0.03662109375,
      "rewards/margins": 0.2734375,
      "rewards/rejected": -0.2373046875,
      "step": 20
    },
    {
      "epoch": 0.28846153846153844,
      "grad_norm": 6.570831756440352,
      "learning_rate": 4.6874999999999996e-07,
      "logits/chosen": -1.5703125,
      "logits/rejected": -1.484375,
      "logps/chosen": -142.0,
      "logps/rejected": -142.0,
      "loss": 0.254,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.0072021484375,
      "rewards/margins": 1.390625,
      "rewards/rejected": -1.3828125,
      "step": 30
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 0.3430324547403658,
      "learning_rate": 4.857142857142857e-07,
      "logits/chosen": -1.59375,
      "logits/rejected": -1.546875,
      "logps/chosen": -169.0,
      "logps/rejected": -180.0,
      "loss": 0.0296,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -0.341796875,
      "rewards/margins": 4.8125,
      "rewards/rejected": -5.15625,
      "step": 40
    },
    {
      "epoch": 0.4807692307692308,
      "grad_norm": 1.0998802903857687,
      "learning_rate": 4.6785714285714283e-07,
      "logits/chosen": -1.6484375,
      "logits/rejected": -1.5234375,
      "logps/chosen": -165.0,
      "logps/rejected": -200.0,
      "loss": 0.0098,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -0.7265625,
      "rewards/margins": 6.625,
      "rewards/rejected": -7.34375,
      "step": 50
    },
    {
      "epoch": 0.5769230769230769,
      "grad_norm": 2.398247571338227,
      "learning_rate": 4.5e-07,
      "logits/chosen": -1.625,
      "logits/rejected": -1.5703125,
      "logps/chosen": -173.0,
      "logps/rejected": -213.0,
      "loss": 0.0084,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -0.9609375,
      "rewards/margins": 7.4375,
      "rewards/rejected": -8.375,
      "step": 60
    },
    {
      "epoch": 0.6730769230769231,
      "grad_norm": 3.272238721827958,
      "learning_rate": 4.3214285714285713e-07,
      "logits/chosen": -1.640625,
      "logits/rejected": -1.5625,
      "logps/chosen": -169.0,
      "logps/rejected": -219.0,
      "loss": 0.0037,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -0.796875,
      "rewards/margins": 8.125,
      "rewards/rejected": -8.875,
      "step": 70
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.03234130751760337,
      "learning_rate": 4.142857142857143e-07,
      "logits/chosen": -1.65625,
      "logits/rejected": -1.609375,
      "logps/chosen": -151.0,
      "logps/rejected": -229.0,
      "loss": 0.0027,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -0.90625,
      "rewards/margins": 9.0,
      "rewards/rejected": -9.875,
      "step": 80
    },
    {
      "epoch": 0.8653846153846154,
      "grad_norm": 2.22694231928167,
      "learning_rate": 3.9642857142857137e-07,
      "logits/chosen": -1.640625,
      "logits/rejected": -1.65625,
      "logps/chosen": -154.0,
      "logps/rejected": -229.0,
      "loss": 0.0023,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.0390625,
      "rewards/margins": 8.75,
      "rewards/rejected": -9.8125,
      "step": 90
    },
    {
      "epoch": 0.9615384615384616,
      "grad_norm": 0.04456450837390773,
      "learning_rate": 3.785714285714285e-07,
      "logits/chosen": -1.671875,
      "logits/rejected": -1.6328125,
      "logps/chosen": -174.0,
      "logps/rejected": -232.0,
      "loss": 0.0029,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -0.87109375,
      "rewards/margins": 9.0625,
      "rewards/rejected": -9.9375,
      "step": 100
    },
    {
      "epoch": 1.0,
      "eval_logits/chosen": -1.4765625,
      "eval_logits/rejected": -1.4140625,
      "eval_logps/chosen": -119.0,
      "eval_logps/rejected": -235.0,
      "eval_loss": 0.02374856546521187,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -0.83203125,
      "eval_rewards/margins": 9.8125,
      "eval_rewards/rejected": -10.625,
      "eval_runtime": 5.2091,
      "eval_samples_per_second": 13.054,
      "eval_steps_per_second": 0.576,
      "step": 104
    },
    {
      "epoch": 1.0576923076923077,
      "grad_norm": 0.020791139211903984,
      "learning_rate": 3.607142857142857e-07,
      "logits/chosen": -1.6875,
      "logits/rejected": -1.65625,
      "logps/chosen": -148.0,
      "logps/rejected": -226.0,
      "loss": 0.0017,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -0.83203125,
      "rewards/margins": 9.25,
      "rewards/rejected": -10.0625,
      "step": 110
    },
    {
      "epoch": 1.1538461538461537,
      "grad_norm": 0.04771399447611739,
      "learning_rate": 3.4285714285714286e-07,
      "logits/chosen": -1.6484375,
      "logits/rejected": -1.5859375,
      "logps/chosen": -167.0,
      "logps/rejected": -234.0,
      "loss": 0.0006,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.3125,
      "rewards/margins": 9.5625,
      "rewards/rejected": -10.875,
      "step": 120
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.2534459160228267,
      "learning_rate": 3.25e-07,
      "logits/chosen": -1.671875,
      "logits/rejected": -1.671875,
      "logps/chosen": -165.0,
      "logps/rejected": -234.0,
      "loss": 0.001,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.3359375,
      "rewards/margins": 9.3125,
      "rewards/rejected": -10.6875,
      "step": 130
    },
    {
      "epoch": 1.3461538461538463,
      "grad_norm": 0.07091308610265967,
      "learning_rate": 3.0714285714285716e-07,
      "logits/chosen": -1.6640625,
      "logits/rejected": -1.640625,
      "logps/chosen": -179.0,
      "logps/rejected": -245.0,
      "loss": 0.0006,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.25,
      "rewards/margins": 10.125,
      "rewards/rejected": -11.375,
      "step": 140
    },
    {
      "epoch": 1.4423076923076923,
      "grad_norm": 0.01427238285225871,
      "learning_rate": 2.892857142857143e-07,
      "logits/chosen": -1.734375,
      "logits/rejected": -1.703125,
      "logps/chosen": -176.0,
      "logps/rejected": -240.0,
      "loss": 0.0007,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.25,
      "rewards/margins": 9.75,
      "rewards/rejected": -11.0625,
      "step": 150
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.06865294866224332,
      "learning_rate": 2.714285714285714e-07,
      "logits/chosen": -1.7578125,
      "logits/rejected": -1.6796875,
      "logps/chosen": -183.0,
      "logps/rejected": -241.0,
      "loss": 0.0007,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.140625,
      "rewards/margins": 10.25,
      "rewards/rejected": -11.375,
      "step": 160
    },
    {
      "epoch": 1.6346153846153846,
      "grad_norm": 0.07896118186411183,
      "learning_rate": 2.5357142857142855e-07,
      "logits/chosen": -1.625,
      "logits/rejected": -1.5234375,
      "logps/chosen": -160.0,
      "logps/rejected": -246.0,
      "loss": 0.0004,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.2109375,
      "rewards/margins": 10.5625,
      "rewards/rejected": -11.75,
      "step": 170
    },
    {
      "epoch": 1.7307692307692308,
      "grad_norm": 0.026222949044449873,
      "learning_rate": 2.357142857142857e-07,
      "logits/chosen": -1.671875,
      "logits/rejected": -1.6484375,
      "logps/chosen": -178.0,
      "logps/rejected": -245.0,
      "loss": 0.0015,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.3984375,
      "rewards/margins": 10.25,
      "rewards/rejected": -11.625,
      "step": 180
    },
    {
      "epoch": 1.8269230769230769,
      "grad_norm": 0.011793924192378881,
      "learning_rate": 2.1785714285714284e-07,
      "logits/chosen": -1.671875,
      "logits/rejected": -1.71875,
      "logps/chosen": -159.0,
      "logps/rejected": -245.0,
      "loss": 0.0006,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.25,
      "rewards/margins": 10.4375,
      "rewards/rejected": -11.6875,
      "step": 190
    },
    {
      "epoch": 1.9230769230769231,
      "grad_norm": 0.060978420362998054,
      "learning_rate": 2e-07,
      "logits/chosen": -1.703125,
      "logits/rejected": -1.703125,
      "logps/chosen": -158.0,
      "logps/rejected": -243.0,
      "loss": 0.0003,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.0859375,
      "rewards/margins": 10.5,
      "rewards/rejected": -11.5625,
      "step": 200
    },
    {
      "epoch": 2.0,
      "eval_logits/chosen": -1.5,
      "eval_logits/rejected": -1.4453125,
      "eval_logps/chosen": -121.0,
      "eval_logps/rejected": -248.0,
      "eval_loss": 0.006824689917266369,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -1.0078125,
      "eval_rewards/margins": 10.875,
      "eval_rewards/rejected": -11.875,
      "eval_runtime": 6.628,
      "eval_samples_per_second": 10.26,
      "eval_steps_per_second": 0.453,
      "step": 208
    },
    {
      "epoch": 2.019230769230769,
      "grad_norm": 0.025694364273425255,
      "learning_rate": 1.8214285714285714e-07,
      "logits/chosen": -1.640625,
      "logits/rejected": -1.640625,
      "logps/chosen": -162.0,
      "logps/rejected": -244.0,
      "loss": 0.0004,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.015625,
      "rewards/margins": 10.625,
      "rewards/rejected": -11.625,
      "step": 210
    },
    {
      "epoch": 2.1153846153846154,
      "grad_norm": 1.318722077689268,
      "learning_rate": 1.6428571428571429e-07,
      "logits/chosen": -1.71875,
      "logits/rejected": -1.6796875,
      "logps/chosen": -165.0,
      "logps/rejected": -250.0,
      "loss": 0.0005,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.03125,
      "rewards/margins": 10.8125,
      "rewards/rejected": -11.8125,
      "step": 220
    },
    {
      "epoch": 2.2115384615384617,
      "grad_norm": 0.15842660516208706,
      "learning_rate": 1.4642857142857143e-07,
      "logits/chosen": -1.734375,
      "logits/rejected": -1.7109375,
      "logps/chosen": -179.0,
      "logps/rejected": -255.0,
      "loss": 0.0004,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.5234375,
      "rewards/margins": 10.75,
      "rewards/rejected": -12.25,
      "step": 230
    },
    {
      "epoch": 2.3076923076923075,
      "grad_norm": 0.1644318912700695,
      "learning_rate": 1.2857142857142855e-07,
      "logits/chosen": -1.6953125,
      "logits/rejected": -1.734375,
      "logps/chosen": -161.0,
      "logps/rejected": -251.0,
      "loss": 0.0008,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.1484375,
      "rewards/margins": 10.5625,
      "rewards/rejected": -11.6875,
      "step": 240
    },
    {
      "epoch": 2.4038461538461537,
      "grad_norm": 0.1117902717935036,
      "learning_rate": 1.107142857142857e-07,
      "logits/chosen": -1.703125,
      "logits/rejected": -1.6875,
      "logps/chosen": -174.0,
      "logps/rejected": -249.0,
      "loss": 0.0003,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.25,
      "rewards/margins": 10.9375,
      "rewards/rejected": -12.1875,
      "step": 250
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.04593607447714544,
      "learning_rate": 9.285714285714286e-08,
      "logits/chosen": -1.6484375,
      "logits/rejected": -1.6328125,
      "logps/chosen": -166.0,
      "logps/rejected": -251.0,
      "loss": 0.0003,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.1328125,
      "rewards/margins": 11.1875,
      "rewards/rejected": -12.3125,
      "step": 260
    },
    {
      "epoch": 2.5961538461538463,
      "grad_norm": 0.08723297585467851,
      "learning_rate": 7.5e-08,
      "logits/chosen": -1.7109375,
      "logits/rejected": -1.65625,
      "logps/chosen": -175.0,
      "logps/rejected": -250.0,
      "loss": 0.0003,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.328125,
      "rewards/margins": 10.4375,
      "rewards/rejected": -11.8125,
      "step": 270
    },
    {
      "epoch": 2.6923076923076925,
      "grad_norm": 0.16735539683605713,
      "learning_rate": 5.714285714285714e-08,
      "logits/chosen": -1.71875,
      "logits/rejected": -1.6875,
      "logps/chosen": -181.0,
      "logps/rejected": -252.0,
      "loss": 0.0005,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.2734375,
      "rewards/margins": 10.9375,
      "rewards/rejected": -12.25,
      "step": 280
    },
    {
      "epoch": 2.7884615384615383,
      "grad_norm": 0.10696117602485962,
      "learning_rate": 3.9285714285714285e-08,
      "logits/chosen": -1.625,
      "logits/rejected": -1.6640625,
      "logps/chosen": -150.0,
      "logps/rejected": -248.0,
      "loss": 0.0004,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.109375,
      "rewards/margins": 11.0,
      "rewards/rejected": -12.0625,
      "step": 290
    },
    {
      "epoch": 2.8846153846153846,
      "grad_norm": 0.00967600454300797,
      "learning_rate": 2.142857142857143e-08,
      "logits/chosen": -1.671875,
      "logits/rejected": -1.671875,
      "logps/chosen": -171.0,
      "logps/rejected": -258.0,
      "loss": 0.0004,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.171875,
      "rewards/margins": 11.25,
      "rewards/rejected": -12.4375,
      "step": 300
    },
    {
      "epoch": 2.980769230769231,
      "grad_norm": 0.12453557861807049,
      "learning_rate": 3.571428571428571e-09,
      "logits/chosen": -1.6640625,
      "logits/rejected": -1.671875,
      "logps/chosen": -164.0,
      "logps/rejected": -245.0,
      "loss": 0.0006,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.1796875,
      "rewards/margins": 10.8125,
      "rewards/rejected": -12.0,
      "step": 310
    },
    {
      "epoch": 3.0,
      "eval_logits/chosen": -1.5,
      "eval_logits/rejected": -1.453125,
      "eval_logps/chosen": -121.5,
      "eval_logps/rejected": -253.0,
      "eval_loss": 0.004145369865000248,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -1.0859375,
      "eval_rewards/margins": 11.25,
      "eval_rewards/rejected": -12.3125,
      "eval_runtime": 6.3498,
      "eval_samples_per_second": 10.709,
      "eval_steps_per_second": 0.472,
      "step": 312
    },
    {
      "epoch": 3.0,
      "step": 312,
      "total_flos": 0.0,
      "train_loss": 0.051078493730762065,
      "train_runtime": 2008.9755,
      "train_samples_per_second": 4.959,
      "train_steps_per_second": 0.155
    }
  ],
  "logging_steps": 10,
  "max_steps": 312,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}