hZzy committed on
Commit
6a3238b
1 Parent(s): f534eaf

Training in progress, step 530

Browse files
README.md ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: hZzy/qwen2.5-0.5b-sft-news-IFT
4
+ tags:
5
+ - alignment-handbook
6
+ - ndcg
7
+ - trl
8
+ - expo
9
+ - generated_from_trainer
10
+ - trl
11
+ - expo
12
+ - alignment-handbook
13
+ - ndcg
14
+ - generated_from_trainer
15
+ datasets:
16
+ - hZzy/train_pairwise
17
+ model-index:
18
+ - name: qwen2.5-0.5b-expo-DPO-ES-TRY
19
+ results: []
20
+ ---
21
+
22
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
23
+ should probably proofread and complete it, then remove this comment. -->
24
+
25
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/zhiyuzha-university-of-florida/huggingface/runs/8qngt5rk)
26
+ # qwen2.5-0.5b-expo-DPO-ES-TRY
27
+
28
+ This model is a fine-tuned version of [hZzy/qwen2.5-0.5b-sft-news-IFT](https://huggingface.co/hZzy/qwen2.5-0.5b-sft-news-IFT) on the hZzy/train_pairwise dataset.
29
+ It achieves the following results on the evaluation set:
30
+ - Loss: 0.8842
31
+ - Logps: -122.3753
32
+ - Logits: -2.2914
33
+ - Objective: 0.9181
34
+ - Dpo Loss: 0.9181
35
+ - Regularize: 0.9181
36
+ - Ranking Simple: 0.5642
37
+ - Ranking Idealized: 0.6046
38
+ - Ranking Idealized Expo: 0.5280
39
+ - Dpo Wo Beta: -6.4269
40
+
41
+ ## Model description
42
+
43
+ More information needed
44
+
45
+ ## Intended uses & limitations
46
+
47
+ More information needed
48
+
49
+ ## Training and evaluation data
50
+
51
+ More information needed
52
+
53
+ ## Training procedure
54
+
55
+ ### Training hyperparameters
56
+
57
+ The following hyperparameters were used during training:
58
+ - learning_rate: 5e-06
59
+ - train_batch_size: 2
60
+ - eval_batch_size: 2
61
+ - seed: 42
62
+ - distributed_type: multi-GPU
63
+ - num_devices: 6
64
+ - gradient_accumulation_steps: 12
65
+ - total_train_batch_size: 144
66
+ - total_eval_batch_size: 12
67
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
68
+ - lr_scheduler_type: cosine
69
+ - lr_scheduler_warmup_ratio: 0.1
70
+ - num_epochs: 3
71
+
72
+ ### Training results
73
+
74
+ | Training Loss | Epoch | Step | Dpo Loss | Dpo Wo Beta | Logits | Logps | Validation Loss | Objective | Ranking Idealized | Ranking Idealized Expo | Ranking Simple | Regularize |
75
+ |:-------------:|:------:|:----:|:--------:|:-----------:|:-------:|:---------:|:---------------:|:---------:|:-----------------:|:----------------------:|:--------------:|:----------:|
76
+ | 0.5954 | 0.3004 | 53 | 0.7113 | -2.2659 | -1.8928 | -101.3674 | 0.6816 | 0.7113 | 0.5888 | 0.5093 | 0.5238 | 0.7113 |
77
+ | 0.4618 | 0.6009 | 106 | 0.6936 | -2.4624 | -1.9007 | -94.3571 | 0.6913 | 0.6936 | 0.5888 | 0.5093 | 0.5351 | 0.6936 |
78
+ | 0.3986 | 0.9013 | 159 | 0.7215 | -3.1229 | -2.1450 | -95.6001 | 0.7014 | 0.7215 | 0.5888 | 0.5093 | 0.5351 | 0.7215 |
79
+ | 0.2551 | 1.2017 | 212 | 0.7525 | -3.7750 | -2.2678 | -98.1427 | 0.7351 | 0.7525 | 0.5888 | 0.5093 | 0.5372 | 0.7525 |
80
+ | 0.2623 | 1.5021 | 265 | 0.7739 | -4.1634 | -2.1478 | -100.8313 | 0.7400 | 0.7739 | 0.5888 | 0.5093 | 0.5393 | 0.7739 |
81
+ | 0.2571 | 1.8026 | 318 | 0.7665 | -4.0950 | -1.9888 | -102.3712 | 0.7401 | 0.7665 | 0.5888 | 0.5093 | 0.5393 | 0.7665 |
82
+ | 0.1227 | 2.1030 | 371 | 0.9224 | -6.4510 | -1.8645 | -122.0016 | 0.8844 | 0.9224 | 0.5888 | 0.5093 | 0.5424 | 0.9224 |
83
+ | 0.1330 | 2.4034 | 424 | 0.8786 | -5.8878 | -2.0277 | -117.1217 | 0.8448 | 0.8786 | 0.5888 | 0.5093 | 0.5413 | 0.8786 |
84
+ | 0.1211 | 2.7085 | 477 | 0.8739 | -5.8152 | -2.0272 | -116.4230 | 0.8371 | 0.8739 | 0.5888 | 0.5093 | 0.5403 | 0.8739 |
85
+
86
+
87
+ ### Framework versions
88
+
89
+ - Transformers 4.42.0
90
+ - Pytorch 2.3.0+cu121
91
+ - Datasets 2.19.1
92
+ - Tokenizers 0.19.1
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
all_results.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9976381672177608,
3
+ "eval_dpo_loss": 0.918109118938446,
4
+ "eval_dpo_wo_beta": -6.42689847946167,
5
+ "eval_logits": -2.2913548946380615,
6
+ "eval_logps": -122.37532806396484,
7
+ "eval_loss": 0.8842386603355408,
8
+ "eval_objective": 0.918109118938446,
9
+ "eval_ranking_idealized": 0.6045548915863037,
10
+ "eval_ranking_idealized_expo": 0.5279502868652344,
11
+ "eval_ranking_simple": 0.5641822218894958,
12
+ "eval_regularize": 0.918109118938446,
13
+ "eval_runtime": 319.8808,
14
+ "eval_samples": 5790,
15
+ "eval_samples_per_second": 18.1,
16
+ "eval_steps_per_second": 1.51,
17
+ "total_flos": 0.0,
18
+ "train_loss": 0.0,
19
+ "train_runtime": 9.3184,
20
+ "train_samples": 50802,
21
+ "train_samples_per_second": 16355.433,
22
+ "train_steps_per_second": 56.662
23
+ }
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "hZzy/qwen2.5-0.5b-sft-news-IFT",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151644,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 896,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 4864,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 24,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 14,
17
+ "num_hidden_layers": 24,
18
+ "num_key_value_heads": 2,
19
+ "pad_token_id": 151645,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": 32768,
23
+ "tie_word_embeddings": true,
24
+ "torch_dtype": "float32",
25
+ "transformers_version": "4.42.0",
26
+ "use_cache": false,
27
+ "use_mrope": false,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151665
30
+ }
eval_results.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9976381672177608,
3
+ "eval_dpo_loss": 0.918109118938446,
4
+ "eval_dpo_wo_beta": -6.42689847946167,
5
+ "eval_logits": -2.2913548946380615,
6
+ "eval_logps": -122.37532806396484,
7
+ "eval_loss": 0.8842386603355408,
8
+ "eval_objective": 0.918109118938446,
9
+ "eval_ranking_idealized": 0.6045548915863037,
10
+ "eval_ranking_idealized_expo": 0.5279502868652344,
11
+ "eval_ranking_simple": 0.5641822218894958,
12
+ "eval_regularize": 0.918109118938446,
13
+ "eval_runtime": 319.8808,
14
+ "eval_samples": 5790,
15
+ "eval_samples_per_second": 18.1,
16
+ "eval_steps_per_second": 1.51
17
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151644,
3
+ "eos_token_id": 151645,
4
+ "max_new_tokens": 2048,
5
+ "pad_token_id": 151645,
6
+ "transformers_version": "4.42.0"
7
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54861397bb33df54aeefb71f8632780795a463771984efd19be223d9f17578d4
3
+ size 1975192208
special_tokens_map.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "bos_token": {
7
+ "content": "<|im_start|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "eos_token": {
14
+ "content": "<|im_end|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ }
27
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>"
185
+ ],
186
+ "bos_token": "<|im_start|>",
187
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
188
+ "clean_up_tokenization_spaces": false,
189
+ "eos_token": "<|im_end|>",
190
+ "errors": "replace",
191
+ "model_max_length": 2048,
192
+ "pad_token": "<|im_end|>",
193
+ "split_special_tokens": false,
194
+ "tokenizer_class": "Qwen2Tokenizer",
195
+ "unk_token": null
196
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9976381672177608,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.0,
5
+ "train_runtime": 9.3184,
6
+ "train_samples": 50802,
7
+ "train_samples_per_second": 16355.433,
8
+ "train_steps_per_second": 56.662
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1900 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5423553586006165,
3
+ "best_model_checkpoint": "./qwen2.5-0.5b/qwen2.5-0.5b-expo-DPO-ES-TRY/checkpoint-371",
4
+ "epoch": 2.9976381672177608,
5
+ "eval_steps": 53,
6
+ "global_step": 528,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "dpo_loss": 0.6931471824645996,
13
+ "dpo_wo_beta": -0.6931471824645996,
14
+ "epoch": 0.005668398677373642,
15
+ "grad_norm": 13.433600669124935,
16
+ "learning_rate": 9.433962264150944e-08,
17
+ "logits": -1.3874311447143555,
18
+ "logps": -88.43561553955078,
19
+ "loss": 0.6931,
20
+ "objective": 0.6931471824645996,
21
+ "ranking_idealized": 0.625,
22
+ "ranking_idealized_expo": 0.5208333134651184,
23
+ "ranking_simple": 0.5208333134651184,
24
+ "regularize": 0.6931471824645996,
25
+ "step": 1
26
+ },
27
+ {
28
+ "dpo_loss": 0.693236768245697,
29
+ "dpo_wo_beta": -0.6993356347084045,
30
+ "epoch": 0.02834199338686821,
31
+ "grad_norm": 13.640653628388394,
32
+ "learning_rate": 4.716981132075472e-07,
33
+ "logits": -1.4090652465820312,
34
+ "logps": -84.34337615966797,
35
+ "loss": 0.693,
36
+ "objective": 0.693236768245697,
37
+ "ranking_idealized": 0.6197916865348816,
38
+ "ranking_idealized_expo": 0.546875,
39
+ "ranking_simple": 0.546875,
40
+ "regularize": 0.693236768245697,
41
+ "step": 5
42
+ },
43
+ {
44
+ "dpo_loss": 0.6845630407333374,
45
+ "dpo_wo_beta": -0.7111619710922241,
46
+ "epoch": 0.05668398677373642,
47
+ "grad_norm": 12.626074407134174,
48
+ "learning_rate": 9.433962264150944e-07,
49
+ "logits": -1.4784893989562988,
50
+ "logps": -81.94055938720703,
51
+ "loss": 0.6892,
52
+ "objective": 0.6845630407333374,
53
+ "ranking_idealized": 0.6000000238418579,
54
+ "ranking_idealized_expo": 0.5083333253860474,
55
+ "ranking_simple": 0.512499988079071,
56
+ "regularize": 0.6845630407333374,
57
+ "step": 10
58
+ },
59
+ {
60
+ "dpo_loss": 0.6825469136238098,
61
+ "dpo_wo_beta": -0.8259204626083374,
62
+ "epoch": 0.08502598016060463,
63
+ "grad_norm": 12.374180595083178,
64
+ "learning_rate": 1.4150943396226415e-06,
65
+ "logits": -1.4932299852371216,
66
+ "logps": -81.52880096435547,
67
+ "loss": 0.6814,
68
+ "objective": 0.6825469136238098,
69
+ "ranking_idealized": 0.5916666388511658,
70
+ "ranking_idealized_expo": 0.5041666626930237,
71
+ "ranking_simple": 0.512499988079071,
72
+ "regularize": 0.6825469136238098,
73
+ "step": 15
74
+ },
75
+ {
76
+ "dpo_loss": 0.6950914263725281,
77
+ "dpo_wo_beta": -1.2390469312667847,
78
+ "epoch": 0.11336797354747284,
79
+ "grad_norm": 14.839934392200913,
80
+ "learning_rate": 1.8867924528301889e-06,
81
+ "logits": -1.5371100902557373,
82
+ "logps": -82.72624969482422,
83
+ "loss": 0.6711,
84
+ "objective": 0.6950914263725281,
85
+ "ranking_idealized": 0.625,
86
+ "ranking_idealized_expo": 0.5375000238418579,
87
+ "ranking_simple": 0.5249999761581421,
88
+ "regularize": 0.6950914263725281,
89
+ "step": 20
90
+ },
91
+ {
92
+ "dpo_loss": 0.6556071043014526,
93
+ "dpo_wo_beta": -1.110619068145752,
94
+ "epoch": 0.14170996693434104,
95
+ "grad_norm": 12.89805052529156,
96
+ "learning_rate": 2.358490566037736e-06,
97
+ "logits": -1.6399922370910645,
98
+ "logps": -81.59695434570312,
99
+ "loss": 0.6589,
100
+ "objective": 0.6556071043014526,
101
+ "ranking_idealized": 0.5958333611488342,
102
+ "ranking_idealized_expo": 0.4958333373069763,
103
+ "ranking_simple": 0.5249999761581421,
104
+ "regularize": 0.6556071043014526,
105
+ "step": 25
106
+ },
107
+ {
108
+ "dpo_loss": 0.6518108248710632,
109
+ "dpo_wo_beta": -1.2506839036941528,
110
+ "epoch": 0.17005196032120926,
111
+ "grad_norm": 12.64998937636519,
112
+ "learning_rate": 2.830188679245283e-06,
113
+ "logits": -1.6404598951339722,
114
+ "logps": -83.20111846923828,
115
+ "loss": 0.6451,
116
+ "objective": 0.6518108248710632,
117
+ "ranking_idealized": 0.6041666865348816,
118
+ "ranking_idealized_expo": 0.5291666388511658,
119
+ "ranking_simple": 0.5666666626930237,
120
+ "regularize": 0.6518108248710632,
121
+ "step": 30
122
+ },
123
+ {
124
+ "dpo_loss": 0.6226770877838135,
125
+ "dpo_wo_beta": -1.394917368888855,
126
+ "epoch": 0.19839395370807747,
127
+ "grad_norm": 13.760162421635227,
128
+ "learning_rate": 3.30188679245283e-06,
129
+ "logits": -1.6237396001815796,
130
+ "logps": -87.80964660644531,
131
+ "loss": 0.6189,
132
+ "objective": 0.6226770877838135,
133
+ "ranking_idealized": 0.612500011920929,
134
+ "ranking_idealized_expo": 0.5041666626930237,
135
+ "ranking_simple": 0.5583333373069763,
136
+ "regularize": 0.6226770877838135,
137
+ "step": 35
138
+ },
139
+ {
140
+ "dpo_loss": 0.5924390554428101,
141
+ "dpo_wo_beta": -1.422450304031372,
142
+ "epoch": 0.22673594709494568,
143
+ "grad_norm": 16.810886476613117,
144
+ "learning_rate": 3.7735849056603777e-06,
145
+ "logits": -1.620682954788208,
146
+ "logps": -91.93690490722656,
147
+ "loss": 0.6076,
148
+ "objective": 0.5924390554428101,
149
+ "ranking_idealized": 0.6000000238418579,
150
+ "ranking_idealized_expo": 0.5208333134651184,
151
+ "ranking_simple": 0.5708333253860474,
152
+ "regularize": 0.5924390554428101,
153
+ "step": 40
154
+ },
155
+ {
156
+ "dpo_loss": 0.573756217956543,
157
+ "dpo_wo_beta": -1.3691534996032715,
158
+ "epoch": 0.25507794048181387,
159
+ "grad_norm": 13.798774501924722,
160
+ "learning_rate": 4.245283018867925e-06,
161
+ "logits": -1.7814558744430542,
162
+ "logps": -92.24474334716797,
163
+ "loss": 0.5989,
164
+ "objective": 0.573756217956543,
165
+ "ranking_idealized": 0.6208333373069763,
166
+ "ranking_idealized_expo": 0.5375000238418579,
167
+ "ranking_simple": 0.5666666626930237,
168
+ "regularize": 0.573756217956543,
169
+ "step": 45
170
+ },
171
+ {
172
+ "dpo_loss": 0.5726417899131775,
173
+ "dpo_wo_beta": -1.3605374097824097,
174
+ "epoch": 0.2834199338686821,
175
+ "grad_norm": 12.568473894025988,
176
+ "learning_rate": 4.716981132075472e-06,
177
+ "logits": -1.808895468711853,
178
+ "logps": -90.65751647949219,
179
+ "loss": 0.5954,
180
+ "objective": 0.5726417899131775,
181
+ "ranking_idealized": 0.5416666865348816,
182
+ "ranking_idealized_expo": 0.42500001192092896,
183
+ "ranking_simple": 0.574999988079071,
184
+ "regularize": 0.5726417899131775,
185
+ "step": 50
186
+ },
187
+ {
188
+ "epoch": 0.300425129900803,
189
+ "eval_dpo_loss": 0.7112604975700378,
190
+ "eval_dpo_wo_beta": -2.2659413814544678,
191
+ "eval_logits": -1.892814040184021,
192
+ "eval_logps": -101.36742401123047,
193
+ "eval_loss": 0.6816489100456238,
194
+ "eval_objective": 0.7112604975700378,
195
+ "eval_ranking_idealized": 0.5888429880142212,
196
+ "eval_ranking_idealized_expo": 0.5092975497245789,
197
+ "eval_ranking_simple": 0.5237603187561035,
198
+ "eval_regularize": 0.7112604975700378,
199
+ "eval_runtime": 211.6587,
200
+ "eval_samples_per_second": 27.355,
201
+ "eval_steps_per_second": 1.143,
202
+ "step": 53
203
+ },
204
+ {
205
+ "dpo_loss": 0.5827316045761108,
206
+ "dpo_wo_beta": -1.6213361024856567,
207
+ "epoch": 0.3117619272555503,
208
+ "grad_norm": 14.442715913160086,
209
+ "learning_rate": 4.999781286194085e-06,
210
+ "logits": -1.8762638568878174,
211
+ "logps": -93.41423797607422,
212
+ "loss": 0.5721,
213
+ "objective": 0.5827316045761108,
214
+ "ranking_idealized": 0.625,
215
+ "ranking_idealized_expo": 0.5416666865348816,
216
+ "ranking_simple": 0.6041666865348816,
217
+ "regularize": 0.5827316045761108,
218
+ "step": 55
219
+ },
220
+ {
221
+ "dpo_loss": 0.5553872585296631,
222
+ "dpo_wo_beta": -1.6468366384506226,
223
+ "epoch": 0.3401039206424185,
224
+ "grad_norm": 13.845514282811145,
225
+ "learning_rate": 4.997321195347154e-06,
226
+ "logits": -1.8914529085159302,
227
+ "logps": -90.59642028808594,
228
+ "loss": 0.5756,
229
+ "objective": 0.5553872585296631,
230
+ "ranking_idealized": 0.6333333253860474,
231
+ "ranking_idealized_expo": 0.5083333253860474,
232
+ "ranking_simple": 0.5791666507720947,
233
+ "regularize": 0.5553872585296631,
234
+ "step": 60
235
+ },
236
+ {
237
+ "dpo_loss": 0.5302771329879761,
238
+ "dpo_wo_beta": -1.3166770935058594,
239
+ "epoch": 0.3684459140292867,
240
+ "grad_norm": 10.846857687148022,
241
+ "learning_rate": 4.992130320438411e-06,
242
+ "logits": -1.8399535417556763,
243
+ "logps": -86.60197448730469,
244
+ "loss": 0.5586,
245
+ "objective": 0.5302771329879761,
246
+ "ranking_idealized": 0.5708333253860474,
247
+ "ranking_idealized_expo": 0.5083333253860474,
248
+ "ranking_simple": 0.6208333373069763,
249
+ "regularize": 0.5302771329879761,
250
+ "step": 65
251
+ },
252
+ {
253
+ "dpo_loss": 0.5711485743522644,
254
+ "dpo_wo_beta": -1.7437169551849365,
255
+ "epoch": 0.39678790741615494,
256
+ "grad_norm": 13.787840238803502,
257
+ "learning_rate": 4.984214337613357e-06,
258
+ "logits": -1.8178967237472534,
259
+ "logps": -91.10688781738281,
260
+ "loss": 0.5701,
261
+ "objective": 0.5711485743522644,
262
+ "ranking_idealized": 0.6000000238418579,
263
+ "ranking_idealized_expo": 0.5166666507720947,
264
+ "ranking_simple": 0.5874999761581421,
265
+ "regularize": 0.5711485743522644,
266
+ "step": 70
267
+ },
268
+ {
269
+ "dpo_loss": 0.523643434047699,
270
+ "dpo_wo_beta": -1.669514536857605,
271
+ "epoch": 0.42512990080302315,
272
+ "grad_norm": 13.192298437287352,
273
+ "learning_rate": 4.97358190288299e-06,
274
+ "logits": -1.8182169198989868,
275
+ "logps": -94.8000717163086,
276
+ "loss": 0.5205,
277
+ "objective": 0.523643434047699,
278
+ "ranking_idealized": 0.625,
279
+ "ranking_idealized_expo": 0.5458333492279053,
280
+ "ranking_simple": 0.6583333611488342,
281
+ "regularize": 0.523643434047699,
282
+ "step": 75
283
+ },
284
+ {
285
+ "dpo_loss": 0.51079261302948,
286
+ "dpo_wo_beta": -1.7271808385849,
287
+ "epoch": 0.45347189418989137,
288
+ "grad_norm": 15.151373786996814,
289
+ "learning_rate": 4.9602446426585845e-06,
290
+ "logits": -1.8920824527740479,
291
+ "logps": -93.58238220214844,
292
+ "loss": 0.5285,
293
+ "objective": 0.51079261302948,
294
+ "ranking_idealized": 0.5833333134651184,
295
+ "ranking_idealized_expo": 0.5166666507720947,
296
+ "ranking_simple": 0.6458333134651184,
297
+ "regularize": 0.51079261302948,
298
+ "step": 80
299
+ },
300
+ {
301
+ "dpo_loss": 0.5066012144088745,
302
+ "dpo_wo_beta": -1.5956443548202515,
303
+ "epoch": 0.4818138875767596,
304
+ "grad_norm": 12.328960275584794,
305
+ "learning_rate": 4.944217141038379e-06,
306
+ "logits": -1.8741406202316284,
307
+ "logps": -87.06742858886719,
308
+ "loss": 0.5202,
309
+ "objective": 0.5066012144088745,
310
+ "ranking_idealized": 0.6000000238418579,
311
+ "ranking_idealized_expo": 0.5249999761581421,
312
+ "ranking_simple": 0.6208333373069763,
313
+ "regularize": 0.5066012144088745,
314
+ "step": 85
315
+ },
316
+ {
317
+ "dpo_loss": 0.5358369946479797,
318
+ "dpo_wo_beta": -1.9357556104660034,
319
+ "epoch": 0.5101558809636277,
320
+ "grad_norm": 12.694483590051824,
321
+ "learning_rate": 4.925516923860083e-06,
322
+ "logits": -1.7968534231185913,
323
+ "logps": -86.77802276611328,
324
+ "loss": 0.4858,
325
+ "objective": 0.5358369946479797,
326
+ "ranking_idealized": 0.5458333492279053,
327
+ "ranking_idealized_expo": 0.4749999940395355,
328
+ "ranking_simple": 0.5874999761581421,
329
+ "regularize": 0.5358369946479797,
330
+ "step": 90
331
+ },
332
+ {
333
+ "dpo_loss": 0.4783257842063904,
334
+ "dpo_wo_beta": -1.9098786115646362,
335
+ "epoch": 0.538497874350496,
336
+ "grad_norm": 14.474706973531484,
337
+ "learning_rate": 4.904164439536626e-06,
338
+ "logits": -1.8568389415740967,
339
+ "logps": -88.12813568115234,
340
+ "loss": 0.4865,
341
+ "objective": 0.4783257842063904,
342
+ "ranking_idealized": 0.6291666626930237,
343
+ "ranking_idealized_expo": 0.5458333492279053,
344
+ "ranking_simple": 0.6916666626930237,
345
+ "regularize": 0.4783257842063904,
346
+ "step": 95
347
+ },
348
+ {
349
+ "dpo_loss": 0.4654810130596161,
350
+ "dpo_wo_beta": -1.9254087209701538,
351
+ "epoch": 0.5668398677373642,
352
+ "grad_norm": 13.577084707122001,
353
+ "learning_rate": 4.880183036696123e-06,
354
+ "logits": -1.938937783241272,
355
+ "logps": -92.29436492919922,
356
+ "loss": 0.5016,
357
+ "objective": 0.4654810130596161,
358
+ "ranking_idealized": 0.612500011920929,
359
+ "ranking_idealized_expo": 0.5416666865348816,
360
+ "ranking_simple": 0.6875,
361
+ "regularize": 0.4654810130596161,
362
+ "step": 100
363
+ },
364
+ {
365
+ "dpo_loss": 0.4374677240848541,
366
+ "dpo_wo_beta": -1.4267934560775757,
367
+ "epoch": 0.5951818611242324,
368
+ "grad_norm": 11.14545328639218,
369
+ "learning_rate": 4.853598938650487e-06,
370
+ "logits": -1.8158982992172241,
371
+ "logps": -90.21449279785156,
372
+ "loss": 0.4618,
373
+ "objective": 0.4374677240848541,
374
+ "ranking_idealized": 0.625,
375
+ "ranking_idealized_expo": 0.5333333611488342,
376
+ "ranking_simple": 0.6666666865348816,
377
+ "regularize": 0.4374677240848541,
378
+ "step": 105
379
+ },
380
+ {
381
+ "epoch": 0.600850259801606,
382
+ "eval_dpo_loss": 0.6936022639274597,
383
+ "eval_dpo_wo_beta": -2.462427854537964,
384
+ "eval_logits": -1.9007418155670166,
385
+ "eval_logps": -94.35714721679688,
386
+ "eval_loss": 0.6912521123886108,
387
+ "eval_objective": 0.6936022639274597,
388
+ "eval_ranking_idealized": 0.5888429880142212,
389
+ "eval_ranking_idealized_expo": 0.5092975497245789,
390
+ "eval_ranking_simple": 0.5351239442825317,
391
+ "eval_regularize": 0.6936022639274597,
392
+ "eval_runtime": 210.2297,
393
+ "eval_samples_per_second": 27.541,
394
+ "eval_steps_per_second": 1.151,
395
+ "step": 106
396
+ },
397
+ {
398
+ "dpo_loss": 0.47933149337768555,
399
+ "dpo_wo_beta": -1.9683055877685547,
400
+ "epoch": 0.6235238545111006,
401
+ "grad_norm": 12.39392340166307,
402
+ "learning_rate": 4.824441214720629e-06,
403
+ "logits": -1.9334439039230347,
404
+ "logps": -87.35523223876953,
405
+ "loss": 0.4633,
406
+ "objective": 0.47933149337768555,
407
+ "ranking_idealized": 0.5708333253860474,
408
+ "ranking_idealized_expo": 0.4541666805744171,
409
+ "ranking_simple": 0.6625000238418579,
410
+ "regularize": 0.47933149337768555,
411
+ "step": 110
412
+ },
413
+ {
414
+ "dpo_loss": 0.4749464690685272,
415
+ "dpo_wo_beta": -1.7375919818878174,
416
+ "epoch": 0.6518658478979689,
417
+ "grad_norm": 12.612865651893962,
418
+ "learning_rate": 4.7927417484495756e-06,
419
+ "logits": -1.9057692289352417,
420
+ "logps": -87.68991088867188,
421
+ "loss": 0.4712,
422
+ "objective": 0.4749464690685272,
423
+ "ranking_idealized": 0.574999988079071,
424
+ "ranking_idealized_expo": 0.4833333194255829,
425
+ "ranking_simple": 0.6333333253860474,
426
+ "regularize": 0.4749464690685272,
427
+ "step": 115
428
+ },
429
+ {
430
+ "dpo_loss": 0.4848935306072235,
431
+ "dpo_wo_beta": -1.9273093938827515,
432
+ "epoch": 0.680207841284837,
433
+ "grad_norm": 13.836239066838136,
434
+ "learning_rate": 4.758535202738287e-06,
435
+ "logits": -1.8775906562805176,
436
+ "logps": -87.8878173828125,
437
+ "loss": 0.4641,
438
+ "objective": 0.4848935306072235,
439
+ "ranking_idealized": 0.5874999761581421,
440
+ "ranking_idealized_expo": 0.5291666388511658,
441
+ "ranking_simple": 0.6625000238418579,
442
+ "regularize": 0.4848935306072235,
443
+ "step": 120
444
+ },
445
+ {
446
+ "dpo_loss": 0.4785127639770508,
447
+ "dpo_wo_beta": -1.814666748046875,
448
+ "epoch": 0.7085498346717053,
449
+ "grad_norm": 12.105170057238437,
450
+ "learning_rate": 4.721858981942284e-06,
451
+ "logits": -1.8346068859100342,
452
+ "logps": -86.40522766113281,
453
+ "loss": 0.4801,
454
+ "objective": 0.4785127639770508,
455
+ "ranking_idealized": 0.625,
456
+ "ranking_idealized_expo": 0.550000011920929,
457
+ "ranking_simple": 0.6875,
458
+ "regularize": 0.4785127639770508,
459
+ "step": 125
460
+ },
461
+ {
462
+ "dpo_loss": 0.4548089802265167,
463
+ "dpo_wo_beta": -1.4164987802505493,
464
+ "epoch": 0.7368918280585735,
465
+ "grad_norm": 11.895980627109102,
466
+ "learning_rate": 4.682753190970533e-06,
467
+ "logits": -1.9488608837127686,
468
+ "logps": -79.42195129394531,
469
+ "loss": 0.4538,
470
+ "objective": 0.4548089802265167,
471
+ "ranking_idealized": 0.5416666865348816,
472
+ "ranking_idealized_expo": 0.44999998807907104,
473
+ "ranking_simple": 0.6291666626930237,
474
+ "regularize": 0.4548089802265167,
475
+ "step": 130
476
+ },
477
+ {
478
+ "dpo_loss": 0.49760884046554565,
479
+ "dpo_wo_beta": -1.994195818901062,
480
+ "epoch": 0.7652338214454416,
481
+ "grad_norm": 12.298776298341995,
482
+ "learning_rate": 4.641260591431315e-06,
483
+ "logits": -1.9813282489776611,
484
+ "logps": -82.40634155273438,
485
+ "loss": 0.4433,
486
+ "objective": 0.49760884046554565,
487
+ "ranking_idealized": 0.5874999761581421,
488
+ "ranking_idealized_expo": 0.5249999761581421,
489
+ "ranking_simple": 0.6583333611488342,
490
+ "regularize": 0.49760884046554565,
491
+ "step": 135
492
+ },
493
+ {
494
+ "dpo_loss": 0.41459351778030396,
495
+ "dpo_wo_beta": -1.187635064125061,
496
+ "epoch": 0.7935758148323099,
497
+ "grad_norm": 12.618720178096575,
498
+ "learning_rate": 4.597426554873037e-06,
499
+ "logits": -1.97609281539917,
500
+ "logps": -83.44467163085938,
501
+ "loss": 0.4236,
502
+ "objective": 0.41459351778030396,
503
+ "ranking_idealized": 0.5791666507720947,
504
+ "ranking_idealized_expo": 0.5041666626930237,
505
+ "ranking_simple": 0.6791666746139526,
506
+ "regularize": 0.41459351778030396,
507
+ "step": 140
508
+ },
509
+ {
510
+ "dpo_loss": 0.4073801636695862,
511
+ "dpo_wo_beta": -1.311059832572937,
512
+ "epoch": 0.821917808219178,
513
+ "grad_norm": 14.417917904409194,
514
+ "learning_rate": 4.551299013171111e-06,
515
+ "logits": -2.0718839168548584,
516
+ "logps": -84.2674560546875,
517
+ "loss": 0.4215,
518
+ "objective": 0.4073801636695862,
519
+ "ranking_idealized": 0.5791666507720947,
520
+ "ranking_idealized_expo": 0.4833333194255829,
521
+ "ranking_simple": 0.6916666626930237,
522
+ "regularize": 0.4073801636695862,
523
+ "step": 145
524
+ },
525
+ {
526
+ "dpo_loss": 0.4207518398761749,
527
+ "dpo_wo_beta": -1.50857675075531,
528
+ "epoch": 0.8502598016060463,
529
+ "grad_norm": 11.543599868064442,
530
+ "learning_rate": 4.502928406115152e-06,
531
+ "logits": -2.0730583667755127,
532
+ "logps": -82.68958282470703,
533
+ "loss": 0.4276,
534
+ "objective": 0.4207518398761749,
535
+ "ranking_idealized": 0.6166666746139526,
536
+ "ranking_idealized_expo": 0.5416666865348816,
537
+ "ranking_simple": 0.7208333611488342,
538
+ "regularize": 0.4207518398761749,
539
+ "step": 150
540
+ },
541
+ {
542
+ "dpo_loss": 0.3847941756248474,
543
+ "dpo_wo_beta": -1.4449684619903564,
544
+ "epoch": 0.8786017949929145,
545
+ "grad_norm": 12.08771803065001,
546
+ "learning_rate": 4.452367626253805e-06,
547
+ "logits": -2.0991933345794678,
548
+ "logps": -85.211181640625,
549
+ "loss": 0.3986,
550
+ "objective": 0.3847941756248474,
551
+ "ranking_idealized": 0.612500011920929,
552
+ "ranking_idealized_expo": 0.5083333253860474,
553
+ "ranking_simple": 0.7250000238418579,
554
+ "regularize": 0.3847941756248474,
555
+ "step": 155
556
+ },
557
+ {
558
+ "epoch": 0.9012753897024091,
559
+ "eval_dpo_loss": 0.7214789390563965,
560
+ "eval_dpo_wo_beta": -3.1229145526885986,
561
+ "eval_logits": -2.1450352668762207,
562
+ "eval_logps": -95.60012817382812,
563
+ "eval_loss": 0.7013870477676392,
564
+ "eval_objective": 0.7214789390563965,
565
+ "eval_ranking_idealized": 0.5888429880142212,
566
+ "eval_ranking_idealized_expo": 0.5092975497245789,
567
+ "eval_ranking_simple": 0.5351239442825317,
568
+ "eval_regularize": 0.7214789390563965,
569
+ "eval_runtime": 210.3593,
570
+ "eval_samples_per_second": 27.524,
571
+ "eval_steps_per_second": 1.15,
572
+ "step": 159
573
+ },
574
+ {
575
+ "dpo_loss": 0.4162478744983673,
576
+ "dpo_wo_beta": -1.6461573839187622,
577
+ "epoch": 0.9069437883797827,
578
+ "grad_norm": 12.82345397067452,
579
+ "learning_rate": 4.399671961057523e-06,
580
+ "logits": -2.0759384632110596,
581
+ "logps": -89.25846862792969,
582
+ "loss": 0.4236,
583
+ "objective": 0.4162478744983673,
584
+ "ranking_idealized": 0.5708333253860474,
585
+ "ranking_idealized_expo": 0.49166667461395264,
586
+ "ranking_simple": 0.699999988079071,
587
+ "regularize": 0.4162478744983673,
588
+ "step": 160
589
+ },
590
+ {
591
+ "dpo_loss": 0.41358453035354614,
592
+ "dpo_wo_beta": -1.648630976676941,
593
+ "epoch": 0.9352857817666509,
594
+ "grad_norm": 12.860537676624453,
595
+ "learning_rate": 4.3448990324625244e-06,
596
+ "logits": -2.024477481842041,
597
+ "logps": -88.03329467773438,
598
+ "loss": 0.4026,
599
+ "objective": 0.41358453035354614,
600
+ "ranking_idealized": 0.6041666865348816,
601
+ "ranking_idealized_expo": 0.5208333134651184,
602
+ "ranking_simple": 0.7333333492279053,
603
+ "regularize": 0.41358453035354614,
604
+ "step": 165
605
+ },
606
+ {
607
+ "dpo_loss": 0.378000408411026,
608
+ "dpo_wo_beta": -1.2966532707214355,
609
+ "epoch": 0.9636277751535192,
610
+ "grad_norm": 11.533711130228069,
611
+ "learning_rate": 4.288108733862064e-06,
612
+ "logits": -2.042527437210083,
613
+ "logps": -90.26854705810547,
614
+ "loss": 0.3925,
615
+ "objective": 0.378000408411026,
616
+ "ranking_idealized": 0.5791666507720947,
617
+ "ranking_idealized_expo": 0.512499988079071,
618
+ "ranking_simple": 0.7166666388511658,
619
+ "regularize": 0.378000408411026,
620
+ "step": 170
621
+ },
622
+ {
623
+ "dpo_loss": 0.3764660954475403,
624
+ "dpo_wo_beta": -1.3978971242904663,
625
+ "epoch": 0.9919697685403873,
626
+ "grad_norm": 12.165192869157089,
627
+ "learning_rate": 4.229363164613874e-06,
628
+ "logits": -2.0610477924346924,
629
+ "logps": -89.8354721069336,
630
+ "loss": 0.3793,
631
+ "objective": 0.3764660954475403,
632
+ "ranking_idealized": 0.6541666388511658,
633
+ "ranking_idealized_expo": 0.574999988079071,
634
+ "ranking_simple": 0.7916666865348816,
635
+ "regularize": 0.3764660954475403,
636
+ "step": 175
637
+ },
638
+ {
639
+ "dpo_loss": 0.27626773715019226,
640
+ "dpo_wo_beta": -0.8504549860954285,
641
+ "epoch": 1.0203117619272555,
642
+ "grad_norm": 10.141692447282386,
643
+ "learning_rate": 4.168726562135432e-06,
644
+ "logits": -2.2514243125915527,
645
+ "logps": -90.8476333618164,
646
+ "loss": 0.2852,
647
+ "objective": 0.27626773715019226,
648
+ "ranking_idealized": 0.6166666746139526,
649
+ "ranking_idealized_expo": 0.5416666865348816,
650
+ "ranking_simple": 0.8083333373069763,
651
+ "regularize": 0.27626773715019226,
652
+ "step": 180
653
+ },
654
+ {
655
+ "dpo_loss": 0.23696589469909668,
656
+ "dpo_wo_beta": -0.6947117447853088,
657
+ "epoch": 1.0486537553141237,
658
+ "grad_norm": 13.78702272812957,
659
+ "learning_rate": 4.106265231661292e-06,
660
+ "logits": -2.158977746963501,
661
+ "logps": -95.00120544433594,
662
+ "loss": 0.2429,
663
+ "objective": 0.23696589469909668,
664
+ "ranking_idealized": 0.5833333134651184,
665
+ "ranking_idealized_expo": 0.49166667461395264,
666
+ "ranking_simple": 0.8083333373069763,
667
+ "regularize": 0.23696589469909668,
668
+ "step": 185
669
+ },
670
+ {
671
+ "dpo_loss": 0.26388806104660034,
672
+ "dpo_wo_beta": -0.9112051725387573,
673
+ "epoch": 1.076995748700992,
674
+ "grad_norm": 14.740228375586371,
675
+ "learning_rate": 4.042047473739278e-06,
676
+ "logits": -2.1533920764923096,
677
+ "logps": -101.71949768066406,
678
+ "loss": 0.2517,
679
+ "objective": 0.26388806104660034,
680
+ "ranking_idealized": 0.625,
681
+ "ranking_idealized_expo": 0.5541666746139526,
682
+ "ranking_simple": 0.8416666388511658,
683
+ "regularize": 0.26388806104660034,
684
+ "step": 190
685
+ },
686
+ {
687
+ "dpo_loss": 0.2244579941034317,
688
+ "dpo_wo_beta": -0.6430780291557312,
689
+ "epoch": 1.10533774208786,
690
+ "grad_norm": 10.169064121599527,
691
+ "learning_rate": 3.976143509544843e-06,
692
+ "logits": -2.1589295864105225,
693
+ "logps": -96.5248031616211,
694
+ "loss": 0.2467,
695
+ "objective": 0.2244579941034317,
696
+ "ranking_idealized": 0.6333333253860474,
697
+ "ranking_idealized_expo": 0.5666666626930237,
698
+ "ranking_simple": 0.8083333373069763,
699
+ "regularize": 0.2244579941034317,
700
+ "step": 195
701
+ },
702
+ {
703
+ "dpo_loss": 0.24179764091968536,
704
+ "dpo_wo_beta": -0.6332272291183472,
705
+ "epoch": 1.1336797354747283,
706
+ "grad_norm": 9.444774343787891,
707
+ "learning_rate": 3.908625404095242e-06,
708
+ "logits": -2.2753493785858154,
709
+ "logps": -91.93312072753906,
710
+ "loss": 0.2563,
711
+ "objective": 0.24179764091968536,
712
+ "ranking_idealized": 0.6000000238418579,
713
+ "ranking_idealized_expo": 0.5083333253860474,
714
+ "ranking_simple": 0.8458333611488342,
715
+ "regularize": 0.24179764091968536,
716
+ "step": 200
717
+ },
718
+ {
719
+ "dpo_loss": 0.25683078169822693,
720
+ "dpo_wo_beta": -0.8531176447868347,
721
+ "epoch": 1.1620217288615966,
722
+ "grad_norm": 9.240319326762517,
723
+ "learning_rate": 3.839566987447492e-06,
724
+ "logits": -2.2432618141174316,
725
+ "logps": -91.3159408569336,
726
+ "loss": 0.2584,
727
+ "objective": 0.25683078169822693,
728
+ "ranking_idealized": 0.5708333253860474,
729
+ "ranking_idealized_expo": 0.49166667461395264,
730
+ "ranking_simple": 0.8166666626930237,
731
+ "regularize": 0.25683078169822693,
732
+ "step": 205
733
+ },
734
+ {
735
+ "dpo_loss": 0.24292893707752228,
736
+ "dpo_wo_beta": -0.8205318450927734,
737
+ "epoch": 1.1903637222484649,
738
+ "grad_norm": 9.283856100785183,
739
+ "learning_rate": 3.7690437739662928e-06,
740
+ "logits": -2.2361652851104736,
741
+ "logps": -90.6613998413086,
742
+ "loss": 0.2551,
743
+ "objective": 0.24292893707752228,
744
+ "ranking_idealized": 0.5833333134651184,
745
+ "ranking_idealized_expo": 0.5,
746
+ "ranking_simple": 0.800000011920929,
747
+ "regularize": 0.24292893707752228,
748
+ "step": 210
749
+ },
750
+ {
751
+ "epoch": 1.201700519603212,
752
+ "eval_dpo_loss": 0.7525234222412109,
753
+ "eval_dpo_wo_beta": -3.7749528884887695,
754
+ "eval_logits": -2.267778158187866,
755
+ "eval_logps": -98.14269256591797,
756
+ "eval_loss": 0.7350714206695557,
757
+ "eval_objective": 0.7525234222412109,
758
+ "eval_ranking_idealized": 0.5888429880142212,
759
+ "eval_ranking_idealized_expo": 0.5092975497245789,
760
+ "eval_ranking_simple": 0.5371900796890259,
761
+ "eval_regularize": 0.7525234222412109,
762
+ "eval_runtime": 210.8898,
763
+ "eval_samples_per_second": 27.455,
764
+ "eval_steps_per_second": 1.148,
765
+ "step": 212
766
+ },
767
+ {
768
+ "dpo_loss": 0.289533793926239,
769
+ "dpo_wo_beta": -0.8810125589370728,
770
+ "epoch": 1.2187057156353331,
771
+ "grad_norm": 10.72372972136692,
772
+ "learning_rate": 3.697132879750174e-06,
773
+ "logits": -2.1757090091705322,
774
+ "logps": -93.64250183105469,
775
+ "loss": 0.2578,
776
+ "objective": 0.289533793926239,
777
+ "ranking_idealized": 0.5583333373069763,
778
+ "ranking_idealized_expo": 0.4833333194255829,
779
+ "ranking_simple": 0.7875000238418579,
780
+ "regularize": 0.289533793926239,
781
+ "step": 215
782
+ },
783
+ {
784
+ "dpo_loss": 0.25134381651878357,
785
+ "dpo_wo_beta": -0.8703542947769165,
786
+ "epoch": 1.2470477090222012,
787
+ "grad_norm": 12.940604838816247,
788
+ "learning_rate": 3.6239129383061764e-06,
789
+ "logits": -2.121750593185425,
790
+ "logps": -94.44015502929688,
791
+ "loss": 0.2676,
792
+ "objective": 0.25134381651878357,
793
+ "ranking_idealized": 0.6541666388511658,
794
+ "ranking_idealized_expo": 0.6041666865348816,
795
+ "ranking_simple": 0.8208333253860474,
796
+ "regularize": 0.25134381651878357,
797
+ "step": 220
798
+ },
799
+ {
800
+ "dpo_loss": 0.23937886953353882,
801
+ "dpo_wo_beta": -0.7396827936172485,
802
+ "epoch": 1.2753897024090695,
803
+ "grad_norm": 9.645711793319885,
804
+ "learning_rate": 3.5494640145652647e-06,
805
+ "logits": -2.0901684761047363,
806
+ "logps": -94.10260772705078,
807
+ "loss": 0.2637,
808
+ "objective": 0.23937886953353882,
809
+ "ranking_idealized": 0.6166666746139526,
810
+ "ranking_idealized_expo": 0.5333333611488342,
811
+ "ranking_simple": 0.8541666865348816,
812
+ "regularize": 0.23937886953353882,
813
+ "step": 225
814
+ },
815
+ {
816
+ "dpo_loss": 0.2818019688129425,
817
+ "dpo_wo_beta": -1.1170729398727417,
818
+ "epoch": 1.3037316957959377,
819
+ "grad_norm": 8.80210598601974,
820
+ "learning_rate": 3.4738675173325008e-06,
821
+ "logits": -1.9860222339630127,
822
+ "logps": -92.9978256225586,
823
+ "loss": 0.2776,
824
+ "objective": 0.2818019688129425,
825
+ "ranking_idealized": 0.5791666507720947,
826
+ "ranking_idealized_expo": 0.5041666626930237,
827
+ "ranking_simple": 0.7749999761581421,
828
+ "regularize": 0.2818019688129425,
829
+ "step": 230
830
+ },
831
+ {
832
+ "dpo_loss": 0.22621506452560425,
833
+ "dpo_wo_beta": -0.35843732953071594,
834
+ "epoch": 1.3320736891828058,
835
+ "grad_norm": 9.267612473930496,
836
+ "learning_rate": 3.397206110267713e-06,
837
+ "logits": -2.1131467819213867,
838
+ "logps": -87.49403381347656,
839
+ "loss": 0.2618,
840
+ "objective": 0.22621506452560425,
841
+ "ranking_idealized": 0.6041666865348816,
842
+ "ranking_idealized_expo": 0.4833333194255829,
843
+ "ranking_simple": 0.8333333134651184,
844
+ "regularize": 0.22621506452560425,
845
+ "step": 235
846
+ },
847
+ {
848
+ "dpo_loss": 0.23632274568080902,
849
+ "dpo_wo_beta": -0.6697984933853149,
850
+ "epoch": 1.360415682569674,
851
+ "grad_norm": 10.68594080832048,
852
+ "learning_rate": 3.3195636214939943e-06,
853
+ "logits": -2.130047559738159,
854
+ "logps": -91.7619857788086,
855
+ "loss": 0.2584,
856
+ "objective": 0.23632274568080902,
857
+ "ranking_idealized": 0.5541666746139526,
858
+ "ranking_idealized_expo": 0.4791666567325592,
859
+ "ranking_simple": 0.8125,
860
+ "regularize": 0.23632274568080902,
861
+ "step": 240
862
+ },
863
+ {
864
+ "dpo_loss": 0.2982023358345032,
865
+ "dpo_wo_beta": -1.1124054193496704,
866
+ "epoch": 1.3887576759565423,
867
+ "grad_norm": 10.330360151122868,
868
+ "learning_rate": 3.2410249519328848e-06,
869
+ "logits": -2.1718757152557373,
870
+ "logps": -93.45353698730469,
871
+ "loss": 0.2692,
872
+ "objective": 0.2982023358345032,
873
+ "ranking_idealized": 0.5916666388511658,
874
+ "ranking_idealized_expo": 0.5208333134651184,
875
+ "ranking_simple": 0.8041666746139526,
876
+ "regularize": 0.2982023358345032,
877
+ "step": 245
878
+ },
879
+ {
880
+ "dpo_loss": 0.2403133064508438,
881
+ "dpo_wo_beta": -0.7000442147254944,
882
+ "epoch": 1.4170996693434104,
883
+ "grad_norm": 13.026578288520353,
884
+ "learning_rate": 3.1616759824664543e-06,
885
+ "logits": -2.145325183868408,
886
+ "logps": -94.18195343017578,
887
+ "loss": 0.269,
888
+ "objective": 0.2403133064508438,
889
+ "ranking_idealized": 0.574999988079071,
890
+ "ranking_idealized_expo": 0.47083333134651184,
891
+ "ranking_simple": 0.824999988079071,
892
+ "regularize": 0.2403133064508438,
893
+ "step": 250
894
+ },
895
+ {
896
+ "dpo_loss": 0.23977436125278473,
897
+ "dpo_wo_beta": -0.5784927010536194,
898
+ "epoch": 1.4454416627302786,
899
+ "grad_norm": 10.959901566104394,
900
+ "learning_rate": 3.081603480027826e-06,
901
+ "logits": -2.108074426651001,
902
+ "logps": -94.5383529663086,
903
+ "loss": 0.2625,
904
+ "objective": 0.23977436125278473,
905
+ "ranking_idealized": 0.6166666746139526,
906
+ "ranking_idealized_expo": 0.5458333492279053,
907
+ "ranking_simple": 0.8374999761581421,
908
+ "regularize": 0.23977436125278473,
909
+ "step": 255
910
+ },
911
+ {
912
+ "dpo_loss": 0.25297579169273376,
913
+ "dpo_wo_beta": -0.7996426820755005,
914
+ "epoch": 1.473783656117147,
915
+ "grad_norm": 9.578050078679867,
916
+ "learning_rate": 3.0008950027228035e-06,
917
+ "logits": -2.1828908920288086,
918
+ "logps": -92.77781677246094,
919
+ "loss": 0.232,
920
+ "objective": 0.25297579169273376,
921
+ "ranking_idealized": 0.5666666626930237,
922
+ "ranking_idealized_expo": 0.48750001192092896,
923
+ "ranking_simple": 0.8041666746139526,
924
+ "regularize": 0.25297579169273376,
925
+ "step": 260
926
+ },
927
+ {
928
+ "dpo_loss": 0.2830916941165924,
929
+ "dpo_wo_beta": -1.124144434928894,
930
+ "epoch": 1.5021256495040152,
931
+ "grad_norm": 11.27765707111355,
932
+ "learning_rate": 2.9196388040863695e-06,
933
+ "logits": -2.1150081157684326,
934
+ "logps": -95.04662322998047,
935
+ "loss": 0.2623,
936
+ "objective": 0.2830916941165924,
937
+ "ranking_idealized": 0.5791666507720947,
938
+ "ranking_idealized_expo": 0.5083333253860474,
939
+ "ranking_simple": 0.8041666746139526,
940
+ "regularize": 0.2830916941165924,
941
+ "step": 265
942
+ },
943
+ {
944
+ "epoch": 1.5021256495040152,
945
+ "eval_dpo_loss": 0.7739136815071106,
946
+ "eval_dpo_wo_beta": -4.163427829742432,
947
+ "eval_logits": -2.1478331089019775,
948
+ "eval_logps": -100.8313217163086,
949
+ "eval_loss": 0.7400166392326355,
950
+ "eval_objective": 0.7739136815071106,
951
+ "eval_ranking_idealized": 0.5888429880142212,
952
+ "eval_ranking_idealized_expo": 0.5092975497245789,
953
+ "eval_ranking_simple": 0.53925621509552,
954
+ "eval_regularize": 0.7739136815071106,
955
+ "eval_runtime": 210.8657,
956
+ "eval_samples_per_second": 27.458,
957
+ "eval_steps_per_second": 1.148,
958
+ "step": 265
959
+ },
960
+ {
961
+ "dpo_loss": 0.2930367887020111,
962
+ "dpo_wo_beta": -1.3651045560836792,
963
+ "epoch": 1.5304676428908834,
964
+ "grad_norm": 11.715215816813723,
965
+ "learning_rate": 2.8379237365787426e-06,
966
+ "logits": -2.035703182220459,
967
+ "logps": -97.7331771850586,
968
+ "loss": 0.253,
969
+ "objective": 0.2930367887020111,
970
+ "ranking_idealized": 0.574999988079071,
971
+ "ranking_idealized_expo": 0.5083333253860474,
972
+ "ranking_simple": 0.7916666865348816,
973
+ "regularize": 0.2930367887020111,
974
+ "step": 270
975
+ },
976
+ {
977
+ "dpo_loss": 0.24886849522590637,
978
+ "dpo_wo_beta": -0.8069366216659546,
979
+ "epoch": 1.5588096362777515,
980
+ "grad_norm": 8.958944325794365,
981
+ "learning_rate": 2.7558391544265127e-06,
982
+ "logits": -1.9700883626937866,
983
+ "logps": -97.53855895996094,
984
+ "loss": 0.2491,
985
+ "objective": 0.24886849522590637,
986
+ "ranking_idealized": 0.6208333373069763,
987
+ "ranking_idealized_expo": 0.5249999761581421,
988
+ "ranking_simple": 0.8125,
989
+ "regularize": 0.24886849522590637,
990
+ "step": 275
991
+ },
992
+ {
993
+ "dpo_loss": 0.22936613857746124,
994
+ "dpo_wo_beta": -0.6120084524154663,
995
+ "epoch": 1.5871516296646198,
996
+ "grad_norm": 10.814739938498821,
997
+ "learning_rate": 2.6734748159151104e-06,
998
+ "logits": -1.9118597507476807,
999
+ "logps": -98.06639099121094,
1000
+ "loss": 0.2491,
1001
+ "objective": 0.22936613857746124,
1002
+ "ranking_idealized": 0.574999988079071,
1003
+ "ranking_idealized_expo": 0.5041666626930237,
1004
+ "ranking_simple": 0.8374999761581421,
1005
+ "regularize": 0.22936613857746124,
1006
+ "step": 280
1007
+ },
1008
+ {
1009
+ "dpo_loss": 0.22401383519172668,
1010
+ "dpo_wo_beta": -0.5180224776268005,
1011
+ "epoch": 1.615493623051488,
1012
+ "grad_norm": 11.270657822712987,
1013
+ "learning_rate": 2.5909207852394363e-06,
1014
+ "logits": -1.9585484266281128,
1015
+ "logps": -100.70836639404297,
1016
+ "loss": 0.2348,
1017
+ "objective": 0.22401383519172668,
1018
+ "ranking_idealized": 0.5874999761581421,
1019
+ "ranking_idealized_expo": 0.512499988079071,
1020
+ "ranking_simple": 0.8291666507720947,
1021
+ "regularize": 0.22401383519172668,
1022
+ "step": 285
1023
+ },
1024
+ {
1025
+ "dpo_loss": 0.2646006941795349,
1026
+ "dpo_wo_beta": -0.7763135433197021,
1027
+ "epoch": 1.643835616438356,
1028
+ "grad_norm": 10.585292794409252,
1029
+ "learning_rate": 2.508267334019988e-06,
1030
+ "logits": -1.9566444158554077,
1031
+ "logps": -97.0122299194336,
1032
+ "loss": 0.2532,
1033
+ "objective": 0.2646006941795349,
1034
+ "ranking_idealized": 0.612500011920929,
1035
+ "ranking_idealized_expo": 0.5625,
1036
+ "ranking_simple": 0.8041666746139526,
1037
+ "regularize": 0.2646006941795349,
1038
+ "step": 290
1039
+ },
1040
+ {
1041
+ "dpo_loss": 0.23113909363746643,
1042
+ "dpo_wo_beta": -0.6497251987457275,
1043
+ "epoch": 1.6721776098252243,
1044
+ "grad_norm": 11.90240881956814,
1045
+ "learning_rate": 2.4256048425921693e-06,
1046
+ "logits": -1.8574607372283936,
1047
+ "logps": -94.91531372070312,
1048
+ "loss": 0.2476,
1049
+ "objective": 0.23113909363746643,
1050
+ "ranking_idealized": 0.5791666507720947,
1051
+ "ranking_idealized_expo": 0.4749999940395355,
1052
+ "ranking_simple": 0.8333333134651184,
1053
+ "regularize": 0.23113909363746643,
1054
+ "step": 295
1055
+ },
1056
+ {
1057
+ "dpo_loss": 0.22116926312446594,
1058
+ "dpo_wo_beta": -0.6268281936645508,
1059
+ "epoch": 1.7005196032120926,
1060
+ "grad_norm": 11.745161783871675,
1061
+ "learning_rate": 2.3430237011767166e-06,
1062
+ "logits": -1.895004153251648,
1063
+ "logps": -97.79885864257812,
1064
+ "loss": 0.2266,
1065
+ "objective": 0.22116926312446594,
1066
+ "ranking_idealized": 0.6583333611488342,
1067
+ "ranking_idealized_expo": 0.574999988079071,
1068
+ "ranking_simple": 0.8333333134651184,
1069
+ "regularize": 0.22116926312446594,
1070
+ "step": 300
1071
+ },
1072
+ {
1073
+ "dpo_loss": 0.24756571650505066,
1074
+ "dpo_wo_beta": -0.9131773114204407,
1075
+ "epoch": 1.7288615965989607,
1076
+ "grad_norm": 12.299641904512029,
1077
+ "learning_rate": 2.2606142110393248e-06,
1078
+ "logits": -1.8061485290527344,
1079
+ "logps": -96.69060516357422,
1080
+ "loss": 0.2379,
1081
+ "objective": 0.24756571650505066,
1082
+ "ranking_idealized": 0.5625,
1083
+ "ranking_idealized_expo": 0.5083333253860474,
1084
+ "ranking_simple": 0.8291666507720947,
1085
+ "regularize": 0.24756571650505066,
1086
+ "step": 305
1087
+ },
1088
+ {
1089
+ "dpo_loss": 0.2321903556585312,
1090
+ "dpo_wo_beta": -0.6867564916610718,
1091
+ "epoch": 1.7572035899858292,
1092
+ "grad_norm": 13.489735935272718,
1093
+ "learning_rate": 2.1784664857475356e-06,
1094
+ "logits": -1.8388514518737793,
1095
+ "logps": -95.04447937011719,
1096
+ "loss": 0.2456,
1097
+ "objective": 0.2321903556585312,
1098
+ "ranking_idealized": 0.6083333492279053,
1099
+ "ranking_idealized_expo": 0.5249999761581421,
1100
+ "ranking_simple": 0.824999988079071,
1101
+ "regularize": 0.2321903556585312,
1102
+ "step": 310
1103
+ },
1104
+ {
1105
+ "dpo_loss": 0.2901044189929962,
1106
+ "dpo_wo_beta": -1.1286156177520752,
1107
+ "epoch": 1.7855455833726972,
1108
+ "grad_norm": 10.887596324980125,
1109
+ "learning_rate": 2.096670352632873e-06,
1110
+ "logits": -1.75984525680542,
1111
+ "logps": -94.63612365722656,
1112
+ "loss": 0.2571,
1113
+ "objective": 0.2901044189929962,
1114
+ "ranking_idealized": 0.6041666865348816,
1115
+ "ranking_idealized_expo": 0.5083333253860474,
1116
+ "ranking_simple": 0.8083333373069763,
1117
+ "regularize": 0.2901044189929962,
1118
+ "step": 315
1119
+ },
1120
+ {
1121
+ "epoch": 1.8025507794048181,
1122
+ "eval_dpo_loss": 0.7664583325386047,
1123
+ "eval_dpo_wo_beta": -4.09501838684082,
1124
+ "eval_logits": -1.9888346195220947,
1125
+ "eval_logps": -102.3712158203125,
1126
+ "eval_loss": 0.7400712966918945,
1127
+ "eval_objective": 0.7664583325386047,
1128
+ "eval_ranking_idealized": 0.5888429880142212,
1129
+ "eval_ranking_idealized_expo": 0.5092975497245789,
1130
+ "eval_ranking_simple": 0.53925621509552,
1131
+ "eval_regularize": 0.7664583325386047,
1132
+ "eval_runtime": 210.274,
1133
+ "eval_samples_per_second": 27.535,
1134
+ "eval_steps_per_second": 1.151,
1135
+ "step": 318
1136
+ },
1137
+ {
1138
+ "dpo_loss": 0.2219768464565277,
1139
+ "dpo_wo_beta": -0.47742757201194763,
1140
+ "epoch": 1.8138875767595655,
1141
+ "grad_norm": 11.029480506309918,
1142
+ "learning_rate": 2.01531525456598e-06,
1143
+ "logits": -1.9175788164138794,
1144
+ "logps": -99.74655151367188,
1145
+ "loss": 0.2404,
1146
+ "objective": 0.2219768464565277,
1147
+ "ranking_idealized": 0.6000000238418579,
1148
+ "ranking_idealized_expo": 0.5,
1149
+ "ranking_simple": 0.824999988079071,
1150
+ "regularize": 0.2219768464565277,
1151
+ "step": 320
1152
+ },
1153
+ {
1154
+ "dpo_loss": 0.24908211827278137,
1155
+ "dpo_wo_beta": -0.8014059066772461,
1156
+ "epoch": 1.8422295701464337,
1157
+ "grad_norm": 12.92850322071669,
1158
+ "learning_rate": 1.93449015215215e-06,
1159
+ "logits": -2.0084919929504395,
1160
+ "logps": -101.09780883789062,
1161
+ "loss": 0.2586,
1162
+ "objective": 0.24908211827278137,
1163
+ "ranking_idealized": 0.6333333253860474,
1164
+ "ranking_idealized_expo": 0.5166666507720947,
1165
+ "ranking_simple": 0.8333333134651184,
1166
+ "regularize": 0.24908211827278137,
1167
+ "step": 325
1168
+ },
1169
+ {
1170
+ "dpo_loss": 0.1984507441520691,
1171
+ "dpo_wo_beta": -0.3766098618507385,
1172
+ "epoch": 1.8705715635333018,
1173
+ "grad_norm": 10.415606016359964,
1174
+ "learning_rate": 1.8542834264542091e-06,
1175
+ "logits": -1.851909875869751,
1176
+ "logps": -94.5366439819336,
1177
+ "loss": 0.2496,
1178
+ "objective": 0.1984507441520691,
1179
+ "ranking_idealized": 0.574999988079071,
1180
+ "ranking_idealized_expo": 0.4791666567325592,
1181
+ "ranking_simple": 0.8291666507720947,
1182
+ "regularize": 0.1984507441520691,
1183
+ "step": 330
1184
+ },
1185
+ {
1186
+ "dpo_loss": 0.26707762479782104,
1187
+ "dpo_wo_beta": -0.9339324831962585,
1188
+ "epoch": 1.89891355692017,
1189
+ "grad_norm": 10.078352873471246,
1190
+ "learning_rate": 1.7747827823491253e-06,
1191
+ "logits": -1.9827288389205933,
1192
+ "logps": -94.26249694824219,
1193
+ "loss": 0.2463,
1194
+ "objective": 0.26707762479782104,
1195
+ "ranking_idealized": 0.5958333611488342,
1196
+ "ranking_idealized_expo": 0.4958333373069763,
1197
+ "ranking_simple": 0.8125,
1198
+ "regularize": 0.26707762479782104,
1199
+ "step": 335
1200
+ },
1201
+ {
1202
+ "dpo_loss": 0.2447831928730011,
1203
+ "dpo_wo_beta": -0.7387041449546814,
1204
+ "epoch": 1.9272555503070383,
1205
+ "grad_norm": 10.88136655004607,
1206
+ "learning_rate": 1.6960751526240122e-06,
1207
+ "logits": -1.9671465158462524,
1208
+ "logps": -98.63937377929688,
1209
+ "loss": 0.2399,
1210
+ "objective": 0.2447831928730011,
1211
+ "ranking_idealized": 0.6416666507720947,
1212
+ "ranking_idealized_expo": 0.5416666865348816,
1213
+ "ranking_simple": 0.8583333492279053,
1214
+ "regularize": 0.2447831928730011,
1215
+ "step": 340
1216
+ },
1217
+ {
1218
+ "dpo_loss": 0.2123527079820633,
1219
+ "dpo_wo_beta": -0.5544185638427734,
1220
+ "epoch": 1.9555975436939064,
1221
+ "grad_norm": 11.18260747105762,
1222
+ "learning_rate": 1.6182466029163974e-06,
1223
+ "logits": -1.9572845697402954,
1224
+ "logps": -100.18721008300781,
1225
+ "loss": 0.2211,
1226
+ "objective": 0.2123527079820633,
1227
+ "ranking_idealized": 0.574999988079071,
1228
+ "ranking_idealized_expo": 0.5333333611488342,
1229
+ "ranking_simple": 0.8208333253860474,
1230
+ "regularize": 0.2123527079820633,
1231
+ "step": 345
1232
+ },
1233
+ {
1234
+ "dpo_loss": 0.2570362389087677,
1235
+ "dpo_wo_beta": -0.7474013566970825,
1236
+ "epoch": 1.9839395370807746,
1237
+ "grad_norm": 11.061918116138507,
1238
+ "learning_rate": 1.541382237602721e-06,
1239
+ "logits": -1.8960832357406616,
1240
+ "logps": -101.65901947021484,
1241
+ "loss": 0.2316,
1242
+ "objective": 0.2570362389087677,
1243
+ "ranking_idealized": 0.5791666507720947,
1244
+ "ranking_idealized_expo": 0.48750001192092896,
1245
+ "ranking_simple": 0.7916666865348816,
1246
+ "regularize": 0.2570362389087677,
1247
+ "step": 350
1248
+ },
1249
+ {
1250
+ "dpo_loss": 0.19961656630039215,
1251
+ "dpo_wo_beta": -0.5642960667610168,
1252
+ "epoch": 2.012281530467643,
1253
+ "grad_norm": 7.569515164252156,
1254
+ "learning_rate": 1.465566106737942e-06,
1255
+ "logits": -1.8380100727081299,
1256
+ "logps": -102.71571350097656,
1257
+ "loss": 0.2103,
1258
+ "objective": 0.19961656630039215,
1259
+ "ranking_idealized": 0.5958333611488342,
1260
+ "ranking_idealized_expo": 0.5041666626930237,
1261
+ "ranking_simple": 0.8374999761581421,
1262
+ "regularize": 0.19961656630039215,
1263
+ "step": 355
1264
+ },
1265
+ {
1266
+ "dpo_loss": 0.11018560826778412,
1267
+ "dpo_wo_beta": -0.12253165245056152,
1268
+ "epoch": 2.040623523854511,
1269
+ "grad_norm": 6.632276986432463,
1270
+ "learning_rate": 1.3908811141480408e-06,
1271
+ "logits": -1.867693543434143,
1272
+ "logps": -103.06665802001953,
1273
+ "loss": 0.118,
1274
+ "objective": 0.11018560826778412,
1275
+ "ranking_idealized": 0.6625000238418579,
1276
+ "ranking_idealized_expo": 0.5375000238418579,
1277
+ "ranking_simple": 0.9041666388511658,
1278
+ "regularize": 0.11018560826778412,
1279
+ "step": 360
1280
+ },
1281
+ {
1282
+ "dpo_loss": 0.12077057361602783,
1283
+ "dpo_wo_beta": -0.197490856051445,
1284
+ "epoch": 2.0689655172413794,
1285
+ "grad_norm": 10.213186193965676,
1286
+ "learning_rate": 1.3174089267758983e-06,
1287
+ "logits": -1.8255099058151245,
1288
+ "logps": -110.3724136352539,
1289
+ "loss": 0.118,
1290
+ "objective": 0.12077057361602783,
1291
+ "ranking_idealized": 0.6000000238418579,
1292
+ "ranking_idealized_expo": 0.5083333253860474,
1293
+ "ranking_simple": 0.8666666746139526,
1294
+ "regularize": 0.12077057361602783,
1295
+ "step": 365
1296
+ },
1297
+ {
1298
+ "dpo_loss": 0.1337815225124359,
1299
+ "dpo_wo_beta": -0.27523547410964966,
1300
+ "epoch": 2.0973075106282475,
1301
+ "grad_norm": 9.926730675582434,
1302
+ "learning_rate": 1.245229885379699e-06,
1303
+ "logits": -1.7588540315628052,
1304
+ "logps": -111.99506378173828,
1305
+ "loss": 0.1227,
1306
+ "objective": 0.1337815225124359,
1307
+ "ranking_idealized": 0.5708333253860474,
1308
+ "ranking_idealized_expo": 0.4791666567325592,
1309
+ "ranking_simple": 0.8916666507720947,
1310
+ "regularize": 0.1337815225124359,
1311
+ "step": 370
1312
+ },
1313
+ {
1314
+ "epoch": 2.1029759093056213,
1315
+ "eval_dpo_loss": 0.9223728179931641,
1316
+ "eval_dpo_wo_beta": -6.4510064125061035,
1317
+ "eval_logits": -1.8644566535949707,
1318
+ "eval_logps": -122.00161743164062,
1319
+ "eval_loss": 0.8844180107116699,
1320
+ "eval_objective": 0.9223728179931641,
1321
+ "eval_ranking_idealized": 0.5888429880142212,
1322
+ "eval_ranking_idealized_expo": 0.5092975497245789,
1323
+ "eval_ranking_simple": 0.5423553586006165,
1324
+ "eval_regularize": 0.9223728179931641,
1325
+ "eval_runtime": 210.7356,
1326
+ "eval_samples_per_second": 27.475,
1327
+ "eval_steps_per_second": 1.148,
1328
+ "step": 371
1329
+ },
1330
+ {
1331
+ "dpo_loss": 0.10664375871419907,
1332
+ "dpo_wo_beta": -0.2532973289489746,
1333
+ "epoch": 2.1256495040151155,
1334
+ "grad_norm": 9.740007111179482,
1335
+ "learning_rate": 1.1744229166814889e-06,
1336
+ "logits": -1.696647047996521,
1337
+ "logps": -118.39366149902344,
1338
+ "loss": 0.1103,
1339
+ "objective": 0.10664375871419907,
1340
+ "ranking_idealized": 0.675000011920929,
1341
+ "ranking_idealized_expo": 0.5916666388511658,
1342
+ "ranking_simple": 0.925000011920929,
1343
+ "regularize": 0.10664375871419907,
1344
+ "step": 375
1345
+ },
1346
+ {
1347
+ "dpo_loss": 0.12854978442192078,
1348
+ "dpo_wo_beta": -0.27664583921432495,
1349
+ "epoch": 2.153991497401984,
1350
+ "grad_norm": 9.699256456859702,
1351
+ "learning_rate": 1.1050654470619602e-06,
1352
+ "logits": -1.700494647026062,
1353
+ "logps": -114.1063232421875,
1354
+ "loss": 0.1208,
1355
+ "objective": 0.12854978442192078,
1356
+ "ranking_idealized": 0.6000000238418579,
1357
+ "ranking_idealized_expo": 0.5249999761581421,
1358
+ "ranking_simple": 0.8999999761581421,
1359
+ "regularize": 0.12854978442192078,
1360
+ "step": 380
1361
+ },
1362
+ {
1363
+ "dpo_loss": 0.10418140888214111,
1364
+ "dpo_wo_beta": -0.09889766573905945,
1365
+ "epoch": 2.182333490788852,
1366
+ "grad_norm": 9.620361843085416,
1367
+ "learning_rate": 1.0372333178958462e-06,
1368
+ "logits": -1.8633235692977905,
1369
+ "logps": -110.55794525146484,
1370
+ "loss": 0.1244,
1371
+ "objective": 0.10418140888214111,
1372
+ "ranking_idealized": 0.5833333134651184,
1373
+ "ranking_idealized_expo": 0.5041666626930237,
1374
+ "ranking_simple": 0.9125000238418579,
1375
+ "regularize": 0.10418140888214111,
1376
+ "step": 385
1377
+ },
1378
+ {
1379
+ "dpo_loss": 0.12462247163057327,
1380
+ "dpo_wo_beta": -0.2658768594264984,
1381
+ "epoch": 2.21067548417572,
1382
+ "grad_norm": 11.000881222201947,
1383
+ "learning_rate": 9.710007026204896e-07,
1384
+ "logits": -1.7877620458602905,
1385
+ "logps": -112.08268737792969,
1386
+ "loss": 0.1204,
1387
+ "objective": 0.12462247163057327,
1388
+ "ranking_idealized": 0.5249999761581421,
1389
+ "ranking_idealized_expo": 0.4833333194255829,
1390
+ "ranking_simple": 0.8833333253860474,
1391
+ "regularize": 0.12462247163057327,
1392
+ "step": 390
1393
+ },
1394
+ {
1395
+ "dpo_loss": 0.11183874309062958,
1396
+ "dpo_wo_beta": -0.3540593981742859,
1397
+ "epoch": 2.2390174775625886,
1398
+ "grad_norm": 8.717110295390793,
1399
+ "learning_rate": 9.064400256282757e-07,
1400
+ "logits": -1.8010636568069458,
1401
+ "logps": -110.48490142822266,
1402
+ "loss": 0.1248,
1403
+ "objective": 0.11183874309062958,
1404
+ "ranking_idealized": 0.6000000238418579,
1405
+ "ranking_idealized_expo": 0.5166666507720947,
1406
+ "ranking_simple": 0.9041666388511658,
1407
+ "regularize": 0.11183874309062958,
1408
+ "step": 395
1409
+ },
1410
+ {
1411
+ "dpo_loss": 0.12893003225326538,
1412
+ "dpo_wo_beta": -0.3680768311023712,
1413
+ "epoch": 2.2673594709494567,
1414
+ "grad_norm": 9.562073048936949,
1415
+ "learning_rate": 8.436218830716259e-07,
1416
+ "logits": -1.8909595012664795,
1417
+ "logps": -111.70219421386719,
1418
+ "loss": 0.1193,
1419
+ "objective": 0.12893003225326538,
1420
+ "ranking_idealized": 0.6625000238418579,
1421
+ "ranking_idealized_expo": 0.5708333253860474,
1422
+ "ranking_simple": 0.9166666865348816,
1423
+ "regularize": 0.12893003225326538,
1424
+ "step": 400
1425
+ },
1426
+ {
1427
+ "dpo_loss": 0.13196416199207306,
1428
+ "dpo_wo_beta": -0.17852090299129486,
1429
+ "epoch": 2.295701464336325,
1430
+ "grad_norm": 9.166021194752298,
1431
+ "learning_rate": 7.826149656671386e-07,
1432
+ "logits": -1.9320632219314575,
1433
+ "logps": -108.1246566772461,
1434
+ "loss": 0.1267,
1435
+ "objective": 0.13196416199207306,
1436
+ "ranking_idealized": 0.6416666507720947,
1437
+ "ranking_idealized_expo": 0.5583333373069763,
1438
+ "ranking_simple": 0.8958333134651184,
1439
+ "regularize": 0.13196416199207306,
1440
+ "step": 405
1441
+ },
1442
+ {
1443
+ "dpo_loss": 0.11071384698152542,
1444
+ "dpo_wo_beta": -0.1424117088317871,
1445
+ "epoch": 2.324043457723193,
1446
+ "grad_norm": 8.918983804471582,
1447
+ "learning_rate": 7.234859835833022e-07,
1448
+ "logits": -1.8304682970046997,
1449
+ "logps": -111.2301025390625,
1450
+ "loss": 0.112,
1451
+ "objective": 0.11071384698152542,
1452
+ "ranking_idealized": 0.5708333253860474,
1453
+ "ranking_idealized_expo": 0.4791666567325592,
1454
+ "ranking_simple": 0.9041666388511658,
1455
+ "regularize": 0.11071384698152542,
1456
+ "step": 410
1457
+ },
1458
+ {
1459
+ "dpo_loss": 0.1223960742354393,
1460
+ "dpo_wo_beta": -0.1956464648246765,
1461
+ "epoch": 2.3523854511100613,
1462
+ "grad_norm": 9.386393866562546,
1463
+ "learning_rate": 6.662995934939007e-07,
1464
+ "logits": -1.8708041906356812,
1465
+ "logps": -111.06449890136719,
1466
+ "loss": 0.1155,
1467
+ "objective": 0.1223960742354393,
1468
+ "ranking_idealized": 0.5958333611488342,
1469
+ "ranking_idealized_expo": 0.5291666388511658,
1470
+ "ranking_simple": 0.9083333611488342,
1471
+ "regularize": 0.1223960742354393,
1472
+ "step": 415
1473
+ },
1474
+ {
1475
+ "dpo_loss": 0.12930770218372345,
1476
+ "dpo_wo_beta": -0.21560731530189514,
1477
+ "epoch": 2.3807274444969297,
1478
+ "grad_norm": 11.0131183307354,
1479
+ "learning_rate": 6.111183278768956e-07,
1480
+ "logits": -1.860797643661499,
1481
+ "logps": -113.08780670166016,
1482
+ "loss": 0.133,
1483
+ "objective": 0.12930770218372345,
1484
+ "ranking_idealized": 0.5666666626930237,
1485
+ "ranking_idealized_expo": 0.4749999940395355,
1486
+ "ranking_simple": 0.9208333492279053,
1487
+ "regularize": 0.12930770218372345,
1488
+ "step": 420
1489
+ },
1490
+ {
1491
+ "epoch": 2.403401039206424,
1492
+ "eval_dpo_loss": 0.8785684108734131,
1493
+ "eval_dpo_wo_beta": -5.887755870819092,
1494
+ "eval_logits": -2.0276894569396973,
1495
+ "eval_logps": -117.1216812133789,
1496
+ "eval_loss": 0.8447906374931335,
1497
+ "eval_objective": 0.8785684108734131,
1498
+ "eval_ranking_idealized": 0.5888429880142212,
1499
+ "eval_ranking_idealized_expo": 0.5092975497245789,
1500
+ "eval_ranking_simple": 0.5413222908973694,
1501
+ "eval_regularize": 0.8785684108734131,
1502
+ "eval_runtime": 209.8564,
1503
+ "eval_samples_per_second": 27.59,
1504
+ "eval_steps_per_second": 1.153,
1505
+ "step": 424
1506
+ },
1507
+ {
1508
+ "dpo_loss": 0.117975153028965,
1509
+ "dpo_wo_beta": -0.1884605884552002,
1510
+ "epoch": 2.413793103448276,
1511
+ "grad_norm": 11.036168833651558,
1512
+ "learning_rate": 5.580025266360764e-07,
1513
+ "logits": -1.7822004556655884,
1514
+ "logps": -114.43038177490234,
1515
+ "loss": 0.1465,
1516
+ "objective": 0.117975153028965,
1517
+ "ranking_idealized": 0.6666666865348816,
1518
+ "ranking_idealized_expo": 0.5625,
1519
+ "ranking_simple": 0.9375,
1520
+ "regularize": 0.117975153028965,
1521
+ "step": 425
1522
+ },
1523
+ {
1524
+ "dpo_loss": 0.1465020477771759,
1525
+ "dpo_wo_beta": -0.2595965266227722,
1526
+ "epoch": 2.442135096835144,
1527
+ "grad_norm": 10.595070818850646,
1528
+ "learning_rate": 5.070102711202606e-07,
1529
+ "logits": -1.8692681789398193,
1530
+ "logps": -110.2762680053711,
1531
+ "loss": 0.1276,
1532
+ "objective": 0.1465020477771759,
1533
+ "ranking_idealized": 0.5916666388511658,
1534
+ "ranking_idealized_expo": 0.5333333611488342,
1535
+ "ranking_simple": 0.8958333134651184,
1536
+ "regularize": 0.1465020477771759,
1537
+ "step": 430
1538
+ },
1539
+ {
1540
+ "dpo_loss": 0.09775053709745407,
1541
+ "dpo_wo_beta": -0.12755917012691498,
1542
+ "epoch": 2.4704770902220123,
1543
+ "grad_norm": 9.393206692367766,
1544
+ "learning_rate": 4.581973206121948e-07,
1545
+ "logits": -1.8968538045883179,
1546
+ "logps": -112.28767395019531,
1547
+ "loss": 0.1175,
1548
+ "objective": 0.09775053709745407,
1549
+ "ranking_idealized": 0.5874999761581421,
1550
+ "ranking_idealized_expo": 0.512499988079071,
1551
+ "ranking_simple": 0.9083333611488342,
1552
+ "regularize": 0.09775053709745407,
1553
+ "step": 435
1554
+ },
1555
+ {
1556
+ "dpo_loss": 0.14228057861328125,
1557
+ "dpo_wo_beta": -0.3639788329601288,
1558
+ "epoch": 2.4988190836088804,
1559
+ "grad_norm": 8.020134663378592,
1560
+ "learning_rate": 4.116170513565942e-07,
1561
+ "logits": -1.8666160106658936,
1562
+ "logps": -109.18843078613281,
1563
+ "loss": 0.1167,
1564
+ "objective": 0.14228057861328125,
1565
+ "ranking_idealized": 0.574999988079071,
1566
+ "ranking_idealized_expo": 0.5333333611488342,
1567
+ "ranking_simple": 0.875,
1568
+ "regularize": 0.14228057861328125,
1569
+ "step": 440
1570
+ },
1571
+ {
1572
+ "dpo_loss": 0.13583588600158691,
1573
+ "dpo_wo_beta": -0.2074100226163864,
1574
+ "epoch": 2.527161076995749,
1575
+ "grad_norm": 9.224367796824264,
1576
+ "learning_rate": 3.6732039819400686e-07,
1577
+ "logits": -1.8071045875549316,
1578
+ "logps": -107.2675552368164,
1579
+ "loss": 0.1319,
1580
+ "objective": 0.13583588600158691,
1581
+ "ranking_idealized": 0.5625,
1582
+ "ranking_idealized_expo": 0.49166667461395264,
1583
+ "ranking_simple": 0.8791666626930237,
1584
+ "regularize": 0.13583588600158691,
1585
+ "step": 445
1586
+ },
1587
+ {
1588
+ "dpo_loss": 0.17114870250225067,
1589
+ "dpo_wo_beta": -0.43270742893218994,
1590
+ "epoch": 2.555503070382617,
1591
+ "grad_norm": 11.265861710797749,
1592
+ "learning_rate": 3.253557988643072e-07,
1593
+ "logits": -1.9256045818328857,
1594
+ "logps": -111.20384216308594,
1595
+ "loss": 0.1288,
1596
+ "objective": 0.17114870250225067,
1597
+ "ranking_idealized": 0.5874999761581421,
1598
+ "ranking_idealized_expo": 0.5166666507720947,
1599
+ "ranking_simple": 0.862500011920929,
1600
+ "regularize": 0.17114870250225067,
1601
+ "step": 450
1602
+ },
1603
+ {
1604
+ "dpo_loss": 0.10827689617872238,
1605
+ "dpo_wo_beta": -0.1751028150320053,
1606
+ "epoch": 2.583845063769485,
1607
+ "grad_norm": 9.605136286662574,
1608
+ "learning_rate": 2.8576914104074425e-07,
1609
+ "logits": -1.9289051294326782,
1610
+ "logps": -109.37706756591797,
1611
+ "loss": 0.1168,
1612
+ "objective": 0.10827689617872238,
1613
+ "ranking_idealized": 0.5958333611488342,
1614
+ "ranking_idealized_expo": 0.4958333373069763,
1615
+ "ranking_simple": 0.9125000238418579,
1616
+ "regularize": 0.10827689617872238,
1617
+ "step": 455
1618
+ },
1619
+ {
1620
+ "dpo_loss": 0.11124877631664276,
1621
+ "dpo_wo_beta": -0.28054580092430115,
1622
+ "epoch": 2.6121870571563535,
1623
+ "grad_norm": 9.957466667064367,
1624
+ "learning_rate": 2.486037121524448e-07,
1625
+ "logits": -1.93342924118042,
1626
+ "logps": -113.2356948852539,
1627
+ "loss": 0.1169,
1628
+ "objective": 0.11124877631664276,
1629
+ "ranking_idealized": 0.6291666626930237,
1630
+ "ranking_idealized_expo": 0.5333333611488342,
1631
+ "ranking_simple": 0.9375,
1632
+ "regularize": 0.11124877631664276,
1633
+ "step": 460
1634
+ },
1635
+ {
1636
+ "dpo_loss": 0.12714476883411407,
1637
+ "dpo_wo_beta": -0.22146105766296387,
1638
+ "epoch": 2.6405290505432215,
1639
+ "grad_norm": 10.04326854921629,
1640
+ "learning_rate": 2.13900152050239e-07,
1641
+ "logits": -1.8874350786209106,
1642
+ "logps": -108.94982147216797,
1643
+ "loss": 0.1189,
1644
+ "objective": 0.12714476883411407,
1645
+ "ranking_idealized": 0.5458333492279053,
1646
+ "ranking_idealized_expo": 0.4791666567325592,
1647
+ "ranking_simple": 0.862500011920929,
1648
+ "regularize": 0.12714476883411407,
1649
+ "step": 465
1650
+ },
1651
+ {
1652
+ "dpo_loss": 0.12151040881872177,
1653
+ "dpo_wo_beta": -0.26416900753974915,
1654
+ "epoch": 2.66887104393009,
1655
+ "grad_norm": 8.777820527737605,
1656
+ "learning_rate": 1.8169640856758652e-07,
1657
+ "logits": -1.9314534664154053,
1658
+ "logps": -112.75170135498047,
1659
+ "loss": 0.1254,
1660
+ "objective": 0.12151040881872177,
1661
+ "ranking_idealized": 0.6291666626930237,
1662
+ "ranking_idealized_expo": 0.5541666746139526,
1663
+ "ranking_simple": 0.9083333611488342,
1664
+ "regularize": 0.12151040881872177,
1665
+ "step": 470
1666
+ },
1667
+ {
1668
+ "dpo_loss": 0.12749101221561432,
1669
+ "dpo_wo_beta": -0.2816121280193329,
1670
+ "epoch": 2.697213037316958,
1671
+ "grad_norm": 9.221778751171357,
1672
+ "learning_rate": 1.5202769602517514e-07,
1673
+ "logits": -1.8307260274887085,
1674
+ "logps": -109.39693450927734,
1675
+ "loss": 0.1211,
1676
+ "objective": 0.12749101221561432,
1677
+ "ranking_idealized": 0.6041666865348816,
1678
+ "ranking_idealized_expo": 0.4958333373069763,
1679
+ "ranking_simple": 0.8999999761581421,
1680
+ "regularize": 0.12749101221561432,
1681
+ "step": 475
1682
+ },
1683
+ {
1684
+ "epoch": 2.708549834671705,
1685
+ "eval_dpo_loss": 0.8738968372344971,
1686
+ "eval_dpo_wo_beta": -5.815241813659668,
1687
+ "eval_logits": -2.0271613597869873,
1688
+ "eval_logps": -116.42301177978516,
1689
+ "eval_loss": 0.8371492624282837,
1690
+ "eval_objective": 0.8738968372344971,
1691
+ "eval_ranking_idealized": 0.5888429880142212,
1692
+ "eval_ranking_idealized_expo": 0.5092975497245789,
1693
+ "eval_ranking_simple": 0.5402892827987671,
1694
+ "eval_regularize": 0.8738968372344971,
1695
+ "eval_runtime": 211.9437,
1696
+ "eval_samples_per_second": 27.319,
1697
+ "eval_steps_per_second": 1.142,
1698
+ "step": 477
1699
+ },
1700
+ {
1701
+ "dpo_loss": 0.13781045377254486,
1702
+ "dpo_wo_beta": -0.2485995590686798,
1703
+ "epoch": 2.725555030703826,
1704
+ "grad_norm": 10.971551462649595,
1705
+ "learning_rate": 1.2492645672457838e-07,
1706
+ "logits": -1.9437103271484375,
1707
+ "logps": -108.93817901611328,
1708
+ "loss": 0.1267,
1709
+ "objective": 0.13781045377254486,
1710
+ "ranking_idealized": 0.5916666388511658,
1711
+ "ranking_idealized_expo": 0.5083333253860474,
1712
+ "ranking_simple": 0.8999999761581421,
1713
+ "regularize": 0.13781045377254486,
1714
+ "step": 480
1715
+ },
1716
+ {
1717
+ "dpo_loss": 0.11082082241773605,
1718
+ "dpo_wo_beta": -0.10876031965017319,
1719
+ "epoch": 2.753897024090694,
1720
+ "grad_norm": 10.884940640535042,
1721
+ "learning_rate": 1.004223254730749e-07,
1722
+ "logits": -1.7556992769241333,
1723
+ "logps": -114.1142807006836,
1724
+ "loss": 0.1222,
1725
+ "objective": 0.11082082241773605,
1726
+ "ranking_idealized": 0.6166666746139526,
1727
+ "ranking_idealized_expo": 0.5249999761581421,
1728
+ "ranking_simple": 0.9041666388511658,
1729
+ "regularize": 0.11082082241773605,
1730
+ "step": 485
1731
+ },
1732
+ {
1733
+ "dpo_loss": 0.09154360741376877,
1734
+ "dpo_wo_beta": -0.05899694189429283,
1735
+ "epoch": 2.7822390174775626,
1736
+ "grad_norm": 11.015982469457516,
1737
+ "learning_rate": 7.854209717842231e-08,
1738
+ "logits": -1.8848822116851807,
1739
+ "logps": -110.15470886230469,
1740
+ "loss": 0.1058,
1741
+ "objective": 0.09154360741376877,
1742
+ "ranking_idealized": 0.5874999761581421,
1743
+ "ranking_idealized_expo": 0.47083333134651184,
1744
+ "ranking_simple": 0.9333333373069763,
1745
+ "regularize": 0.09154360741376877,
1746
+ "step": 490
1747
+ },
1748
+ {
1749
+ "dpo_loss": 0.10964310169219971,
1750
+ "dpo_wo_beta": -0.07648710906505585,
1751
+ "epoch": 2.8105810108644307,
1752
+ "grad_norm": 10.079416267782939,
1753
+ "learning_rate": 5.930969754901844e-08,
1754
+ "logits": -1.8575230836868286,
1755
+ "logps": -108.52234649658203,
1756
+ "loss": 0.1192,
1757
+ "objective": 0.10964310169219971,
1758
+ "ranking_idealized": 0.5583333373069763,
1759
+ "ranking_idealized_expo": 0.4749999940395355,
1760
+ "ranking_simple": 0.8666666746139526,
1761
+ "regularize": 0.10964310169219971,
1762
+ "step": 495
1763
+ },
1764
+ {
1765
+ "dpo_loss": 0.09479068219661713,
1766
+ "dpo_wo_beta": -0.03411731496453285,
1767
+ "epoch": 2.838923004251299,
1768
+ "grad_norm": 9.84080114767598,
1769
+ "learning_rate": 4.2746156931490756e-08,
1770
+ "logits": -1.8439643383026123,
1771
+ "logps": -109.77281188964844,
1772
+ "loss": 0.1213,
1773
+ "objective": 0.09479068219661713,
1774
+ "ranking_idealized": 0.6208333373069763,
1775
+ "ranking_idealized_expo": 0.5208333134651184,
1776
+ "ranking_simple": 0.925000011920929,
1777
+ "regularize": 0.09479068219661713,
1778
+ "step": 500
1779
+ },
1780
+ {
1781
+ "dpo_loss": 0.12725140154361725,
1782
+ "dpo_wo_beta": -0.18973813951015472,
1783
+ "epoch": 2.8672649976381672,
1784
+ "grad_norm": 9.973754192936779,
1785
+ "learning_rate": 2.8869587314321324e-08,
1786
+ "logits": -1.8574442863464355,
1787
+ "logps": -110.32710266113281,
1788
+ "loss": 0.132,
1789
+ "objective": 0.12725140154361725,
1790
+ "ranking_idealized": 0.6166666746139526,
1791
+ "ranking_idealized_expo": 0.5208333134651184,
1792
+ "ranking_simple": 0.8999999761581421,
1793
+ "regularize": 0.12725140154361725,
1794
+ "step": 505
1795
+ },
1796
+ {
1797
+ "dpo_loss": 0.10469380766153336,
1798
+ "dpo_wo_beta": -0.1985001415014267,
1799
+ "epoch": 2.8956069910250353,
1800
+ "grad_norm": 8.936464383287202,
1801
+ "learning_rate": 1.7695162522652352e-08,
1802
+ "logits": -1.8629390001296997,
1803
+ "logps": -113.56767272949219,
1804
+ "loss": 0.1218,
1805
+ "objective": 0.10469380766153336,
1806
+ "ranking_idealized": 0.6041666865348816,
1807
+ "ranking_idealized_expo": 0.5,
1808
+ "ranking_simple": 0.8916666507720947,
1809
+ "regularize": 0.10469380766153336,
1810
+ "step": 510
1811
+ },
1812
+ {
1813
+ "dpo_loss": 0.11260154843330383,
1814
+ "dpo_wo_beta": -0.15691885352134705,
1815
+ "epoch": 2.9239489844119038,
1816
+ "grad_norm": 9.442300088571939,
1817
+ "learning_rate": 9.235101625932885e-09,
1818
+ "logits": -1.946829915046692,
1819
+ "logps": -108.54016876220703,
1820
+ "loss": 0.1258,
1821
+ "objective": 0.11260154843330383,
1822
+ "ranking_idealized": 0.5791666507720947,
1823
+ "ranking_idealized_expo": 0.48750001192092896,
1824
+ "ranking_simple": 0.925000011920929,
1825
+ "regularize": 0.11260154843330383,
1826
+ "step": 515
1827
+ },
1828
+ {
1829
+ "dpo_loss": 0.12230218201875687,
1830
+ "dpo_wo_beta": -0.10489177703857422,
1831
+ "epoch": 2.952290977798772,
1832
+ "grad_norm": 9.279898048101137,
1833
+ "learning_rate": 3.4986555765434415e-09,
1834
+ "logits": -1.8482831716537476,
1835
+ "logps": -114.20655059814453,
1836
+ "loss": 0.1228,
1837
+ "objective": 0.12230218201875687,
1838
+ "ranking_idealized": 0.6291666626930237,
1839
+ "ranking_idealized_expo": 0.5249999761581421,
1840
+ "ranking_simple": 0.8708333373069763,
1841
+ "regularize": 0.12230218201875687,
1842
+ "step": 520
1843
+ },
1844
+ {
1845
+ "dpo_loss": 0.13335375487804413,
1846
+ "dpo_wo_beta": -0.35261282324790955,
1847
+ "epoch": 2.9806329711856403,
1848
+ "grad_norm": 11.094809681697281,
1849
+ "learning_rate": 4.920970940180958e-10,
1850
+ "logits": -1.876869797706604,
1851
+ "logps": -111.03084564208984,
1852
+ "loss": 0.1235,
1853
+ "objective": 0.13335375487804413,
1854
+ "ranking_idealized": 0.6291666626930237,
1855
+ "ranking_idealized_expo": 0.5333333611488342,
1856
+ "ranking_simple": 0.8958333134651184,
1857
+ "regularize": 0.13335375487804413,
1858
+ "step": 525
1859
+ },
1860
+ {
1861
+ "epoch": 2.9976381672177608,
1862
+ "step": 528,
1863
+ "total_flos": 0.0,
1864
+ "train_loss": 0.0,
1865
+ "train_runtime": 9.3184,
1866
+ "train_samples_per_second": 16355.433,
1867
+ "train_steps_per_second": 56.662
1868
+ }
1869
+ ],
1870
+ "logging_steps": 5,
1871
+ "max_steps": 528,
1872
+ "num_input_tokens_seen": 0,
1873
+ "num_train_epochs": 3,
1874
+ "save_steps": 53,
1875
+ "stateful_callbacks": {
1876
+ "EarlyStoppingCallback": {
1877
+ "args": {
1878
+ "early_stopping_patience": 5,
1879
+ "early_stopping_threshold": 0.0
1880
+ },
1881
+ "attributes": {
1882
+ "early_stopping_patience_counter": 0
1883
+ }
1884
+ },
1885
+ "TrainerControl": {
1886
+ "args": {
1887
+ "should_epoch_stop": false,
1888
+ "should_evaluate": false,
1889
+ "should_log": false,
1890
+ "should_save": true,
1891
+ "should_training_stop": true
1892
+ },
1893
+ "attributes": {}
1894
+ }
1895
+ },
1896
+ "total_flos": 0.0,
1897
+ "train_batch_size": 4,
1898
+ "trial_name": null,
1899
+ "trial_params": null
1900
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8abf255eab1c68e0232e5027ae55f47aeacd24e2ec19ec5a5f1507dafd0f1975
3
+ size 8120
vocab.json ADDED
The diff for this file is too large to render. See raw diff