Phương commited on
Commit
fa86854
1 Parent(s): 6bd95a9

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+
18
+ The following `bitsandbytes` quantization config was used during training:
19
+ - load_in_8bit: True
20
+ - load_in_4bit: False
21
+ - llm_int8_threshold: 6.0
22
+ - llm_int8_skip_modules: None
23
+ - llm_int8_enable_fp32_cpu_offload: False
24
+ - llm_int8_has_fp16_weight: False
25
+ - bnb_4bit_quant_type: fp4
26
+ - bnb_4bit_use_double_quant: False
27
+ - bnb_4bit_compute_dtype: float32
28
+
29
+ The following `bitsandbytes` quantization config was used during training:
30
+ - load_in_8bit: True
31
+ - load_in_4bit: False
32
+ - llm_int8_threshold: 6.0
33
+ - llm_int8_skip_modules: None
34
+ - llm_int8_enable_fp32_cpu_offload: False
35
+ - llm_int8_has_fp16_weight: False
36
+ - bnb_4bit_quant_type: fp4
37
+ - bnb_4bit_use_double_quant: False
38
+ - bnb_4bit_compute_dtype: float32
39
+ ### Framework versions
40
+
41
+ - PEFT 0.4.0
42
+ - PEFT 0.4.0
43
+
44
+ - PEFT 0.4.0
adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16aded0165ccdf34618552c9774428822fab7840e9b07e0108bd61eb8e7c0510
3
+ size 40137613
checkpoint-100/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-100/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be06210c83a582aa636921654ed1b7db1366f254673140e110ee471c1e119727
3
+ size 40137613
checkpoint-100/adapter_model/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-100/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-100/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be06210c83a582aa636921654ed1b7db1366f254673140e110ee471c1e119727
3
+ size 40137613
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e04e6f433ccc01787e935d43bcd1fddd2e75d58dc1a2d75d5041ba25a502d3c
3
+ size 10264773
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fbc44877a85dc9e31508ab5cdcb4b09e15e4ccd881628820393d3ed5e0b4726
3
+ size 14575
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4773e854daa1644290049115865b392563071f7ab7ace9dc2513bd5d4e5afa65
3
+ size 627
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,776 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "global_step": 100,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 2e-05,
13
+ "loss": 1.8562,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "learning_rate": 4e-05,
19
+ "loss": 1.8114,
20
+ "step": 2
21
+ },
22
+ {
23
+ "epoch": 0.06,
24
+ "learning_rate": 6e-05,
25
+ "loss": 1.816,
26
+ "step": 3
27
+ },
28
+ {
29
+ "epoch": 0.08,
30
+ "learning_rate": 8e-05,
31
+ "loss": 1.7654,
32
+ "step": 4
33
+ },
34
+ {
35
+ "epoch": 0.1,
36
+ "learning_rate": 0.0001,
37
+ "loss": 1.7975,
38
+ "step": 5
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "eval_loss": 1.751416563987732,
43
+ "eval_runtime": 6.1216,
44
+ "eval_samples_per_second": 0.327,
45
+ "eval_steps_per_second": 0.163,
46
+ "step": 5
47
+ },
48
+ {
49
+ "epoch": 0.12,
50
+ "learning_rate": 0.00012,
51
+ "loss": 1.88,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.14,
56
+ "learning_rate": 0.00014,
57
+ "loss": 1.7946,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.16,
62
+ "learning_rate": 0.00016,
63
+ "loss": 1.9499,
64
+ "step": 8
65
+ },
66
+ {
67
+ "epoch": 0.18,
68
+ "learning_rate": 0.00018,
69
+ "loss": 1.988,
70
+ "step": 9
71
+ },
72
+ {
73
+ "epoch": 0.2,
74
+ "learning_rate": 0.0002,
75
+ "loss": 1.6376,
76
+ "step": 10
77
+ },
78
+ {
79
+ "epoch": 0.2,
80
+ "eval_loss": 1.722676396369934,
81
+ "eval_runtime": 6.1182,
82
+ "eval_samples_per_second": 0.327,
83
+ "eval_steps_per_second": 0.163,
84
+ "step": 10
85
+ },
86
+ {
87
+ "epoch": 0.22,
88
+ "learning_rate": 0.00019997482349425066,
89
+ "loss": 1.2851,
90
+ "step": 11
91
+ },
92
+ {
93
+ "epoch": 0.24,
94
+ "learning_rate": 0.00019989930665413147,
95
+ "loss": 1.3604,
96
+ "step": 12
97
+ },
98
+ {
99
+ "epoch": 0.26,
100
+ "learning_rate": 0.0001997734875046456,
101
+ "loss": 1.7414,
102
+ "step": 13
103
+ },
104
+ {
105
+ "epoch": 0.28,
106
+ "learning_rate": 0.00019959742939952392,
107
+ "loss": 1.7797,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.3,
112
+ "learning_rate": 0.00019937122098932428,
113
+ "loss": 1.7487,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.3,
118
+ "eval_loss": 1.7023706436157227,
119
+ "eval_runtime": 6.1068,
120
+ "eval_samples_per_second": 0.328,
121
+ "eval_steps_per_second": 0.164,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.32,
126
+ "learning_rate": 0.00019909497617679348,
127
+ "loss": 1.8197,
128
+ "step": 16
129
+ },
130
+ {
131
+ "epoch": 0.34,
132
+ "learning_rate": 0.00019876883405951377,
133
+ "loss": 1.7928,
134
+ "step": 17
135
+ },
136
+ {
137
+ "epoch": 0.36,
138
+ "learning_rate": 0.00019839295885986296,
139
+ "loss": 1.7864,
140
+ "step": 18
141
+ },
142
+ {
143
+ "epoch": 0.38,
144
+ "learning_rate": 0.00019796753984232358,
145
+ "loss": 1.8278,
146
+ "step": 19
147
+ },
148
+ {
149
+ "epoch": 0.4,
150
+ "learning_rate": 0.00019749279121818235,
151
+ "loss": 1.7084,
152
+ "step": 20
153
+ },
154
+ {
155
+ "epoch": 0.4,
156
+ "eval_loss": 1.6931452751159668,
157
+ "eval_runtime": 6.1126,
158
+ "eval_samples_per_second": 0.327,
159
+ "eval_steps_per_second": 0.164,
160
+ "step": 20
161
+ },
162
+ {
163
+ "epoch": 0.42,
164
+ "learning_rate": 0.0001969689520376687,
165
+ "loss": 1.8612,
166
+ "step": 21
167
+ },
168
+ {
169
+ "epoch": 0.44,
170
+ "learning_rate": 0.00019639628606958533,
171
+ "loss": 1.8796,
172
+ "step": 22
173
+ },
174
+ {
175
+ "epoch": 0.46,
176
+ "learning_rate": 0.00019577508166849304,
177
+ "loss": 1.4313,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.48,
182
+ "learning_rate": 0.00019510565162951537,
183
+ "loss": 1.3394,
184
+ "step": 24
185
+ },
186
+ {
187
+ "epoch": 0.5,
188
+ "learning_rate": 0.00019438833303083678,
189
+ "loss": 1.8286,
190
+ "step": 25
191
+ },
192
+ {
193
+ "epoch": 0.5,
194
+ "eval_loss": 1.6889530420303345,
195
+ "eval_runtime": 6.1117,
196
+ "eval_samples_per_second": 0.327,
197
+ "eval_steps_per_second": 0.164,
198
+ "step": 25
199
+ },
200
+ {
201
+ "epoch": 0.52,
202
+ "learning_rate": 0.00019362348706397373,
203
+ "loss": 1.6983,
204
+ "step": 26
205
+ },
206
+ {
207
+ "epoch": 0.54,
208
+ "learning_rate": 0.0001928114988519039,
209
+ "loss": 1.8419,
210
+ "step": 27
211
+ },
212
+ {
213
+ "epoch": 0.56,
214
+ "learning_rate": 0.0001919527772551451,
215
+ "loss": 1.7634,
216
+ "step": 28
217
+ },
218
+ {
219
+ "epoch": 0.58,
220
+ "learning_rate": 0.00019104775466588161,
221
+ "loss": 1.6784,
222
+ "step": 29
223
+ },
224
+ {
225
+ "epoch": 0.6,
226
+ "learning_rate": 0.0001900968867902419,
227
+ "loss": 1.8443,
228
+ "step": 30
229
+ },
230
+ {
231
+ "epoch": 0.6,
232
+ "eval_loss": 1.6835517883300781,
233
+ "eval_runtime": 6.113,
234
+ "eval_samples_per_second": 0.327,
235
+ "eval_steps_per_second": 0.164,
236
+ "step": 30
237
+ },
238
+ {
239
+ "epoch": 0.62,
240
+ "learning_rate": 0.0001891006524188368,
241
+ "loss": 1.7155,
242
+ "step": 31
243
+ },
244
+ {
245
+ "epoch": 0.64,
246
+ "learning_rate": 0.0001880595531856738,
247
+ "loss": 1.8511,
248
+ "step": 32
249
+ },
250
+ {
251
+ "epoch": 0.66,
252
+ "learning_rate": 0.00018697411331556956,
253
+ "loss": 1.9756,
254
+ "step": 33
255
+ },
256
+ {
257
+ "epoch": 0.68,
258
+ "learning_rate": 0.00018584487936018661,
259
+ "loss": 1.7614,
260
+ "step": 34
261
+ },
262
+ {
263
+ "epoch": 0.7,
264
+ "learning_rate": 0.00018467241992282843,
265
+ "loss": 1.3127,
266
+ "step": 35
267
+ },
268
+ {
269
+ "epoch": 0.7,
270
+ "eval_loss": 1.6799031496047974,
271
+ "eval_runtime": 6.1132,
272
+ "eval_samples_per_second": 0.327,
273
+ "eval_steps_per_second": 0.164,
274
+ "step": 35
275
+ },
276
+ {
277
+ "epoch": 0.72,
278
+ "learning_rate": 0.00018345732537213027,
279
+ "loss": 0.8749,
280
+ "step": 36
281
+ },
282
+ {
283
+ "epoch": 0.74,
284
+ "learning_rate": 0.00018220020754479102,
285
+ "loss": 1.7892,
286
+ "step": 37
287
+ },
288
+ {
289
+ "epoch": 0.76,
290
+ "learning_rate": 0.00018090169943749476,
291
+ "loss": 1.8039,
292
+ "step": 38
293
+ },
294
+ {
295
+ "epoch": 0.78,
296
+ "learning_rate": 0.00017956245488817812,
297
+ "loss": 1.673,
298
+ "step": 39
299
+ },
300
+ {
301
+ "epoch": 0.8,
302
+ "learning_rate": 0.000178183148246803,
303
+ "loss": 1.786,
304
+ "step": 40
305
+ },
306
+ {
307
+ "epoch": 0.8,
308
+ "eval_loss": 1.6771162748336792,
309
+ "eval_runtime": 6.1006,
310
+ "eval_samples_per_second": 0.328,
311
+ "eval_steps_per_second": 0.164,
312
+ "step": 40
313
+ },
314
+ {
315
+ "epoch": 0.82,
316
+ "learning_rate": 0.0001767644740358011,
317
+ "loss": 1.7318,
318
+ "step": 41
319
+ },
320
+ {
321
+ "epoch": 0.84,
322
+ "learning_rate": 0.00017530714660036112,
323
+ "loss": 1.8771,
324
+ "step": 42
325
+ },
326
+ {
327
+ "epoch": 0.86,
328
+ "learning_rate": 0.00017381189974873407,
329
+ "loss": 1.8525,
330
+ "step": 43
331
+ },
332
+ {
333
+ "epoch": 0.88,
334
+ "learning_rate": 0.00017227948638273916,
335
+ "loss": 1.8761,
336
+ "step": 44
337
+ },
338
+ {
339
+ "epoch": 0.9,
340
+ "learning_rate": 0.00017071067811865476,
341
+ "loss": 1.8343,
342
+ "step": 45
343
+ },
344
+ {
345
+ "epoch": 0.9,
346
+ "eval_loss": 1.6742032766342163,
347
+ "eval_runtime": 6.1111,
348
+ "eval_samples_per_second": 0.327,
349
+ "eval_steps_per_second": 0.164,
350
+ "step": 45
351
+ },
352
+ {
353
+ "epoch": 0.92,
354
+ "learning_rate": 0.00016910626489868649,
355
+ "loss": 1.6979,
356
+ "step": 46
357
+ },
358
+ {
359
+ "epoch": 0.94,
360
+ "learning_rate": 0.00016746705459320745,
361
+ "loss": 1.2549,
362
+ "step": 47
363
+ },
364
+ {
365
+ "epoch": 0.96,
366
+ "learning_rate": 0.00016579387259397127,
367
+ "loss": 1.0941,
368
+ "step": 48
369
+ },
370
+ {
371
+ "epoch": 0.98,
372
+ "learning_rate": 0.0001640875613985024,
373
+ "loss": 1.7805,
374
+ "step": 49
375
+ },
376
+ {
377
+ "epoch": 1.0,
378
+ "learning_rate": 0.00016234898018587337,
379
+ "loss": 1.5179,
380
+ "step": 50
381
+ },
382
+ {
383
+ "epoch": 1.0,
384
+ "eval_loss": 1.672481656074524,
385
+ "eval_runtime": 6.1089,
386
+ "eval_samples_per_second": 0.327,
387
+ "eval_steps_per_second": 0.164,
388
+ "step": 50
389
+ },
390
+ {
391
+ "epoch": 1.02,
392
+ "learning_rate": 0.000160579004384082,
393
+ "loss": 1.7744,
394
+ "step": 51
395
+ },
396
+ {
397
+ "epoch": 1.04,
398
+ "learning_rate": 0.00015877852522924732,
399
+ "loss": 1.7504,
400
+ "step": 52
401
+ },
402
+ {
403
+ "epoch": 1.06,
404
+ "learning_rate": 0.0001569484493168452,
405
+ "loss": 1.7334,
406
+ "step": 53
407
+ },
408
+ {
409
+ "epoch": 1.08,
410
+ "learning_rate": 0.00015508969814521025,
411
+ "loss": 1.7141,
412
+ "step": 54
413
+ },
414
+ {
415
+ "epoch": 1.1,
416
+ "learning_rate": 0.00015320320765153367,
417
+ "loss": 1.8183,
418
+ "step": 55
419
+ },
420
+ {
421
+ "epoch": 1.1,
422
+ "eval_loss": 1.6722060441970825,
423
+ "eval_runtime": 6.1182,
424
+ "eval_samples_per_second": 0.327,
425
+ "eval_steps_per_second": 0.163,
426
+ "step": 55
427
+ },
428
+ {
429
+ "epoch": 1.12,
430
+ "learning_rate": 0.00015128992774059063,
431
+ "loss": 1.8504,
432
+ "step": 56
433
+ },
434
+ {
435
+ "epoch": 1.14,
436
+ "learning_rate": 0.0001493508218064347,
437
+ "loss": 1.7066,
438
+ "step": 57
439
+ },
440
+ {
441
+ "epoch": 1.16,
442
+ "learning_rate": 0.00014738686624729986,
443
+ "loss": 1.7424,
444
+ "step": 58
445
+ },
446
+ {
447
+ "epoch": 1.18,
448
+ "learning_rate": 0.00014539904997395468,
449
+ "loss": 1.8836,
450
+ "step": 59
451
+ },
452
+ {
453
+ "epoch": 1.2,
454
+ "learning_rate": 0.00014338837391175582,
455
+ "loss": 1.5515,
456
+ "step": 60
457
+ },
458
+ {
459
+ "epoch": 1.2,
460
+ "eval_loss": 1.667909860610962,
461
+ "eval_runtime": 6.1058,
462
+ "eval_samples_per_second": 0.328,
463
+ "eval_steps_per_second": 0.164,
464
+ "step": 60
465
+ },
466
+ {
467
+ "epoch": 1.22,
468
+ "learning_rate": 0.00014135585049665207,
469
+ "loss": 1.069,
470
+ "step": 61
471
+ },
472
+ {
473
+ "epoch": 1.24,
474
+ "learning_rate": 0.00013930250316539238,
475
+ "loss": 1.2383,
476
+ "step": 62
477
+ },
478
+ {
479
+ "epoch": 1.26,
480
+ "learning_rate": 0.00013722936584019453,
481
+ "loss": 1.7975,
482
+ "step": 63
483
+ },
484
+ {
485
+ "epoch": 1.28,
486
+ "learning_rate": 0.0001351374824081343,
487
+ "loss": 1.7685,
488
+ "step": 64
489
+ },
490
+ {
491
+ "epoch": 1.3,
492
+ "learning_rate": 0.00013302790619551674,
493
+ "loss": 1.831,
494
+ "step": 65
495
+ },
496
+ {
497
+ "epoch": 1.3,
498
+ "eval_loss": 1.6692527532577515,
499
+ "eval_runtime": 6.1077,
500
+ "eval_samples_per_second": 0.327,
501
+ "eval_steps_per_second": 0.164,
502
+ "step": 65
503
+ },
504
+ {
505
+ "epoch": 1.32,
506
+ "learning_rate": 0.00013090169943749476,
507
+ "loss": 1.6875,
508
+ "step": 66
509
+ },
510
+ {
511
+ "epoch": 1.34,
512
+ "learning_rate": 0.00012875993274320173,
513
+ "loss": 1.7344,
514
+ "step": 67
515
+ },
516
+ {
517
+ "epoch": 1.36,
518
+ "learning_rate": 0.00012660368455666752,
519
+ "loss": 1.6883,
520
+ "step": 68
521
+ },
522
+ {
523
+ "epoch": 1.38,
524
+ "learning_rate": 0.0001244340406137894,
525
+ "loss": 1.7168,
526
+ "step": 69
527
+ },
528
+ {
529
+ "epoch": 1.4,
530
+ "learning_rate": 0.00012225209339563145,
531
+ "loss": 1.7975,
532
+ "step": 70
533
+ },
534
+ {
535
+ "epoch": 1.4,
536
+ "eval_loss": 1.6680976152420044,
537
+ "eval_runtime": 6.1243,
538
+ "eval_samples_per_second": 0.327,
539
+ "eval_steps_per_second": 0.163,
540
+ "step": 70
541
+ },
542
+ {
543
+ "epoch": 1.42,
544
+ "learning_rate": 0.00012005894157832729,
545
+ "loss": 1.869,
546
+ "step": 71
547
+ },
548
+ {
549
+ "epoch": 1.44,
550
+ "learning_rate": 0.00011785568947986367,
551
+ "loss": 1.6853,
552
+ "step": 72
553
+ },
554
+ {
555
+ "epoch": 1.46,
556
+ "learning_rate": 0.0001156434465040231,
557
+ "loss": 1.2465,
558
+ "step": 73
559
+ },
560
+ {
561
+ "epoch": 1.48,
562
+ "learning_rate": 0.00011342332658176555,
563
+ "loss": 0.8489,
564
+ "step": 74
565
+ },
566
+ {
567
+ "epoch": 1.5,
568
+ "learning_rate": 0.00011119644761033078,
569
+ "loss": 1.7818,
570
+ "step": 75
571
+ },
572
+ {
573
+ "epoch": 1.5,
574
+ "eval_loss": 1.669010043144226,
575
+ "eval_runtime": 6.1095,
576
+ "eval_samples_per_second": 0.327,
577
+ "eval_steps_per_second": 0.164,
578
+ "step": 75
579
+ },
580
+ {
581
+ "epoch": 1.52,
582
+ "learning_rate": 0.00010896393089034336,
583
+ "loss": 1.7696,
584
+ "step": 76
585
+ },
586
+ {
587
+ "epoch": 1.54,
588
+ "learning_rate": 0.00010672690056120399,
589
+ "loss": 1.7764,
590
+ "step": 77
591
+ },
592
+ {
593
+ "epoch": 1.56,
594
+ "learning_rate": 0.00010448648303505151,
595
+ "loss": 1.7498,
596
+ "step": 78
597
+ },
598
+ {
599
+ "epoch": 1.58,
600
+ "learning_rate": 0.00010224380642958052,
601
+ "loss": 1.8195,
602
+ "step": 79
603
+ },
604
+ {
605
+ "epoch": 1.6,
606
+ "learning_rate": 0.0001,
607
+ "loss": 1.7768,
608
+ "step": 80
609
+ },
610
+ {
611
+ "epoch": 1.6,
612
+ "eval_loss": 1.667705774307251,
613
+ "eval_runtime": 6.1125,
614
+ "eval_samples_per_second": 0.327,
615
+ "eval_steps_per_second": 0.164,
616
+ "step": 80
617
+ },
618
+ {
619
+ "epoch": 1.62,
620
+ "learning_rate": 9.775619357041952e-05,
621
+ "loss": 1.9044,
622
+ "step": 81
623
+ },
624
+ {
625
+ "epoch": 1.64,
626
+ "learning_rate": 9.551351696494854e-05,
627
+ "loss": 1.8899,
628
+ "step": 82
629
+ },
630
+ {
631
+ "epoch": 1.66,
632
+ "learning_rate": 9.327309943879604e-05,
633
+ "loss": 1.9218,
634
+ "step": 83
635
+ },
636
+ {
637
+ "epoch": 1.68,
638
+ "learning_rate": 9.103606910965666e-05,
639
+ "loss": 1.7831,
640
+ "step": 84
641
+ },
642
+ {
643
+ "epoch": 1.7,
644
+ "learning_rate": 8.880355238966923e-05,
645
+ "loss": 1.3084,
646
+ "step": 85
647
+ },
648
+ {
649
+ "epoch": 1.7,
650
+ "eval_loss": 1.667492389678955,
651
+ "eval_runtime": 6.107,
652
+ "eval_samples_per_second": 0.327,
653
+ "eval_steps_per_second": 0.164,
654
+ "step": 85
655
+ },
656
+ {
657
+ "epoch": 1.72,
658
+ "learning_rate": 8.657667341823448e-05,
659
+ "loss": 0.9686,
660
+ "step": 86
661
+ },
662
+ {
663
+ "epoch": 1.74,
664
+ "learning_rate": 8.435655349597689e-05,
665
+ "loss": 1.788,
666
+ "step": 87
667
+ },
668
+ {
669
+ "epoch": 1.76,
670
+ "learning_rate": 8.214431052013634e-05,
671
+ "loss": 1.6448,
672
+ "step": 88
673
+ },
674
+ {
675
+ "epoch": 1.78,
676
+ "learning_rate": 7.994105842167273e-05,
677
+ "loss": 1.6639,
678
+ "step": 89
679
+ },
680
+ {
681
+ "epoch": 1.8,
682
+ "learning_rate": 7.774790660436858e-05,
683
+ "loss": 1.7402,
684
+ "step": 90
685
+ },
686
+ {
687
+ "epoch": 1.8,
688
+ "eval_loss": 1.6677496433258057,
689
+ "eval_runtime": 6.0974,
690
+ "eval_samples_per_second": 0.328,
691
+ "eval_steps_per_second": 0.164,
692
+ "step": 90
693
+ },
694
+ {
695
+ "epoch": 1.82,
696
+ "learning_rate": 7.556595938621058e-05,
697
+ "loss": 1.7281,
698
+ "step": 91
699
+ },
700
+ {
701
+ "epoch": 1.84,
702
+ "learning_rate": 7.339631544333249e-05,
703
+ "loss": 1.7017,
704
+ "step": 92
705
+ },
706
+ {
707
+ "epoch": 1.86,
708
+ "learning_rate": 7.124006725679828e-05,
709
+ "loss": 1.758,
710
+ "step": 93
711
+ },
712
+ {
713
+ "epoch": 1.88,
714
+ "learning_rate": 6.909830056250527e-05,
715
+ "loss": 1.7958,
716
+ "step": 94
717
+ },
718
+ {
719
+ "epoch": 1.9,
720
+ "learning_rate": 6.697209380448333e-05,
721
+ "loss": 1.8462,
722
+ "step": 95
723
+ },
724
+ {
725
+ "epoch": 1.9,
726
+ "eval_loss": 1.6652313470840454,
727
+ "eval_runtime": 6.1048,
728
+ "eval_samples_per_second": 0.328,
729
+ "eval_steps_per_second": 0.164,
730
+ "step": 95
731
+ },
732
+ {
733
+ "epoch": 1.92,
734
+ "learning_rate": 6.486251759186572e-05,
735
+ "loss": 1.8275,
736
+ "step": 96
737
+ },
738
+ {
739
+ "epoch": 1.94,
740
+ "learning_rate": 6.277063415980549e-05,
741
+ "loss": 1.4802,
742
+ "step": 97
743
+ },
744
+ {
745
+ "epoch": 1.96,
746
+ "learning_rate": 6.069749683460765e-05,
747
+ "loss": 1.1441,
748
+ "step": 98
749
+ },
750
+ {
751
+ "epoch": 1.98,
752
+ "learning_rate": 5.864414950334796e-05,
753
+ "loss": 1.7344,
754
+ "step": 99
755
+ },
756
+ {
757
+ "epoch": 2.0,
758
+ "learning_rate": 5.6611626088244194e-05,
759
+ "loss": 1.6543,
760
+ "step": 100
761
+ },
762
+ {
763
+ "epoch": 2.0,
764
+ "eval_loss": 1.6656206846237183,
765
+ "eval_runtime": 6.1082,
766
+ "eval_samples_per_second": 0.327,
767
+ "eval_steps_per_second": 0.164,
768
+ "step": 100
769
+ }
770
+ ],
771
+ "max_steps": 150,
772
+ "num_train_epochs": 3,
773
+ "total_flos": 2.8763598340286054e+17,
774
+ "trial_name": null,
775
+ "trial_params": null
776
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd8d23f042a338ad3600f5059478d68d7fca1548633272a68cc6bebfb23ad3ee
3
+ size 3899
checkpoint-120/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-120/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-120/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7390877ef5f362ceb01922c4a999718346af346ba9096ef521fbff5e1593bfa8
3
+ size 40137613
checkpoint-120/adapter_model/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-120/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-120/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7390877ef5f362ceb01922c4a999718346af346ba9096ef521fbff5e1593bfa8
3
+ size 40137613
checkpoint-120/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b53fa9d8c92560599043bb1e77b432d1a86dba073e6b826701cbeb07668c5e6b
3
+ size 10264773
checkpoint-120/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:830e818e6899ff30b431c470cff0e4bafb5e8c7e429469a6fb9dfb5272323c44
3
+ size 14575
checkpoint-120/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87c65de0f3d90fa8477a7f6d5dce812df67139fc573ed544a861f1535557a37e
3
+ size 627
checkpoint-120/trainer_state.json ADDED
@@ -0,0 +1,928 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.4,
5
+ "global_step": 120,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 2e-05,
13
+ "loss": 1.8562,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "learning_rate": 4e-05,
19
+ "loss": 1.8114,
20
+ "step": 2
21
+ },
22
+ {
23
+ "epoch": 0.06,
24
+ "learning_rate": 6e-05,
25
+ "loss": 1.816,
26
+ "step": 3
27
+ },
28
+ {
29
+ "epoch": 0.08,
30
+ "learning_rate": 8e-05,
31
+ "loss": 1.7654,
32
+ "step": 4
33
+ },
34
+ {
35
+ "epoch": 0.1,
36
+ "learning_rate": 0.0001,
37
+ "loss": 1.7975,
38
+ "step": 5
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "eval_loss": 1.751416563987732,
43
+ "eval_runtime": 6.1216,
44
+ "eval_samples_per_second": 0.327,
45
+ "eval_steps_per_second": 0.163,
46
+ "step": 5
47
+ },
48
+ {
49
+ "epoch": 0.12,
50
+ "learning_rate": 0.00012,
51
+ "loss": 1.88,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.14,
56
+ "learning_rate": 0.00014,
57
+ "loss": 1.7946,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.16,
62
+ "learning_rate": 0.00016,
63
+ "loss": 1.9499,
64
+ "step": 8
65
+ },
66
+ {
67
+ "epoch": 0.18,
68
+ "learning_rate": 0.00018,
69
+ "loss": 1.988,
70
+ "step": 9
71
+ },
72
+ {
73
+ "epoch": 0.2,
74
+ "learning_rate": 0.0002,
75
+ "loss": 1.6376,
76
+ "step": 10
77
+ },
78
+ {
79
+ "epoch": 0.2,
80
+ "eval_loss": 1.722676396369934,
81
+ "eval_runtime": 6.1182,
82
+ "eval_samples_per_second": 0.327,
83
+ "eval_steps_per_second": 0.163,
84
+ "step": 10
85
+ },
86
+ {
87
+ "epoch": 0.22,
88
+ "learning_rate": 0.00019997482349425066,
89
+ "loss": 1.2851,
90
+ "step": 11
91
+ },
92
+ {
93
+ "epoch": 0.24,
94
+ "learning_rate": 0.00019989930665413147,
95
+ "loss": 1.3604,
96
+ "step": 12
97
+ },
98
+ {
99
+ "epoch": 0.26,
100
+ "learning_rate": 0.0001997734875046456,
101
+ "loss": 1.7414,
102
+ "step": 13
103
+ },
104
+ {
105
+ "epoch": 0.28,
106
+ "learning_rate": 0.00019959742939952392,
107
+ "loss": 1.7797,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.3,
112
+ "learning_rate": 0.00019937122098932428,
113
+ "loss": 1.7487,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.3,
118
+ "eval_loss": 1.7023706436157227,
119
+ "eval_runtime": 6.1068,
120
+ "eval_samples_per_second": 0.328,
121
+ "eval_steps_per_second": 0.164,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.32,
126
+ "learning_rate": 0.00019909497617679348,
127
+ "loss": 1.8197,
128
+ "step": 16
129
+ },
130
+ {
131
+ "epoch": 0.34,
132
+ "learning_rate": 0.00019876883405951377,
133
+ "loss": 1.7928,
134
+ "step": 17
135
+ },
136
+ {
137
+ "epoch": 0.36,
138
+ "learning_rate": 0.00019839295885986296,
139
+ "loss": 1.7864,
140
+ "step": 18
141
+ },
142
+ {
143
+ "epoch": 0.38,
144
+ "learning_rate": 0.00019796753984232358,
145
+ "loss": 1.8278,
146
+ "step": 19
147
+ },
148
+ {
149
+ "epoch": 0.4,
150
+ "learning_rate": 0.00019749279121818235,
151
+ "loss": 1.7084,
152
+ "step": 20
153
+ },
154
+ {
155
+ "epoch": 0.4,
156
+ "eval_loss": 1.6931452751159668,
157
+ "eval_runtime": 6.1126,
158
+ "eval_samples_per_second": 0.327,
159
+ "eval_steps_per_second": 0.164,
160
+ "step": 20
161
+ },
162
+ {
163
+ "epoch": 0.42,
164
+ "learning_rate": 0.0001969689520376687,
165
+ "loss": 1.8612,
166
+ "step": 21
167
+ },
168
+ {
169
+ "epoch": 0.44,
170
+ "learning_rate": 0.00019639628606958533,
171
+ "loss": 1.8796,
172
+ "step": 22
173
+ },
174
+ {
175
+ "epoch": 0.46,
176
+ "learning_rate": 0.00019577508166849304,
177
+ "loss": 1.4313,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.48,
182
+ "learning_rate": 0.00019510565162951537,
183
+ "loss": 1.3394,
184
+ "step": 24
185
+ },
186
+ {
187
+ "epoch": 0.5,
188
+ "learning_rate": 0.00019438833303083678,
189
+ "loss": 1.8286,
190
+ "step": 25
191
+ },
192
+ {
193
+ "epoch": 0.5,
194
+ "eval_loss": 1.6889530420303345,
195
+ "eval_runtime": 6.1117,
196
+ "eval_samples_per_second": 0.327,
197
+ "eval_steps_per_second": 0.164,
198
+ "step": 25
199
+ },
200
+ {
201
+ "epoch": 0.52,
202
+ "learning_rate": 0.00019362348706397373,
203
+ "loss": 1.6983,
204
+ "step": 26
205
+ },
206
+ {
207
+ "epoch": 0.54,
208
+ "learning_rate": 0.0001928114988519039,
209
+ "loss": 1.8419,
210
+ "step": 27
211
+ },
212
+ {
213
+ "epoch": 0.56,
214
+ "learning_rate": 0.0001919527772551451,
215
+ "loss": 1.7634,
216
+ "step": 28
217
+ },
218
+ {
219
+ "epoch": 0.58,
220
+ "learning_rate": 0.00019104775466588161,
221
+ "loss": 1.6784,
222
+ "step": 29
223
+ },
224
+ {
225
+ "epoch": 0.6,
226
+ "learning_rate": 0.0001900968867902419,
227
+ "loss": 1.8443,
228
+ "step": 30
229
+ },
230
+ {
231
+ "epoch": 0.6,
232
+ "eval_loss": 1.6835517883300781,
233
+ "eval_runtime": 6.113,
234
+ "eval_samples_per_second": 0.327,
235
+ "eval_steps_per_second": 0.164,
236
+ "step": 30
237
+ },
238
+ {
239
+ "epoch": 0.62,
240
+ "learning_rate": 0.0001891006524188368,
241
+ "loss": 1.7155,
242
+ "step": 31
243
+ },
244
+ {
245
+ "epoch": 0.64,
246
+ "learning_rate": 0.0001880595531856738,
247
+ "loss": 1.8511,
248
+ "step": 32
249
+ },
250
+ {
251
+ "epoch": 0.66,
252
+ "learning_rate": 0.00018697411331556956,
253
+ "loss": 1.9756,
254
+ "step": 33
255
+ },
256
+ {
257
+ "epoch": 0.68,
258
+ "learning_rate": 0.00018584487936018661,
259
+ "loss": 1.7614,
260
+ "step": 34
261
+ },
262
+ {
263
+ "epoch": 0.7,
264
+ "learning_rate": 0.00018467241992282843,
265
+ "loss": 1.3127,
266
+ "step": 35
267
+ },
268
+ {
269
+ "epoch": 0.7,
270
+ "eval_loss": 1.6799031496047974,
271
+ "eval_runtime": 6.1132,
272
+ "eval_samples_per_second": 0.327,
273
+ "eval_steps_per_second": 0.164,
274
+ "step": 35
275
+ },
276
+ {
277
+ "epoch": 0.72,
278
+ "learning_rate": 0.00018345732537213027,
279
+ "loss": 0.8749,
280
+ "step": 36
281
+ },
282
+ {
283
+ "epoch": 0.74,
284
+ "learning_rate": 0.00018220020754479102,
285
+ "loss": 1.7892,
286
+ "step": 37
287
+ },
288
+ {
289
+ "epoch": 0.76,
290
+ "learning_rate": 0.00018090169943749476,
291
+ "loss": 1.8039,
292
+ "step": 38
293
+ },
294
+ {
295
+ "epoch": 0.78,
296
+ "learning_rate": 0.00017956245488817812,
297
+ "loss": 1.673,
298
+ "step": 39
299
+ },
300
+ {
301
+ "epoch": 0.8,
302
+ "learning_rate": 0.000178183148246803,
303
+ "loss": 1.786,
304
+ "step": 40
305
+ },
306
+ {
307
+ "epoch": 0.8,
308
+ "eval_loss": 1.6771162748336792,
309
+ "eval_runtime": 6.1006,
310
+ "eval_samples_per_second": 0.328,
311
+ "eval_steps_per_second": 0.164,
312
+ "step": 40
313
+ },
314
+ {
315
+ "epoch": 0.82,
316
+ "learning_rate": 0.0001767644740358011,
317
+ "loss": 1.7318,
318
+ "step": 41
319
+ },
320
+ {
321
+ "epoch": 0.84,
322
+ "learning_rate": 0.00017530714660036112,
323
+ "loss": 1.8771,
324
+ "step": 42
325
+ },
326
+ {
327
+ "epoch": 0.86,
328
+ "learning_rate": 0.00017381189974873407,
329
+ "loss": 1.8525,
330
+ "step": 43
331
+ },
332
+ {
333
+ "epoch": 0.88,
334
+ "learning_rate": 0.00017227948638273916,
335
+ "loss": 1.8761,
336
+ "step": 44
337
+ },
338
+ {
339
+ "epoch": 0.9,
340
+ "learning_rate": 0.00017071067811865476,
341
+ "loss": 1.8343,
342
+ "step": 45
343
+ },
344
+ {
345
+ "epoch": 0.9,
346
+ "eval_loss": 1.6742032766342163,
347
+ "eval_runtime": 6.1111,
348
+ "eval_samples_per_second": 0.327,
349
+ "eval_steps_per_second": 0.164,
350
+ "step": 45
351
+ },
352
+ {
353
+ "epoch": 0.92,
354
+ "learning_rate": 0.00016910626489868649,
355
+ "loss": 1.6979,
356
+ "step": 46
357
+ },
358
+ {
359
+ "epoch": 0.94,
360
+ "learning_rate": 0.00016746705459320745,
361
+ "loss": 1.2549,
362
+ "step": 47
363
+ },
364
+ {
365
+ "epoch": 0.96,
366
+ "learning_rate": 0.00016579387259397127,
367
+ "loss": 1.0941,
368
+ "step": 48
369
+ },
370
+ {
371
+ "epoch": 0.98,
372
+ "learning_rate": 0.0001640875613985024,
373
+ "loss": 1.7805,
374
+ "step": 49
375
+ },
376
+ {
377
+ "epoch": 1.0,
378
+ "learning_rate": 0.00016234898018587337,
379
+ "loss": 1.5179,
380
+ "step": 50
381
+ },
382
+ {
383
+ "epoch": 1.0,
384
+ "eval_loss": 1.672481656074524,
385
+ "eval_runtime": 6.1089,
386
+ "eval_samples_per_second": 0.327,
387
+ "eval_steps_per_second": 0.164,
388
+ "step": 50
389
+ },
390
+ {
391
+ "epoch": 1.02,
392
+ "learning_rate": 0.000160579004384082,
393
+ "loss": 1.7744,
394
+ "step": 51
395
+ },
396
+ {
397
+ "epoch": 1.04,
398
+ "learning_rate": 0.00015877852522924732,
399
+ "loss": 1.7504,
400
+ "step": 52
401
+ },
402
+ {
403
+ "epoch": 1.06,
404
+ "learning_rate": 0.0001569484493168452,
405
+ "loss": 1.7334,
406
+ "step": 53
407
+ },
408
+ {
409
+ "epoch": 1.08,
410
+ "learning_rate": 0.00015508969814521025,
411
+ "loss": 1.7141,
412
+ "step": 54
413
+ },
414
+ {
415
+ "epoch": 1.1,
416
+ "learning_rate": 0.00015320320765153367,
417
+ "loss": 1.8183,
418
+ "step": 55
419
+ },
420
+ {
421
+ "epoch": 1.1,
422
+ "eval_loss": 1.6722060441970825,
423
+ "eval_runtime": 6.1182,
424
+ "eval_samples_per_second": 0.327,
425
+ "eval_steps_per_second": 0.163,
426
+ "step": 55
427
+ },
428
+ {
429
+ "epoch": 1.12,
430
+ "learning_rate": 0.00015128992774059063,
431
+ "loss": 1.8504,
432
+ "step": 56
433
+ },
434
+ {
435
+ "epoch": 1.14,
436
+ "learning_rate": 0.0001493508218064347,
437
+ "loss": 1.7066,
438
+ "step": 57
439
+ },
440
+ {
441
+ "epoch": 1.16,
442
+ "learning_rate": 0.00014738686624729986,
443
+ "loss": 1.7424,
444
+ "step": 58
445
+ },
446
+ {
447
+ "epoch": 1.18,
448
+ "learning_rate": 0.00014539904997395468,
449
+ "loss": 1.8836,
450
+ "step": 59
451
+ },
452
+ {
453
+ "epoch": 1.2,
454
+ "learning_rate": 0.00014338837391175582,
455
+ "loss": 1.5515,
456
+ "step": 60
457
+ },
458
+ {
459
+ "epoch": 1.2,
460
+ "eval_loss": 1.667909860610962,
461
+ "eval_runtime": 6.1058,
462
+ "eval_samples_per_second": 0.328,
463
+ "eval_steps_per_second": 0.164,
464
+ "step": 60
465
+ },
466
+ {
467
+ "epoch": 1.22,
468
+ "learning_rate": 0.00014135585049665207,
469
+ "loss": 1.069,
470
+ "step": 61
471
+ },
472
+ {
473
+ "epoch": 1.24,
474
+ "learning_rate": 0.00013930250316539238,
475
+ "loss": 1.2383,
476
+ "step": 62
477
+ },
478
+ {
479
+ "epoch": 1.26,
480
+ "learning_rate": 0.00013722936584019453,
481
+ "loss": 1.7975,
482
+ "step": 63
483
+ },
484
+ {
485
+ "epoch": 1.28,
486
+ "learning_rate": 0.0001351374824081343,
487
+ "loss": 1.7685,
488
+ "step": 64
489
+ },
490
+ {
491
+ "epoch": 1.3,
492
+ "learning_rate": 0.00013302790619551674,
493
+ "loss": 1.831,
494
+ "step": 65
495
+ },
496
+ {
497
+ "epoch": 1.3,
498
+ "eval_loss": 1.6692527532577515,
499
+ "eval_runtime": 6.1077,
500
+ "eval_samples_per_second": 0.327,
501
+ "eval_steps_per_second": 0.164,
502
+ "step": 65
503
+ },
504
+ {
505
+ "epoch": 1.32,
506
+ "learning_rate": 0.00013090169943749476,
507
+ "loss": 1.6875,
508
+ "step": 66
509
+ },
510
+ {
511
+ "epoch": 1.34,
512
+ "learning_rate": 0.00012875993274320173,
513
+ "loss": 1.7344,
514
+ "step": 67
515
+ },
516
+ {
517
+ "epoch": 1.36,
518
+ "learning_rate": 0.00012660368455666752,
519
+ "loss": 1.6883,
520
+ "step": 68
521
+ },
522
+ {
523
+ "epoch": 1.38,
524
+ "learning_rate": 0.0001244340406137894,
525
+ "loss": 1.7168,
526
+ "step": 69
527
+ },
528
+ {
529
+ "epoch": 1.4,
530
+ "learning_rate": 0.00012225209339563145,
531
+ "loss": 1.7975,
532
+ "step": 70
533
+ },
534
+ {
535
+ "epoch": 1.4,
536
+ "eval_loss": 1.6680976152420044,
537
+ "eval_runtime": 6.1243,
538
+ "eval_samples_per_second": 0.327,
539
+ "eval_steps_per_second": 0.163,
540
+ "step": 70
541
+ },
542
+ {
543
+ "epoch": 1.42,
544
+ "learning_rate": 0.00012005894157832729,
545
+ "loss": 1.869,
546
+ "step": 71
547
+ },
548
+ {
549
+ "epoch": 1.44,
550
+ "learning_rate": 0.00011785568947986367,
551
+ "loss": 1.6853,
552
+ "step": 72
553
+ },
554
+ {
555
+ "epoch": 1.46,
556
+ "learning_rate": 0.0001156434465040231,
557
+ "loss": 1.2465,
558
+ "step": 73
559
+ },
560
+ {
561
+ "epoch": 1.48,
562
+ "learning_rate": 0.00011342332658176555,
563
+ "loss": 0.8489,
564
+ "step": 74
565
+ },
566
+ {
567
+ "epoch": 1.5,
568
+ "learning_rate": 0.00011119644761033078,
569
+ "loss": 1.7818,
570
+ "step": 75
571
+ },
572
+ {
573
+ "epoch": 1.5,
574
+ "eval_loss": 1.669010043144226,
575
+ "eval_runtime": 6.1095,
576
+ "eval_samples_per_second": 0.327,
577
+ "eval_steps_per_second": 0.164,
578
+ "step": 75
579
+ },
580
+ {
581
+ "epoch": 1.52,
582
+ "learning_rate": 0.00010896393089034336,
583
+ "loss": 1.7696,
584
+ "step": 76
585
+ },
586
+ {
587
+ "epoch": 1.54,
588
+ "learning_rate": 0.00010672690056120399,
589
+ "loss": 1.7764,
590
+ "step": 77
591
+ },
592
+ {
593
+ "epoch": 1.56,
594
+ "learning_rate": 0.00010448648303505151,
595
+ "loss": 1.7498,
596
+ "step": 78
597
+ },
598
+ {
599
+ "epoch": 1.58,
600
+ "learning_rate": 0.00010224380642958052,
601
+ "loss": 1.8195,
602
+ "step": 79
603
+ },
604
+ {
605
+ "epoch": 1.6,
606
+ "learning_rate": 0.0001,
607
+ "loss": 1.7768,
608
+ "step": 80
609
+ },
610
+ {
611
+ "epoch": 1.6,
612
+ "eval_loss": 1.667705774307251,
613
+ "eval_runtime": 6.1125,
614
+ "eval_samples_per_second": 0.327,
615
+ "eval_steps_per_second": 0.164,
616
+ "step": 80
617
+ },
618
+ {
619
+ "epoch": 1.62,
620
+ "learning_rate": 9.775619357041952e-05,
621
+ "loss": 1.9044,
622
+ "step": 81
623
+ },
624
+ {
625
+ "epoch": 1.64,
626
+ "learning_rate": 9.551351696494854e-05,
627
+ "loss": 1.8899,
628
+ "step": 82
629
+ },
630
+ {
631
+ "epoch": 1.66,
632
+ "learning_rate": 9.327309943879604e-05,
633
+ "loss": 1.9218,
634
+ "step": 83
635
+ },
636
+ {
637
+ "epoch": 1.68,
638
+ "learning_rate": 9.103606910965666e-05,
639
+ "loss": 1.7831,
640
+ "step": 84
641
+ },
642
+ {
643
+ "epoch": 1.7,
644
+ "learning_rate": 8.880355238966923e-05,
645
+ "loss": 1.3084,
646
+ "step": 85
647
+ },
648
+ {
649
+ "epoch": 1.7,
650
+ "eval_loss": 1.667492389678955,
651
+ "eval_runtime": 6.107,
652
+ "eval_samples_per_second": 0.327,
653
+ "eval_steps_per_second": 0.164,
654
+ "step": 85
655
+ },
656
+ {
657
+ "epoch": 1.72,
658
+ "learning_rate": 8.657667341823448e-05,
659
+ "loss": 0.9686,
660
+ "step": 86
661
+ },
662
+ {
663
+ "epoch": 1.74,
664
+ "learning_rate": 8.435655349597689e-05,
665
+ "loss": 1.788,
666
+ "step": 87
667
+ },
668
+ {
669
+ "epoch": 1.76,
670
+ "learning_rate": 8.214431052013634e-05,
671
+ "loss": 1.6448,
672
+ "step": 88
673
+ },
674
+ {
675
+ "epoch": 1.78,
676
+ "learning_rate": 7.994105842167273e-05,
677
+ "loss": 1.6639,
678
+ "step": 89
679
+ },
680
+ {
681
+ "epoch": 1.8,
682
+ "learning_rate": 7.774790660436858e-05,
683
+ "loss": 1.7402,
684
+ "step": 90
685
+ },
686
+ {
687
+ "epoch": 1.8,
688
+ "eval_loss": 1.6677496433258057,
689
+ "eval_runtime": 6.0974,
690
+ "eval_samples_per_second": 0.328,
691
+ "eval_steps_per_second": 0.164,
692
+ "step": 90
693
+ },
694
+ {
695
+ "epoch": 1.82,
696
+ "learning_rate": 7.556595938621058e-05,
697
+ "loss": 1.7281,
698
+ "step": 91
699
+ },
700
+ {
701
+ "epoch": 1.84,
702
+ "learning_rate": 7.339631544333249e-05,
703
+ "loss": 1.7017,
704
+ "step": 92
705
+ },
706
+ {
707
+ "epoch": 1.86,
708
+ "learning_rate": 7.124006725679828e-05,
709
+ "loss": 1.758,
710
+ "step": 93
711
+ },
712
+ {
713
+ "epoch": 1.88,
714
+ "learning_rate": 6.909830056250527e-05,
715
+ "loss": 1.7958,
716
+ "step": 94
717
+ },
718
+ {
719
+ "epoch": 1.9,
720
+ "learning_rate": 6.697209380448333e-05,
721
+ "loss": 1.8462,
722
+ "step": 95
723
+ },
724
+ {
725
+ "epoch": 1.9,
726
+ "eval_loss": 1.6652313470840454,
727
+ "eval_runtime": 6.1048,
728
+ "eval_samples_per_second": 0.328,
729
+ "eval_steps_per_second": 0.164,
730
+ "step": 95
731
+ },
732
+ {
733
+ "epoch": 1.92,
734
+ "learning_rate": 6.486251759186572e-05,
735
+ "loss": 1.8275,
736
+ "step": 96
737
+ },
738
+ {
739
+ "epoch": 1.94,
740
+ "learning_rate": 6.277063415980549e-05,
741
+ "loss": 1.4802,
742
+ "step": 97
743
+ },
744
+ {
745
+ "epoch": 1.96,
746
+ "learning_rate": 6.069749683460765e-05,
747
+ "loss": 1.1441,
748
+ "step": 98
749
+ },
750
+ {
751
+ "epoch": 1.98,
752
+ "learning_rate": 5.864414950334796e-05,
753
+ "loss": 1.7344,
754
+ "step": 99
755
+ },
756
+ {
757
+ "epoch": 2.0,
758
+ "learning_rate": 5.6611626088244194e-05,
759
+ "loss": 1.6543,
760
+ "step": 100
761
+ },
762
+ {
763
+ "epoch": 2.0,
764
+ "eval_loss": 1.6656206846237183,
765
+ "eval_runtime": 6.1082,
766
+ "eval_samples_per_second": 0.327,
767
+ "eval_steps_per_second": 0.164,
768
+ "step": 100
769
+ },
770
+ {
771
+ "epoch": 2.02,
772
+ "learning_rate": 5.4600950026045326e-05,
773
+ "loss": 1.7825,
774
+ "step": 101
775
+ },
776
+ {
777
+ "epoch": 2.04,
778
+ "learning_rate": 5.261313375270014e-05,
779
+ "loss": 1.7521,
780
+ "step": 102
781
+ },
782
+ {
783
+ "epoch": 2.06,
784
+ "learning_rate": 5.0649178193565314e-05,
785
+ "loss": 1.8365,
786
+ "step": 103
787
+ },
788
+ {
789
+ "epoch": 2.08,
790
+ "learning_rate": 4.87100722594094e-05,
791
+ "loss": 1.738,
792
+ "step": 104
793
+ },
794
+ {
795
+ "epoch": 2.1,
796
+ "learning_rate": 4.6796792348466356e-05,
797
+ "loss": 1.6954,
798
+ "step": 105
799
+ },
800
+ {
801
+ "epoch": 2.1,
802
+ "eval_loss": 1.6637686491012573,
803
+ "eval_runtime": 6.1031,
804
+ "eval_samples_per_second": 0.328,
805
+ "eval_steps_per_second": 0.164,
806
+ "step": 105
807
+ },
808
+ {
809
+ "epoch": 2.12,
810
+ "learning_rate": 4.491030185478976e-05,
811
+ "loss": 1.7503,
812
+ "step": 106
813
+ },
814
+ {
815
+ "epoch": 2.14,
816
+ "learning_rate": 4.305155068315481e-05,
817
+ "loss": 1.7676,
818
+ "step": 107
819
+ },
820
+ {
821
+ "epoch": 2.16,
822
+ "learning_rate": 4.12214747707527e-05,
823
+ "loss": 1.8407,
824
+ "step": 108
825
+ },
826
+ {
827
+ "epoch": 2.18,
828
+ "learning_rate": 3.942099561591802e-05,
829
+ "loss": 1.7692,
830
+ "step": 109
831
+ },
832
+ {
833
+ "epoch": 2.2,
834
+ "learning_rate": 3.7651019814126654e-05,
835
+ "loss": 1.8048,
836
+ "step": 110
837
+ },
838
+ {
839
+ "epoch": 2.2,
840
+ "eval_loss": 1.663559913635254,
841
+ "eval_runtime": 6.1093,
842
+ "eval_samples_per_second": 0.327,
843
+ "eval_steps_per_second": 0.164,
844
+ "step": 110
845
+ },
846
+ {
847
+ "epoch": 2.22,
848
+ "learning_rate": 3.591243860149759e-05,
849
+ "loss": 1.3587,
850
+ "step": 111
851
+ },
852
+ {
853
+ "epoch": 2.24,
854
+ "learning_rate": 3.4206127406028745e-05,
855
+ "loss": 1.2887,
856
+ "step": 112
857
+ },
858
+ {
859
+ "epoch": 2.26,
860
+ "learning_rate": 3.253294540679257e-05,
861
+ "loss": 1.7741,
862
+ "step": 113
863
+ },
864
+ {
865
+ "epoch": 2.28,
866
+ "learning_rate": 3.089373510131354e-05,
867
+ "loss": 1.609,
868
+ "step": 114
869
+ },
870
+ {
871
+ "epoch": 2.3,
872
+ "learning_rate": 2.9289321881345254e-05,
873
+ "loss": 1.8432,
874
+ "step": 115
875
+ },
876
+ {
877
+ "epoch": 2.3,
878
+ "eval_loss": 1.664933443069458,
879
+ "eval_runtime": 6.1066,
880
+ "eval_samples_per_second": 0.328,
881
+ "eval_steps_per_second": 0.164,
882
+ "step": 115
883
+ },
884
+ {
885
+ "epoch": 2.32,
886
+ "learning_rate": 2.7720513617260856e-05,
887
+ "loss": 1.6705,
888
+ "step": 116
889
+ },
890
+ {
891
+ "epoch": 2.34,
892
+ "learning_rate": 2.6188100251265945e-05,
893
+ "loss": 1.8165,
894
+ "step": 117
895
+ },
896
+ {
897
+ "epoch": 2.36,
898
+ "learning_rate": 2.4692853399638917e-05,
899
+ "loss": 1.806,
900
+ "step": 118
901
+ },
902
+ {
903
+ "epoch": 2.38,
904
+ "learning_rate": 2.323552596419889e-05,
905
+ "loss": 1.8827,
906
+ "step": 119
907
+ },
908
+ {
909
+ "epoch": 2.4,
910
+ "learning_rate": 2.181685175319702e-05,
911
+ "loss": 1.833,
912
+ "step": 120
913
+ },
914
+ {
915
+ "epoch": 2.4,
916
+ "eval_loss": 1.665120005607605,
917
+ "eval_runtime": 6.1061,
918
+ "eval_samples_per_second": 0.328,
919
+ "eval_steps_per_second": 0.164,
920
+ "step": 120
921
+ }
922
+ ],
923
+ "max_steps": 150,
924
+ "num_train_epochs": 3,
925
+ "total_flos": 3.4890977913687245e+17,
926
+ "trial_name": null,
927
+ "trial_params": null
928
+ }
checkpoint-120/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd8d23f042a338ad3600f5059478d68d7fca1548633272a68cc6bebfb23ad3ee
3
+ size 3899
checkpoint-140/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-140/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-140/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66e47731578763be5a30885b9433fca267c5a009659ab58b385f5ebabdf23ddb
3
+ size 40137613
checkpoint-140/adapter_model/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-140/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": null,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 8,
11
+ "lora_dropout": 0.0,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 4,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "gate_proj",
18
+ "down_proj",
19
+ "up_proj",
20
+ "q_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "o_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-140/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66e47731578763be5a30885b9433fca267c5a009659ab58b385f5ebabdf23ddb
3
+ size 40137613
checkpoint-140/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a168336e9a5699ed5a530876fdf4ac993ac8c0f67ce70c747459c18d8f6b065
3
+ size 10264773
checkpoint-140/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51f3c17c1e7ce128318be2166e96b6dda4d1ff79c8b44e6fb06488bf0bbfcf8d
3
+ size 14575
checkpoint-140/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0900a6fba1d6009e7b51215cdc66f0d705cb3e1d22579b7333dd467a9a360671
3
+ size 627
checkpoint-140/trainer_state.json ADDED
@@ -0,0 +1,1080 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.8,
5
+ "global_step": 140,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.02,
12
+ "learning_rate": 2e-05,
13
+ "loss": 1.8562,
14
+ "step": 1
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "learning_rate": 4e-05,
19
+ "loss": 1.8114,
20
+ "step": 2
21
+ },
22
+ {
23
+ "epoch": 0.06,
24
+ "learning_rate": 6e-05,
25
+ "loss": 1.816,
26
+ "step": 3
27
+ },
28
+ {
29
+ "epoch": 0.08,
30
+ "learning_rate": 8e-05,
31
+ "loss": 1.7654,
32
+ "step": 4
33
+ },
34
+ {
35
+ "epoch": 0.1,
36
+ "learning_rate": 0.0001,
37
+ "loss": 1.7975,
38
+ "step": 5
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "eval_loss": 1.751416563987732,
43
+ "eval_runtime": 6.1216,
44
+ "eval_samples_per_second": 0.327,
45
+ "eval_steps_per_second": 0.163,
46
+ "step": 5
47
+ },
48
+ {
49
+ "epoch": 0.12,
50
+ "learning_rate": 0.00012,
51
+ "loss": 1.88,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.14,
56
+ "learning_rate": 0.00014,
57
+ "loss": 1.7946,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.16,
62
+ "learning_rate": 0.00016,
63
+ "loss": 1.9499,
64
+ "step": 8
65
+ },
66
+ {
67
+ "epoch": 0.18,
68
+ "learning_rate": 0.00018,
69
+ "loss": 1.988,
70
+ "step": 9
71
+ },
72
+ {
73
+ "epoch": 0.2,
74
+ "learning_rate": 0.0002,
75
+ "loss": 1.6376,
76
+ "step": 10
77
+ },
78
+ {
79
+ "epoch": 0.2,
80
+ "eval_loss": 1.722676396369934,
81
+ "eval_runtime": 6.1182,
82
+ "eval_samples_per_second": 0.327,
83
+ "eval_steps_per_second": 0.163,
84
+ "step": 10
85
+ },
86
+ {
87
+ "epoch": 0.22,
88
+ "learning_rate": 0.00019997482349425066,
89
+ "loss": 1.2851,
90
+ "step": 11
91
+ },
92
+ {
93
+ "epoch": 0.24,
94
+ "learning_rate": 0.00019989930665413147,
95
+ "loss": 1.3604,
96
+ "step": 12
97
+ },
98
+ {
99
+ "epoch": 0.26,
100
+ "learning_rate": 0.0001997734875046456,
101
+ "loss": 1.7414,
102
+ "step": 13
103
+ },
104
+ {
105
+ "epoch": 0.28,
106
+ "learning_rate": 0.00019959742939952392,
107
+ "loss": 1.7797,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.3,
112
+ "learning_rate": 0.00019937122098932428,
113
+ "loss": 1.7487,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.3,
118
+ "eval_loss": 1.7023706436157227,
119
+ "eval_runtime": 6.1068,
120
+ "eval_samples_per_second": 0.328,
121
+ "eval_steps_per_second": 0.164,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 0.32,
126
+ "learning_rate": 0.00019909497617679348,
127
+ "loss": 1.8197,
128
+ "step": 16
129
+ },
130
+ {
131
+ "epoch": 0.34,
132
+ "learning_rate": 0.00019876883405951377,
133
+ "loss": 1.7928,
134
+ "step": 17
135
+ },
136
+ {
137
+ "epoch": 0.36,
138
+ "learning_rate": 0.00019839295885986296,
139
+ "loss": 1.7864,
140
+ "step": 18
141
+ },
142
+ {
143
+ "epoch": 0.38,
144
+ "learning_rate": 0.00019796753984232358,
145
+ "loss": 1.8278,
146
+ "step": 19
147
+ },
148
+ {
149
+ "epoch": 0.4,
150
+ "learning_rate": 0.00019749279121818235,
151
+ "loss": 1.7084,
152
+ "step": 20
153
+ },
154
+ {
155
+ "epoch": 0.4,
156
+ "eval_loss": 1.6931452751159668,
157
+ "eval_runtime": 6.1126,
158
+ "eval_samples_per_second": 0.327,
159
+ "eval_steps_per_second": 0.164,
160
+ "step": 20
161
+ },
162
+ {
163
+ "epoch": 0.42,
164
+ "learning_rate": 0.0001969689520376687,
165
+ "loss": 1.8612,
166
+ "step": 21
167
+ },
168
+ {
169
+ "epoch": 0.44,
170
+ "learning_rate": 0.00019639628606958533,
171
+ "loss": 1.8796,
172
+ "step": 22
173
+ },
174
+ {
175
+ "epoch": 0.46,
176
+ "learning_rate": 0.00019577508166849304,
177
+ "loss": 1.4313,
178
+ "step": 23
179
+ },
180
+ {
181
+ "epoch": 0.48,
182
+ "learning_rate": 0.00019510565162951537,
183
+ "loss": 1.3394,
184
+ "step": 24
185
+ },
186
+ {
187
+ "epoch": 0.5,
188
+ "learning_rate": 0.00019438833303083678,
189
+ "loss": 1.8286,
190
+ "step": 25
191
+ },
192
+ {
193
+ "epoch": 0.5,
194
+ "eval_loss": 1.6889530420303345,
195
+ "eval_runtime": 6.1117,
196
+ "eval_samples_per_second": 0.327,
197
+ "eval_steps_per_second": 0.164,
198
+ "step": 25
199
+ },
200
+ {
201
+ "epoch": 0.52,
202
+ "learning_rate": 0.00019362348706397373,
203
+ "loss": 1.6983,
204
+ "step": 26
205
+ },
206
+ {
207
+ "epoch": 0.54,
208
+ "learning_rate": 0.0001928114988519039,
209
+ "loss": 1.8419,
210
+ "step": 27
211
+ },
212
+ {
213
+ "epoch": 0.56,
214
+ "learning_rate": 0.0001919527772551451,
215
+ "loss": 1.7634,
216
+ "step": 28
217
+ },
218
+ {
219
+ "epoch": 0.58,
220
+ "learning_rate": 0.00019104775466588161,
221
+ "loss": 1.6784,
222
+ "step": 29
223
+ },
224
+ {
225
+ "epoch": 0.6,
226
+ "learning_rate": 0.0001900968867902419,
227
+ "loss": 1.8443,
228
+ "step": 30
229
+ },
230
+ {
231
+ "epoch": 0.6,
232
+ "eval_loss": 1.6835517883300781,
233
+ "eval_runtime": 6.113,
234
+ "eval_samples_per_second": 0.327,
235
+ "eval_steps_per_second": 0.164,
236
+ "step": 30
237
+ },
238
+ {
239
+ "epoch": 0.62,
240
+ "learning_rate": 0.0001891006524188368,
241
+ "loss": 1.7155,
242
+ "step": 31
243
+ },
244
+ {
245
+ "epoch": 0.64,
246
+ "learning_rate": 0.0001880595531856738,
247
+ "loss": 1.8511,
248
+ "step": 32
249
+ },
250
+ {
251
+ "epoch": 0.66,
252
+ "learning_rate": 0.00018697411331556956,
253
+ "loss": 1.9756,
254
+ "step": 33
255
+ },
256
+ {
257
+ "epoch": 0.68,
258
+ "learning_rate": 0.00018584487936018661,
259
+ "loss": 1.7614,
260
+ "step": 34
261
+ },
262
+ {
263
+ "epoch": 0.7,
264
+ "learning_rate": 0.00018467241992282843,
265
+ "loss": 1.3127,
266
+ "step": 35
267
+ },
268
+ {
269
+ "epoch": 0.7,
270
+ "eval_loss": 1.6799031496047974,
271
+ "eval_runtime": 6.1132,
272
+ "eval_samples_per_second": 0.327,
273
+ "eval_steps_per_second": 0.164,
274
+ "step": 35
275
+ },
276
+ {
277
+ "epoch": 0.72,
278
+ "learning_rate": 0.00018345732537213027,
279
+ "loss": 0.8749,
280
+ "step": 36
281
+ },
282
+ {
283
+ "epoch": 0.74,
284
+ "learning_rate": 0.00018220020754479102,
285
+ "loss": 1.7892,
286
+ "step": 37
287
+ },
288
+ {
289
+ "epoch": 0.76,
290
+ "learning_rate": 0.00018090169943749476,
291
+ "loss": 1.8039,
292
+ "step": 38
293
+ },
294
+ {
295
+ "epoch": 0.78,
296
+ "learning_rate": 0.00017956245488817812,
297
+ "loss": 1.673,
298
+ "step": 39
299
+ },
300
+ {
301
+ "epoch": 0.8,
302
+ "learning_rate": 0.000178183148246803,
303
+ "loss": 1.786,
304
+ "step": 40
305
+ },
306
+ {
307
+ "epoch": 0.8,
308
+ "eval_loss": 1.6771162748336792,
309
+ "eval_runtime": 6.1006,
310
+ "eval_samples_per_second": 0.328,
311
+ "eval_steps_per_second": 0.164,
312
+ "step": 40
313
+ },
314
+ {
315
+ "epoch": 0.82,
316
+ "learning_rate": 0.0001767644740358011,
317
+ "loss": 1.7318,
318
+ "step": 41
319
+ },
320
+ {
321
+ "epoch": 0.84,
322
+ "learning_rate": 0.00017530714660036112,
323
+ "loss": 1.8771,
324
+ "step": 42
325
+ },
326
+ {
327
+ "epoch": 0.86,
328
+ "learning_rate": 0.00017381189974873407,
329
+ "loss": 1.8525,
330
+ "step": 43
331
+ },
332
+ {
333
+ "epoch": 0.88,
334
+ "learning_rate": 0.00017227948638273916,
335
+ "loss": 1.8761,
336
+ "step": 44
337
+ },
338
+ {
339
+ "epoch": 0.9,
340
+ "learning_rate": 0.00017071067811865476,
341
+ "loss": 1.8343,
342
+ "step": 45
343
+ },
344
+ {
345
+ "epoch": 0.9,
346
+ "eval_loss": 1.6742032766342163,
347
+ "eval_runtime": 6.1111,
348
+ "eval_samples_per_second": 0.327,
349
+ "eval_steps_per_second": 0.164,
350
+ "step": 45
351
+ },
352
+ {
353
+ "epoch": 0.92,
354
+ "learning_rate": 0.00016910626489868649,
355
+ "loss": 1.6979,
356
+ "step": 46
357
+ },
358
+ {
359
+ "epoch": 0.94,
360
+ "learning_rate": 0.00016746705459320745,
361
+ "loss": 1.2549,
362
+ "step": 47
363
+ },
364
+ {
365
+ "epoch": 0.96,
366
+ "learning_rate": 0.00016579387259397127,
367
+ "loss": 1.0941,
368
+ "step": 48
369
+ },
370
+ {
371
+ "epoch": 0.98,
372
+ "learning_rate": 0.0001640875613985024,
373
+ "loss": 1.7805,
374
+ "step": 49
375
+ },
376
+ {
377
+ "epoch": 1.0,
378
+ "learning_rate": 0.00016234898018587337,
379
+ "loss": 1.5179,
380
+ "step": 50
381
+ },
382
+ {
383
+ "epoch": 1.0,
384
+ "eval_loss": 1.672481656074524,
385
+ "eval_runtime": 6.1089,
386
+ "eval_samples_per_second": 0.327,
387
+ "eval_steps_per_second": 0.164,
388
+ "step": 50
389
+ },
390
+ {
391
+ "epoch": 1.02,
392
+ "learning_rate": 0.000160579004384082,
393
+ "loss": 1.7744,
394
+ "step": 51
395
+ },
396
+ {
397
+ "epoch": 1.04,
398
+ "learning_rate": 0.00015877852522924732,
399
+ "loss": 1.7504,
400
+ "step": 52
401
+ },
402
+ {
403
+ "epoch": 1.06,
404
+ "learning_rate": 0.0001569484493168452,
405
+ "loss": 1.7334,
406
+ "step": 53
407
+ },
408
+ {
409
+ "epoch": 1.08,
410
+ "learning_rate": 0.00015508969814521025,
411
+ "loss": 1.7141,
412
+ "step": 54
413
+ },
414
+ {
415
+ "epoch": 1.1,
416
+ "learning_rate": 0.00015320320765153367,
417
+ "loss": 1.8183,
418
+ "step": 55
419
+ },
420
+ {
421
+ "epoch": 1.1,
422
+ "eval_loss": 1.6722060441970825,
423
+ "eval_runtime": 6.1182,
424
+ "eval_samples_per_second": 0.327,
425
+ "eval_steps_per_second": 0.163,
426
+ "step": 55
427
+ },
428
+ {
429
+ "epoch": 1.12,
430
+ "learning_rate": 0.00015128992774059063,
431
+ "loss": 1.8504,
432
+ "step": 56
433
+ },
434
+ {
435
+ "epoch": 1.14,
436
+ "learning_rate": 0.0001493508218064347,
437
+ "loss": 1.7066,
438
+ "step": 57
439
+ },
440
+ {
441
+ "epoch": 1.16,
442
+ "learning_rate": 0.00014738686624729986,
443
+ "loss": 1.7424,
444
+ "step": 58
445
+ },
446
+ {
447
+ "epoch": 1.18,
448
+ "learning_rate": 0.00014539904997395468,
449
+ "loss": 1.8836,
450
+ "step": 59
451
+ },
452
+ {
453
+ "epoch": 1.2,
454
+ "learning_rate": 0.00014338837391175582,
455
+ "loss": 1.5515,
456
+ "step": 60
457
+ },
458
+ {
459
+ "epoch": 1.2,
460
+ "eval_loss": 1.667909860610962,
461
+ "eval_runtime": 6.1058,
462
+ "eval_samples_per_second": 0.328,
463
+ "eval_steps_per_second": 0.164,
464
+ "step": 60
465
+ },
466
+ {
467
+ "epoch": 1.22,
468
+ "learning_rate": 0.00014135585049665207,
469
+ "loss": 1.069,
470
+ "step": 61
471
+ },
472
+ {
473
+ "epoch": 1.24,
474
+ "learning_rate": 0.00013930250316539238,
475
+ "loss": 1.2383,
476
+ "step": 62
477
+ },
478
+ {
479
+ "epoch": 1.26,
480
+ "learning_rate": 0.00013722936584019453,
481
+ "loss": 1.7975,
482
+ "step": 63
483
+ },
484
+ {
485
+ "epoch": 1.28,
486
+ "learning_rate": 0.0001351374824081343,
487
+ "loss": 1.7685,
488
+ "step": 64
489
+ },
490
+ {
491
+ "epoch": 1.3,
492
+ "learning_rate": 0.00013302790619551674,
493
+ "loss": 1.831,
494
+ "step": 65
495
+ },
496
+ {
497
+ "epoch": 1.3,
498
+ "eval_loss": 1.6692527532577515,
499
+ "eval_runtime": 6.1077,
500
+ "eval_samples_per_second": 0.327,
501
+ "eval_steps_per_second": 0.164,
502
+ "step": 65
503
+ },
504
+ {
505
+ "epoch": 1.32,
506
+ "learning_rate": 0.00013090169943749476,
507
+ "loss": 1.6875,
508
+ "step": 66
509
+ },
510
+ {
511
+ "epoch": 1.34,
512
+ "learning_rate": 0.00012875993274320173,
513
+ "loss": 1.7344,
514
+ "step": 67
515
+ },
516
+ {
517
+ "epoch": 1.36,
518
+ "learning_rate": 0.00012660368455666752,
519
+ "loss": 1.6883,
520
+ "step": 68
521
+ },
522
+ {
523
+ "epoch": 1.38,
524
+ "learning_rate": 0.0001244340406137894,
525
+ "loss": 1.7168,
526
+ "step": 69
527
+ },
528
+ {
529
+ "epoch": 1.4,
530
+ "learning_rate": 0.00012225209339563145,
531
+ "loss": 1.7975,
532
+ "step": 70
533
+ },
534
+ {
535
+ "epoch": 1.4,
536
+ "eval_loss": 1.6680976152420044,
537
+ "eval_runtime": 6.1243,
538
+ "eval_samples_per_second": 0.327,
539
+ "eval_steps_per_second": 0.163,
540
+ "step": 70
541
+ },
542
+ {
543
+ "epoch": 1.42,
544
+ "learning_rate": 0.00012005894157832729,
545
+ "loss": 1.869,
546
+ "step": 71
547
+ },
548
+ {
549
+ "epoch": 1.44,
550
+ "learning_rate": 0.00011785568947986367,
551
+ "loss": 1.6853,
552
+ "step": 72
553
+ },
554
+ {
555
+ "epoch": 1.46,
556
+ "learning_rate": 0.0001156434465040231,
557
+ "loss": 1.2465,
558
+ "step": 73
559
+ },
560
+ {
561
+ "epoch": 1.48,
562
+ "learning_rate": 0.00011342332658176555,
563
+ "loss": 0.8489,
564
+ "step": 74
565
+ },
566
+ {
567
+ "epoch": 1.5,
568
+ "learning_rate": 0.00011119644761033078,
569
+ "loss": 1.7818,
570
+ "step": 75
571
+ },
572
+ {
573
+ "epoch": 1.5,
574
+ "eval_loss": 1.669010043144226,
575
+ "eval_runtime": 6.1095,
576
+ "eval_samples_per_second": 0.327,
577
+ "eval_steps_per_second": 0.164,
578
+ "step": 75
579
+ },
580
+ {
581
+ "epoch": 1.52,
582
+ "learning_rate": 0.00010896393089034336,
583
+ "loss": 1.7696,
584
+ "step": 76
585
+ },
586
+ {
587
+ "epoch": 1.54,
588
+ "learning_rate": 0.00010672690056120399,
589
+ "loss": 1.7764,
590
+ "step": 77
591
+ },
592
+ {
593
+ "epoch": 1.56,
594
+ "learning_rate": 0.00010448648303505151,
595
+ "loss": 1.7498,
596
+ "step": 78
597
+ },
598
+ {
599
+ "epoch": 1.58,
600
+ "learning_rate": 0.00010224380642958052,
601
+ "loss": 1.8195,
602
+ "step": 79
603
+ },
604
+ {
605
+ "epoch": 1.6,
606
+ "learning_rate": 0.0001,
607
+ "loss": 1.7768,
608
+ "step": 80
609
+ },
610
+ {
611
+ "epoch": 1.6,
612
+ "eval_loss": 1.667705774307251,
613
+ "eval_runtime": 6.1125,
614
+ "eval_samples_per_second": 0.327,
615
+ "eval_steps_per_second": 0.164,
616
+ "step": 80
617
+ },
618
+ {
619
+ "epoch": 1.62,
620
+ "learning_rate": 9.775619357041952e-05,
621
+ "loss": 1.9044,
622
+ "step": 81
623
+ },
624
+ {
625
+ "epoch": 1.64,
626
+ "learning_rate": 9.551351696494854e-05,
627
+ "loss": 1.8899,
628
+ "step": 82
629
+ },
630
+ {
631
+ "epoch": 1.66,
632
+ "learning_rate": 9.327309943879604e-05,
633
+ "loss": 1.9218,
634
+ "step": 83
635
+ },
636
+ {
637
+ "epoch": 1.68,
638
+ "learning_rate": 9.103606910965666e-05,
639
+ "loss": 1.7831,
640
+ "step": 84
641
+ },
642
+ {
643
+ "epoch": 1.7,
644
+ "learning_rate": 8.880355238966923e-05,
645
+ "loss": 1.3084,
646
+ "step": 85
647
+ },
648
+ {
649
+ "epoch": 1.7,
650
+ "eval_loss": 1.667492389678955,
651
+ "eval_runtime": 6.107,
652
+ "eval_samples_per_second": 0.327,
653
+ "eval_steps_per_second": 0.164,
654
+ "step": 85
655
+ },
656
+ {
657
+ "epoch": 1.72,
658
+ "learning_rate": 8.657667341823448e-05,
659
+ "loss": 0.9686,
660
+ "step": 86
661
+ },
662
+ {
663
+ "epoch": 1.74,
664
+ "learning_rate": 8.435655349597689e-05,
665
+ "loss": 1.788,
666
+ "step": 87
667
+ },
668
+ {
669
+ "epoch": 1.76,
670
+ "learning_rate": 8.214431052013634e-05,
671
+ "loss": 1.6448,
672
+ "step": 88
673
+ },
674
+ {
675
+ "epoch": 1.78,
676
+ "learning_rate": 7.994105842167273e-05,
677
+ "loss": 1.6639,
678
+ "step": 89
679
+ },
680
+ {
681
+ "epoch": 1.8,
682
+ "learning_rate": 7.774790660436858e-05,
683
+ "loss": 1.7402,
684
+ "step": 90
685
+ },
686
+ {
687
+ "epoch": 1.8,
688
+ "eval_loss": 1.6677496433258057,
689
+ "eval_runtime": 6.0974,
690
+ "eval_samples_per_second": 0.328,
691
+ "eval_steps_per_second": 0.164,
692
+ "step": 90
693
+ },
694
+ {
695
+ "epoch": 1.82,
696
+ "learning_rate": 7.556595938621058e-05,
697
+ "loss": 1.7281,
698
+ "step": 91
699
+ },
700
+ {
701
+ "epoch": 1.84,
702
+ "learning_rate": 7.339631544333249e-05,
703
+ "loss": 1.7017,
704
+ "step": 92
705
+ },
706
+ {
707
+ "epoch": 1.86,
708
+ "learning_rate": 7.124006725679828e-05,
709
+ "loss": 1.758,
710
+ "step": 93
711
+ },
712
+ {
713
+ "epoch": 1.88,
714
+ "learning_rate": 6.909830056250527e-05,
715
+ "loss": 1.7958,
716
+ "step": 94
717
+ },
718
+ {
719
+ "epoch": 1.9,
720
+ "learning_rate": 6.697209380448333e-05,
721
+ "loss": 1.8462,
722
+ "step": 95
723
+ },
724
+ {
725
+ "epoch": 1.9,
726
+ "eval_loss": 1.6652313470840454,
727
+ "eval_runtime": 6.1048,
728
+ "eval_samples_per_second": 0.328,
729
+ "eval_steps_per_second": 0.164,
730
+ "step": 95
731
+ },
732
+ {
733
+ "epoch": 1.92,
734
+ "learning_rate": 6.486251759186572e-05,
735
+ "loss": 1.8275,
736
+ "step": 96
737
+ },
738
+ {
739
+ "epoch": 1.94,
740
+ "learning_rate": 6.277063415980549e-05,
741
+ "loss": 1.4802,
742
+ "step": 97
743
+ },
744
+ {
745
+ "epoch": 1.96,
746
+ "learning_rate": 6.069749683460765e-05,
747
+ "loss": 1.1441,
748
+ "step": 98
749
+ },
750
+ {
751
+ "epoch": 1.98,
752
+ "learning_rate": 5.864414950334796e-05,
753
+ "loss": 1.7344,
754
+ "step": 99
755
+ },
756
+ {
757
+ "epoch": 2.0,
758
+ "learning_rate": 5.6611626088244194e-05,
759
+ "loss": 1.6543,
760
+ "step": 100
761
+ },
762
+ {
763
+ "epoch": 2.0,
764
+ "eval_loss": 1.6656206846237183,
765
+ "eval_runtime": 6.1082,
766
+ "eval_samples_per_second": 0.327,
767
+ "eval_steps_per_second": 0.164,
768
+ "step": 100
769
+ },
770
+ {
771
+ "epoch": 2.02,
772
+ "learning_rate": 5.4600950026045326e-05,
773
+ "loss": 1.7825,
774
+ "step": 101
775
+ },
776
+ {
777
+ "epoch": 2.04,
778
+ "learning_rate": 5.261313375270014e-05,
779
+ "loss": 1.7521,
780
+ "step": 102
781
+ },
782
+ {
783
+ "epoch": 2.06,
784
+ "learning_rate": 5.0649178193565314e-05,
785
+ "loss": 1.8365,
786
+ "step": 103
787
+ },
788
+ {
789
+ "epoch": 2.08,
790
+ "learning_rate": 4.87100722594094e-05,
791
+ "loss": 1.738,
792
+ "step": 104
793
+ },
794
+ {
795
+ "epoch": 2.1,
796
+ "learning_rate": 4.6796792348466356e-05,
797
+ "loss": 1.6954,
798
+ "step": 105
799
+ },
800
+ {
801
+ "epoch": 2.1,
802
+ "eval_loss": 1.6637686491012573,
803
+ "eval_runtime": 6.1031,
804
+ "eval_samples_per_second": 0.328,
805
+ "eval_steps_per_second": 0.164,
806
+ "step": 105
807
+ },
808
+ {
809
+ "epoch": 2.12,
810
+ "learning_rate": 4.491030185478976e-05,
811
+ "loss": 1.7503,
812
+ "step": 106
813
+ },
814
+ {
815
+ "epoch": 2.14,
816
+ "learning_rate": 4.305155068315481e-05,
817
+ "loss": 1.7676,
818
+ "step": 107
819
+ },
820
+ {
821
+ "epoch": 2.16,
822
+ "learning_rate": 4.12214747707527e-05,
823
+ "loss": 1.8407,
824
+ "step": 108
825
+ },
826
+ {
827
+ "epoch": 2.18,
828
+ "learning_rate": 3.942099561591802e-05,
829
+ "loss": 1.7692,
830
+ "step": 109
831
+ },
832
+ {
833
+ "epoch": 2.2,
834
+ "learning_rate": 3.7651019814126654e-05,
835
+ "loss": 1.8048,
836
+ "step": 110
837
+ },
838
+ {
839
+ "epoch": 2.2,
840
+ "eval_loss": 1.663559913635254,
841
+ "eval_runtime": 6.1093,
842
+ "eval_samples_per_second": 0.327,
843
+ "eval_steps_per_second": 0.164,
844
+ "step": 110
845
+ },
846
+ {
847
+ "epoch": 2.22,
848
+ "learning_rate": 3.591243860149759e-05,
849
+ "loss": 1.3587,
850
+ "step": 111
851
+ },
852
+ {
853
+ "epoch": 2.24,
854
+ "learning_rate": 3.4206127406028745e-05,
855
+ "loss": 1.2887,
856
+ "step": 112
857
+ },
858
+ {
859
+ "epoch": 2.26,
860
+ "learning_rate": 3.253294540679257e-05,
861
+ "loss": 1.7741,
862
+ "step": 113
863
+ },
864
+ {
865
+ "epoch": 2.28,
866
+ "learning_rate": 3.089373510131354e-05,
867
+ "loss": 1.609,
868
+ "step": 114
869
+ },
870
+ {
871
+ "epoch": 2.3,
872
+ "learning_rate": 2.9289321881345254e-05,
873
+ "loss": 1.8432,
874
+ "step": 115
875
+ },
876
+ {
877
+ "epoch": 2.3,
878
+ "eval_loss": 1.664933443069458,
879
+ "eval_runtime": 6.1066,
880
+ "eval_samples_per_second": 0.328,
881
+ "eval_steps_per_second": 0.164,
882
+ "step": 115
883
+ },
884
+ {
885
+ "epoch": 2.32,
886
+ "learning_rate": 2.7720513617260856e-05,
887
+ "loss": 1.6705,
888
+ "step": 116
889
+ },
890
+ {
891
+ "epoch": 2.34,
892
+ "learning_rate": 2.6188100251265945e-05,
893
+ "loss": 1.8165,
894
+ "step": 117
895
+ },
896
+ {
897
+ "epoch": 2.36,
898
+ "learning_rate": 2.4692853399638917e-05,
899
+ "loss": 1.806,
900
+ "step": 118
901
+ },
902
+ {
903
+ "epoch": 2.38,
904
+ "learning_rate": 2.323552596419889e-05,
905
+ "loss": 1.8827,
906
+ "step": 119
907
+ },
908
+ {
909
+ "epoch": 2.4,
910
+ "learning_rate": 2.181685175319702e-05,
911
+ "loss": 1.833,
912
+ "step": 120
913
+ },
914
+ {
915
+ "epoch": 2.4,
916
+ "eval_loss": 1.665120005607605,
917
+ "eval_runtime": 6.1061,
918
+ "eval_samples_per_second": 0.328,
919
+ "eval_steps_per_second": 0.164,
920
+ "step": 120
921
+ },
922
+ {
923
+ "epoch": 2.42,
924
+ "learning_rate": 2.043754511182191e-05,
925
+ "loss": 1.8779,
926
+ "step": 121
927
+ },
928
+ {
929
+ "epoch": 2.44,
930
+ "learning_rate": 1.9098300562505266e-05,
931
+ "loss": 1.7553,
932
+ "step": 122
933
+ },
934
+ {
935
+ "epoch": 2.46,
936
+ "learning_rate": 1.7799792455209018e-05,
937
+ "loss": 1.3448,
938
+ "step": 123
939
+ },
940
+ {
941
+ "epoch": 2.48,
942
+ "learning_rate": 1.6542674627869737e-05,
943
+ "loss": 0.967,
944
+ "step": 124
945
+ },
946
+ {
947
+ "epoch": 2.5,
948
+ "learning_rate": 1.5327580077171587e-05,
949
+ "loss": 1.7033,
950
+ "step": 125
951
+ },
952
+ {
953
+ "epoch": 2.5,
954
+ "eval_loss": 1.662558674812317,
955
+ "eval_runtime": 6.1105,
956
+ "eval_samples_per_second": 0.327,
957
+ "eval_steps_per_second": 0.164,
958
+ "step": 125
959
+ },
960
+ {
961
+ "epoch": 2.52,
962
+ "learning_rate": 1.415512063981339e-05,
963
+ "loss": 1.7094,
964
+ "step": 126
965
+ },
966
+ {
967
+ "epoch": 2.54,
968
+ "learning_rate": 1.3025886684430467e-05,
969
+ "loss": 1.6874,
970
+ "step": 127
971
+ },
972
+ {
973
+ "epoch": 2.56,
974
+ "learning_rate": 1.19404468143262e-05,
975
+ "loss": 1.6772,
976
+ "step": 128
977
+ },
978
+ {
979
+ "epoch": 2.58,
980
+ "learning_rate": 1.0899347581163221e-05,
981
+ "loss": 1.6462,
982
+ "step": 129
983
+ },
984
+ {
985
+ "epoch": 2.6,
986
+ "learning_rate": 9.903113209758096e-06,
987
+ "loss": 1.8013,
988
+ "step": 130
989
+ },
990
+ {
991
+ "epoch": 2.6,
992
+ "eval_loss": 1.6647861003875732,
993
+ "eval_runtime": 6.1104,
994
+ "eval_samples_per_second": 0.327,
995
+ "eval_steps_per_second": 0.164,
996
+ "step": 130
997
+ },
998
+ {
999
+ "epoch": 2.62,
1000
+ "learning_rate": 8.952245334118414e-06,
1001
+ "loss": 1.7317,
1002
+ "step": 131
1003
+ },
1004
+ {
1005
+ "epoch": 2.64,
1006
+ "learning_rate": 8.047222744854943e-06,
1007
+ "loss": 1.7714,
1008
+ "step": 132
1009
+ },
1010
+ {
1011
+ "epoch": 2.66,
1012
+ "learning_rate": 7.1885011480961164e-06,
1013
+ "loss": 1.8637,
1014
+ "step": 133
1015
+ },
1016
+ {
1017
+ "epoch": 2.68,
1018
+ "learning_rate": 6.37651293602628e-06,
1019
+ "loss": 1.5169,
1020
+ "step": 134
1021
+ },
1022
+ {
1023
+ "epoch": 2.7,
1024
+ "learning_rate": 5.611666969163243e-06,
1025
+ "loss": 1.2576,
1026
+ "step": 135
1027
+ },
1028
+ {
1029
+ "epoch": 2.7,
1030
+ "eval_loss": 1.6650197505950928,
1031
+ "eval_runtime": 6.1138,
1032
+ "eval_samples_per_second": 0.327,
1033
+ "eval_steps_per_second": 0.164,
1034
+ "step": 135
1035
+ },
1036
+ {
1037
+ "epoch": 2.72,
1038
+ "learning_rate": 4.8943483704846475e-06,
1039
+ "loss": 0.8812,
1040
+ "step": 136
1041
+ },
1042
+ {
1043
+ "epoch": 2.74,
1044
+ "learning_rate": 4.224918331506955e-06,
1045
+ "loss": 1.7942,
1046
+ "step": 137
1047
+ },
1048
+ {
1049
+ "epoch": 2.76,
1050
+ "learning_rate": 3.6037139304146762e-06,
1051
+ "loss": 1.6483,
1052
+ "step": 138
1053
+ },
1054
+ {
1055
+ "epoch": 2.78,
1056
+ "learning_rate": 3.0310479623313127e-06,
1057
+ "loss": 1.7521,
1058
+ "step": 139
1059
+ },
1060
+ {
1061
+ "epoch": 2.8,
1062
+ "learning_rate": 2.5072087818176382e-06,
1063
+ "loss": 1.8145,
1064
+ "step": 140
1065
+ },
1066
+ {
1067
+ "epoch": 2.8,
1068
+ "eval_loss": 1.6641619205474854,
1069
+ "eval_runtime": 6.1153,
1070
+ "eval_samples_per_second": 0.327,
1071
+ "eval_steps_per_second": 0.164,
1072
+ "step": 140
1073
+ }
1074
+ ],
1075
+ "max_steps": 150,
1076
+ "num_train_epochs": 3,
1077
+ "total_flos": 4.058841051803812e+17,
1078
+ "trial_name": null,
1079
+ "trial_params": null
1080
+ }
checkpoint-140/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd8d23f042a338ad3600f5059478d68d7fca1548633272a68cc6bebfb23ad3ee
3
+ size 3899