brettbbb committed on
Commit
af89261
1 Parent(s): 4493425

End of training

Browse files
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: llama2
3
+ base_model: lmsys/vicuna-7b-v1.5
4
+ tags:
5
+ - generated_from_trainer
6
+ model-index:
7
+ - name: mc_cot_16
8
+ results: []
9
+ ---
10
+
11
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
+ should probably proofread and complete it, then remove this comment. -->
13
+
14
+ # mc_cot_16
15
+
16
+ This model is a fine-tuned version of [lmsys/vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) on the BENBENBENb/McTest640COT dataset (16 training examples).
17
+ It achieves the following results on the evaluation set:
18
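+ - Loss: 3.0303 (final epoch; the lowest validation loss, 1.7070, was reached at epoch 4, after which validation loss rose while training loss kept falling)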
19
+
20
+ ## Model description
21
+
22
+ More information needed
23
+
24
+ ## Intended uses & limitations
25
+
26
+ More information needed
27
+
28
+ ## Training and evaluation data
29
+
30
+ More information needed
31
+
32
+ ## Training procedure
33
+
34
+ ### Training hyperparameters
35
+
36
+ The following hyperparameters were used during training:
37
+ - learning_rate: 0.0001
38
+ - train_batch_size: 4
39
+ - eval_batch_size: 8
40
+ - seed: 42
41
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
+ - lr_scheduler_type: linear
43
+ - lr_scheduler_warmup_steps: 5
44
+ - num_epochs: 20
45
+ - mixed_precision_training: Native AMP
46
+
47
+ ### Training results
48
+
49
+ | Training Loss | Epoch | Step | Validation Loss |
50
+ |:-------------:|:-----:|:----:|:---------------:|
51
+ | 2.1538 | 1.0 | 4 | 2.1552 |
52
+ | 1.7905 | 2.0 | 8 | 1.9012 |
53
+ | 1.4229 | 3.0 | 12 | 1.7686 |
54
+ | 1.3834 | 4.0 | 16 | 1.7070 |
55
+ | 1.4421 | 5.0 | 20 | 1.7308 |
56
+ | 0.9003 | 6.0 | 24 | 1.7646 |
57
+ | 0.7013 | 7.0 | 28 | 1.9070 |
58
+ | 0.6291 | 8.0 | 32 | 2.0078 |
59
+ | 0.3314 | 9.0 | 36 | 2.2682 |
60
+ | 0.1554 | 10.0 | 40 | 2.3624 |
61
+ | 0.0814 | 11.0 | 44 | 2.6523 |
62
+ | 0.0499 | 12.0 | 48 | 2.7565 |
63
+ | 0.0216 | 13.0 | 52 | 2.8505 |
64
+ | 0.0197 | 14.0 | 56 | 2.9170 |
65
+ | 0.0174 | 15.0 | 60 | 2.9433 |
66
+ | 0.0174 | 16.0 | 64 | 2.9683 |
67
+ | 0.0145 | 17.0 | 68 | 2.9966 |
68
+ | 0.013 | 18.0 | 72 | 3.0193 |
69
+ | 0.0151 | 19.0 | 76 | 3.0277 |
70
+ | 0.0145 | 20.0 | 80 | 3.0303 |
71
+
72
+
73
+ ### Framework versions
74
+
75
+ - Transformers 4.36.0.dev0
76
+ - PyTorch 2.1.0+cu121
77
+ - Datasets 2.13.1
78
+ - Tokenizers 0.14.1
adapter_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "lmsys/vicuna-7b-v1.5",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "lora_alpha": 64,
12
+ "lora_dropout": 0.1,
13
+ "modules_to_save": null,
14
+ "peft_type": "LORA",
15
+ "r": 16,
16
+ "rank_pattern": {},
17
+ "revision": null,
18
+ "target_modules": [
19
+ "o_proj",
20
+ "k_proj",
21
+ "q_proj",
22
+ "up_proj",
23
+ "down_proj",
24
+ "v_proj",
25
+ "gate_proj"
26
+ ],
27
+ "task_type": "CAUSAL_LM"
28
+ }
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7df5842fa3ab0d31a93626608e53400f6d54d99decd6da41564e2ff212ab5c1
3
+ size 160069834
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec05d360d6a52ad6dbd78a151e2524fefbc14cf4d37f79219e2476c2d718f4b4
3
+ size 159967880
all_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "train_loss": 0.6028558738762513,
4
+ "train_runtime": 922.3939,
5
+ "train_samples_per_second": 0.347,
6
+ "train_steps_per_second": 0.087
7
+ }
args.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ base_model_name: lmsys/vicuna-7b-v1.5
2
+ batch_size: 4
3
+ cot: true
4
+ dataset_name: BENBENBENb/McTest640COT
5
+ epochs: 20
6
+ eval_strategy: epoch
7
+ learning_rate: 0.0001
8
+ logging_steps: 1
9
+ output_dir: brettbbb/mc_cot_16
10
+ seed: 42
11
+ train_size: 16
12
+ warmup_steps: 5
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "train_loss": 0.6028558738762513,
4
+ "train_runtime": 922.3939,
5
+ "train_samples_per_second": 0.347,
6
+ "train_steps_per_second": 0.087
7
+ }
trainer_state.json ADDED
@@ -0,0 +1,668 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 20.0,
5
+ "eval_steps": 500,
6
+ "global_step": 80,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.25,
13
+ "learning_rate": 2e-05,
14
+ "loss": 2.2339,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.5,
19
+ "learning_rate": 4e-05,
20
+ "loss": 2.1977,
21
+ "step": 2
22
+ },
23
+ {
24
+ "epoch": 0.75,
25
+ "learning_rate": 6e-05,
26
+ "loss": 2.3018,
27
+ "step": 3
28
+ },
29
+ {
30
+ "epoch": 1.0,
31
+ "learning_rate": 8e-05,
32
+ "loss": 2.1538,
33
+ "step": 4
34
+ },
35
+ {
36
+ "epoch": 1.0,
37
+ "eval_loss": 2.1552114486694336,
38
+ "eval_runtime": 32.1888,
39
+ "eval_samples_per_second": 3.728,
40
+ "eval_steps_per_second": 0.466,
41
+ "step": 4
42
+ },
43
+ {
44
+ "epoch": 1.25,
45
+ "learning_rate": 0.0001,
46
+ "loss": 2.0097,
47
+ "step": 5
48
+ },
49
+ {
50
+ "epoch": 1.5,
51
+ "learning_rate": 9.866666666666668e-05,
52
+ "loss": 1.9721,
53
+ "step": 6
54
+ },
55
+ {
56
+ "epoch": 1.75,
57
+ "learning_rate": 9.733333333333335e-05,
58
+ "loss": 2.0311,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 2.0,
63
+ "learning_rate": 9.6e-05,
64
+ "loss": 1.7905,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 2.0,
69
+ "eval_loss": 1.9012010097503662,
70
+ "eval_runtime": 32.3638,
71
+ "eval_samples_per_second": 3.708,
72
+ "eval_steps_per_second": 0.463,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 2.25,
77
+ "learning_rate": 9.466666666666667e-05,
78
+ "loss": 1.9509,
79
+ "step": 9
80
+ },
81
+ {
82
+ "epoch": 2.5,
83
+ "learning_rate": 9.333333333333334e-05,
84
+ "loss": 1.8188,
85
+ "step": 10
86
+ },
87
+ {
88
+ "epoch": 2.75,
89
+ "learning_rate": 9.200000000000001e-05,
90
+ "loss": 1.537,
91
+ "step": 11
92
+ },
93
+ {
94
+ "epoch": 3.0,
95
+ "learning_rate": 9.066666666666667e-05,
96
+ "loss": 1.4229,
97
+ "step": 12
98
+ },
99
+ {
100
+ "epoch": 3.0,
101
+ "eval_loss": 1.7686142921447754,
102
+ "eval_runtime": 32.3865,
103
+ "eval_samples_per_second": 3.705,
104
+ "eval_steps_per_second": 0.463,
105
+ "step": 12
106
+ },
107
+ {
108
+ "epoch": 3.25,
109
+ "learning_rate": 8.933333333333334e-05,
110
+ "loss": 1.4918,
111
+ "step": 13
112
+ },
113
+ {
114
+ "epoch": 3.5,
115
+ "learning_rate": 8.800000000000001e-05,
116
+ "loss": 1.4519,
117
+ "step": 14
118
+ },
119
+ {
120
+ "epoch": 3.75,
121
+ "learning_rate": 8.666666666666667e-05,
122
+ "loss": 1.5143,
123
+ "step": 15
124
+ },
125
+ {
126
+ "epoch": 4.0,
127
+ "learning_rate": 8.533333333333334e-05,
128
+ "loss": 1.3834,
129
+ "step": 16
130
+ },
131
+ {
132
+ "epoch": 4.0,
133
+ "eval_loss": 1.7070213556289673,
134
+ "eval_runtime": 32.3609,
135
+ "eval_samples_per_second": 3.708,
136
+ "eval_steps_per_second": 0.464,
137
+ "step": 16
138
+ },
139
+ {
140
+ "epoch": 4.25,
141
+ "learning_rate": 8.4e-05,
142
+ "loss": 1.1704,
143
+ "step": 17
144
+ },
145
+ {
146
+ "epoch": 4.5,
147
+ "learning_rate": 8.266666666666667e-05,
148
+ "loss": 1.4072,
149
+ "step": 18
150
+ },
151
+ {
152
+ "epoch": 4.75,
153
+ "learning_rate": 8.133333333333334e-05,
154
+ "loss": 1.0509,
155
+ "step": 19
156
+ },
157
+ {
158
+ "epoch": 5.0,
159
+ "learning_rate": 8e-05,
160
+ "loss": 1.4421,
161
+ "step": 20
162
+ },
163
+ {
164
+ "epoch": 5.0,
165
+ "eval_loss": 1.730824589729309,
166
+ "eval_runtime": 32.3413,
167
+ "eval_samples_per_second": 3.71,
168
+ "eval_steps_per_second": 0.464,
169
+ "step": 20
170
+ },
171
+ {
172
+ "epoch": 5.25,
173
+ "learning_rate": 7.866666666666666e-05,
174
+ "loss": 1.1629,
175
+ "step": 21
176
+ },
177
+ {
178
+ "epoch": 5.5,
179
+ "learning_rate": 7.733333333333333e-05,
180
+ "loss": 1.0278,
181
+ "step": 22
182
+ },
183
+ {
184
+ "epoch": 5.75,
185
+ "learning_rate": 7.6e-05,
186
+ "loss": 1.2421,
187
+ "step": 23
188
+ },
189
+ {
190
+ "epoch": 6.0,
191
+ "learning_rate": 7.466666666666667e-05,
192
+ "loss": 0.9003,
193
+ "step": 24
194
+ },
195
+ {
196
+ "epoch": 6.0,
197
+ "eval_loss": 1.764611005783081,
198
+ "eval_runtime": 32.3167,
199
+ "eval_samples_per_second": 3.713,
200
+ "eval_steps_per_second": 0.464,
201
+ "step": 24
202
+ },
203
+ {
204
+ "epoch": 6.25,
205
+ "learning_rate": 7.333333333333333e-05,
206
+ "loss": 0.9472,
207
+ "step": 25
208
+ },
209
+ {
210
+ "epoch": 6.5,
211
+ "learning_rate": 7.2e-05,
212
+ "loss": 0.8213,
213
+ "step": 26
214
+ },
215
+ {
216
+ "epoch": 6.75,
217
+ "learning_rate": 7.066666666666667e-05,
218
+ "loss": 0.9445,
219
+ "step": 27
220
+ },
221
+ {
222
+ "epoch": 7.0,
223
+ "learning_rate": 6.933333333333334e-05,
224
+ "loss": 0.7013,
225
+ "step": 28
226
+ },
227
+ {
228
+ "epoch": 7.0,
229
+ "eval_loss": 1.9069510698318481,
230
+ "eval_runtime": 32.3189,
231
+ "eval_samples_per_second": 3.713,
232
+ "eval_steps_per_second": 0.464,
233
+ "step": 28
234
+ },
235
+ {
236
+ "epoch": 7.25,
237
+ "learning_rate": 6.800000000000001e-05,
238
+ "loss": 0.6424,
239
+ "step": 29
240
+ },
241
+ {
242
+ "epoch": 7.5,
243
+ "learning_rate": 6.666666666666667e-05,
244
+ "loss": 0.6485,
245
+ "step": 30
246
+ },
247
+ {
248
+ "epoch": 7.75,
249
+ "learning_rate": 6.533333333333334e-05,
250
+ "loss": 0.5505,
251
+ "step": 31
252
+ },
253
+ {
254
+ "epoch": 8.0,
255
+ "learning_rate": 6.400000000000001e-05,
256
+ "loss": 0.6291,
257
+ "step": 32
258
+ },
259
+ {
260
+ "epoch": 8.0,
261
+ "eval_loss": 2.0077626705169678,
262
+ "eval_runtime": 32.3146,
263
+ "eval_samples_per_second": 3.713,
264
+ "eval_steps_per_second": 0.464,
265
+ "step": 32
266
+ },
267
+ {
268
+ "epoch": 8.25,
269
+ "learning_rate": 6.266666666666667e-05,
270
+ "loss": 0.3493,
271
+ "step": 33
272
+ },
273
+ {
274
+ "epoch": 8.5,
275
+ "learning_rate": 6.133333333333334e-05,
276
+ "loss": 0.53,
277
+ "step": 34
278
+ },
279
+ {
280
+ "epoch": 8.75,
281
+ "learning_rate": 6e-05,
282
+ "loss": 0.3821,
283
+ "step": 35
284
+ },
285
+ {
286
+ "epoch": 9.0,
287
+ "learning_rate": 5.866666666666667e-05,
288
+ "loss": 0.3314,
289
+ "step": 36
290
+ },
291
+ {
292
+ "epoch": 9.0,
293
+ "eval_loss": 2.268174886703491,
294
+ "eval_runtime": 32.301,
295
+ "eval_samples_per_second": 3.715,
296
+ "eval_steps_per_second": 0.464,
297
+ "step": 36
298
+ },
299
+ {
300
+ "epoch": 9.25,
301
+ "learning_rate": 5.7333333333333336e-05,
302
+ "loss": 0.3121,
303
+ "step": 37
304
+ },
305
+ {
306
+ "epoch": 9.5,
307
+ "learning_rate": 5.6000000000000006e-05,
308
+ "loss": 0.1531,
309
+ "step": 38
310
+ },
311
+ {
312
+ "epoch": 9.75,
313
+ "learning_rate": 5.466666666666666e-05,
314
+ "loss": 0.2633,
315
+ "step": 39
316
+ },
317
+ {
318
+ "epoch": 10.0,
319
+ "learning_rate": 5.333333333333333e-05,
320
+ "loss": 0.1554,
321
+ "step": 40
322
+ },
323
+ {
324
+ "epoch": 10.0,
325
+ "eval_loss": 2.3624258041381836,
326
+ "eval_runtime": 32.3025,
327
+ "eval_samples_per_second": 3.715,
328
+ "eval_steps_per_second": 0.464,
329
+ "step": 40
330
+ },
331
+ {
332
+ "epoch": 10.25,
333
+ "learning_rate": 5.2000000000000004e-05,
334
+ "loss": 0.0969,
335
+ "step": 41
336
+ },
337
+ {
338
+ "epoch": 10.5,
339
+ "learning_rate": 5.0666666666666674e-05,
340
+ "loss": 0.1729,
341
+ "step": 42
342
+ },
343
+ {
344
+ "epoch": 10.75,
345
+ "learning_rate": 4.933333333333334e-05,
346
+ "loss": 0.1003,
347
+ "step": 43
348
+ },
349
+ {
350
+ "epoch": 11.0,
351
+ "learning_rate": 4.8e-05,
352
+ "loss": 0.0814,
353
+ "step": 44
354
+ },
355
+ {
356
+ "epoch": 11.0,
357
+ "eval_loss": 2.6522703170776367,
358
+ "eval_runtime": 32.3054,
359
+ "eval_samples_per_second": 3.715,
360
+ "eval_steps_per_second": 0.464,
361
+ "step": 44
362
+ },
363
+ {
364
+ "epoch": 11.25,
365
+ "learning_rate": 4.666666666666667e-05,
366
+ "loss": 0.0623,
367
+ "step": 45
368
+ },
369
+ {
370
+ "epoch": 11.5,
371
+ "learning_rate": 4.5333333333333335e-05,
372
+ "loss": 0.0566,
373
+ "step": 46
374
+ },
375
+ {
376
+ "epoch": 11.75,
377
+ "learning_rate": 4.4000000000000006e-05,
378
+ "loss": 0.0357,
379
+ "step": 47
380
+ },
381
+ {
382
+ "epoch": 12.0,
383
+ "learning_rate": 4.266666666666667e-05,
384
+ "loss": 0.0499,
385
+ "step": 48
386
+ },
387
+ {
388
+ "epoch": 12.0,
389
+ "eval_loss": 2.7565455436706543,
390
+ "eval_runtime": 32.2967,
391
+ "eval_samples_per_second": 3.716,
392
+ "eval_steps_per_second": 0.464,
393
+ "step": 48
394
+ },
395
+ {
396
+ "epoch": 12.25,
397
+ "learning_rate": 4.133333333333333e-05,
398
+ "loss": 0.0298,
399
+ "step": 49
400
+ },
401
+ {
402
+ "epoch": 12.5,
403
+ "learning_rate": 4e-05,
404
+ "loss": 0.036,
405
+ "step": 50
406
+ },
407
+ {
408
+ "epoch": 12.75,
409
+ "learning_rate": 3.866666666666667e-05,
410
+ "loss": 0.0269,
411
+ "step": 51
412
+ },
413
+ {
414
+ "epoch": 13.0,
415
+ "learning_rate": 3.733333333333334e-05,
416
+ "loss": 0.0216,
417
+ "step": 52
418
+ },
419
+ {
420
+ "epoch": 13.0,
421
+ "eval_loss": 2.8505098819732666,
422
+ "eval_runtime": 32.3048,
423
+ "eval_samples_per_second": 3.715,
424
+ "eval_steps_per_second": 0.464,
425
+ "step": 52
426
+ },
427
+ {
428
+ "epoch": 13.25,
429
+ "learning_rate": 3.6e-05,
430
+ "loss": 0.0184,
431
+ "step": 53
432
+ },
433
+ {
434
+ "epoch": 13.5,
435
+ "learning_rate": 3.466666666666667e-05,
436
+ "loss": 0.0254,
437
+ "step": 54
438
+ },
439
+ {
440
+ "epoch": 13.75,
441
+ "learning_rate": 3.3333333333333335e-05,
442
+ "loss": 0.022,
443
+ "step": 55
444
+ },
445
+ {
446
+ "epoch": 14.0,
447
+ "learning_rate": 3.2000000000000005e-05,
448
+ "loss": 0.0197,
449
+ "step": 56
450
+ },
451
+ {
452
+ "epoch": 14.0,
453
+ "eval_loss": 2.9169647693634033,
454
+ "eval_runtime": 32.2965,
455
+ "eval_samples_per_second": 3.716,
456
+ "eval_steps_per_second": 0.464,
457
+ "step": 56
458
+ },
459
+ {
460
+ "epoch": 14.25,
461
+ "learning_rate": 3.066666666666667e-05,
462
+ "loss": 0.0173,
463
+ "step": 57
464
+ },
465
+ {
466
+ "epoch": 14.5,
467
+ "learning_rate": 2.9333333333333336e-05,
468
+ "loss": 0.0153,
469
+ "step": 58
470
+ },
471
+ {
472
+ "epoch": 14.75,
473
+ "learning_rate": 2.8000000000000003e-05,
474
+ "loss": 0.016,
475
+ "step": 59
476
+ },
477
+ {
478
+ "epoch": 15.0,
479
+ "learning_rate": 2.6666666666666667e-05,
480
+ "loss": 0.0174,
481
+ "step": 60
482
+ },
483
+ {
484
+ "epoch": 15.0,
485
+ "eval_loss": 2.943286657333374,
486
+ "eval_runtime": 32.2924,
487
+ "eval_samples_per_second": 3.716,
488
+ "eval_steps_per_second": 0.465,
489
+ "step": 60
490
+ },
491
+ {
492
+ "epoch": 15.25,
493
+ "learning_rate": 2.5333333333333337e-05,
494
+ "loss": 0.0136,
495
+ "step": 61
496
+ },
497
+ {
498
+ "epoch": 15.5,
499
+ "learning_rate": 2.4e-05,
500
+ "loss": 0.017,
501
+ "step": 62
502
+ },
503
+ {
504
+ "epoch": 15.75,
505
+ "learning_rate": 2.2666666666666668e-05,
506
+ "loss": 0.0146,
507
+ "step": 63
508
+ },
509
+ {
510
+ "epoch": 16.0,
511
+ "learning_rate": 2.1333333333333335e-05,
512
+ "loss": 0.0174,
513
+ "step": 64
514
+ },
515
+ {
516
+ "epoch": 16.0,
517
+ "eval_loss": 2.968252420425415,
518
+ "eval_runtime": 32.2957,
519
+ "eval_samples_per_second": 3.716,
520
+ "eval_steps_per_second": 0.464,
521
+ "step": 64
522
+ },
523
+ {
524
+ "epoch": 16.25,
525
+ "learning_rate": 2e-05,
526
+ "loss": 0.0147,
527
+ "step": 65
528
+ },
529
+ {
530
+ "epoch": 16.5,
531
+ "learning_rate": 1.866666666666667e-05,
532
+ "loss": 0.014,
533
+ "step": 66
534
+ },
535
+ {
536
+ "epoch": 16.75,
537
+ "learning_rate": 1.7333333333333336e-05,
538
+ "loss": 0.0145,
539
+ "step": 67
540
+ },
541
+ {
542
+ "epoch": 17.0,
543
+ "learning_rate": 1.6000000000000003e-05,
544
+ "loss": 0.0145,
545
+ "step": 68
546
+ },
547
+ {
548
+ "epoch": 17.0,
549
+ "eval_loss": 2.9966158866882324,
550
+ "eval_runtime": 32.3008,
551
+ "eval_samples_per_second": 3.715,
552
+ "eval_steps_per_second": 0.464,
553
+ "step": 68
554
+ },
555
+ {
556
+ "epoch": 17.25,
557
+ "learning_rate": 1.4666666666666668e-05,
558
+ "loss": 0.0139,
559
+ "step": 69
560
+ },
561
+ {
562
+ "epoch": 17.5,
563
+ "learning_rate": 1.3333333333333333e-05,
564
+ "loss": 0.0121,
565
+ "step": 70
566
+ },
567
+ {
568
+ "epoch": 17.75,
569
+ "learning_rate": 1.2e-05,
570
+ "loss": 0.0139,
571
+ "step": 71
572
+ },
573
+ {
574
+ "epoch": 18.0,
575
+ "learning_rate": 1.0666666666666667e-05,
576
+ "loss": 0.013,
577
+ "step": 72
578
+ },
579
+ {
580
+ "epoch": 18.0,
581
+ "eval_loss": 3.0193371772766113,
582
+ "eval_runtime": 32.2982,
583
+ "eval_samples_per_second": 3.715,
584
+ "eval_steps_per_second": 0.464,
585
+ "step": 72
586
+ },
587
+ {
588
+ "epoch": 18.25,
589
+ "learning_rate": 9.333333333333334e-06,
590
+ "loss": 0.0132,
591
+ "step": 73
592
+ },
593
+ {
594
+ "epoch": 18.5,
595
+ "learning_rate": 8.000000000000001e-06,
596
+ "loss": 0.0127,
597
+ "step": 74
598
+ },
599
+ {
600
+ "epoch": 18.75,
601
+ "learning_rate": 6.666666666666667e-06,
602
+ "loss": 0.0134,
603
+ "step": 75
604
+ },
605
+ {
606
+ "epoch": 19.0,
607
+ "learning_rate": 5.333333333333334e-06,
608
+ "loss": 0.0151,
609
+ "step": 76
610
+ },
611
+ {
612
+ "epoch": 19.0,
613
+ "eval_loss": 3.027658700942993,
614
+ "eval_runtime": 32.2971,
615
+ "eval_samples_per_second": 3.716,
616
+ "eval_steps_per_second": 0.464,
617
+ "step": 76
618
+ },
619
+ {
620
+ "epoch": 19.25,
621
+ "learning_rate": 4.000000000000001e-06,
622
+ "loss": 0.0126,
623
+ "step": 77
624
+ },
625
+ {
626
+ "epoch": 19.5,
627
+ "learning_rate": 2.666666666666667e-06,
628
+ "loss": 0.014,
629
+ "step": 78
630
+ },
631
+ {
632
+ "epoch": 19.75,
633
+ "learning_rate": 1.3333333333333334e-06,
634
+ "loss": 0.0113,
635
+ "step": 79
636
+ },
637
+ {
638
+ "epoch": 20.0,
639
+ "learning_rate": 0.0,
640
+ "loss": 0.0145,
641
+ "step": 80
642
+ },
643
+ {
644
+ "epoch": 20.0,
645
+ "eval_loss": 3.0302772521972656,
646
+ "eval_runtime": 32.2941,
647
+ "eval_samples_per_second": 3.716,
648
+ "eval_steps_per_second": 0.464,
649
+ "step": 80
650
+ },
651
+ {
652
+ "epoch": 20.0,
653
+ "step": 80,
654
+ "total_flos": 8972287420661760.0,
655
+ "train_loss": 0.6028558738762513,
656
+ "train_runtime": 922.3939,
657
+ "train_samples_per_second": 0.347,
658
+ "train_steps_per_second": 0.087
659
+ }
660
+ ],
661
+ "logging_steps": 1,
662
+ "max_steps": 80,
663
+ "num_train_epochs": 20,
664
+ "save_steps": 500,
665
+ "total_flos": 8972287420661760.0,
666
+ "trial_name": null,
667
+ "trial_params": null
668
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50b0fda92390d96fa1802370f8c9e81c35e9b4ff74e74c9689a50be3e815c264
3
+ size 4600