tyzhu committed on
Commit
5d5ef3b
1 Parent(s): 78ec215

End of training

Browse files
Files changed (6) hide show
  1. README.md +14 -2
  2. all_results.json +16 -0
  3. eval_results.json +10 -0
  4. tokenizer.json +1 -6
  5. train_results.json +9 -0
  6. trainer_state.json +701 -0
README.md CHANGED
@@ -3,11 +3,23 @@ license: other
3
  base_model: Qwen/Qwen1.5-4B
4
  tags:
5
  - generated_from_trainer
 
 
6
  metrics:
7
  - accuracy
8
  model-index:
9
  - name: lmind_hotpot_train8000_eval7405_v1_docidx_Qwen_Qwen1.5-4B_lora2
10
- results: []
 
 
 
 
 
 
 
 
 
 
11
  library_name: peft
12
  ---
13
 
@@ -16,7 +28,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # lmind_hotpot_train8000_eval7405_v1_docidx_Qwen_Qwen1.5-4B_lora2
18
 
19
- This model is a fine-tuned version of [Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: 1.0623
22
  - Accuracy: 0.7692
 
3
  base_model: Qwen/Qwen1.5-4B
4
  tags:
5
  - generated_from_trainer
6
+ datasets:
7
+ - tyzhu/lmind_hotpot_train8000_eval7405_v1_docidx
8
  metrics:
9
  - accuracy
10
  model-index:
11
  - name: lmind_hotpot_train8000_eval7405_v1_docidx_Qwen_Qwen1.5-4B_lora2
12
+ results:
13
+ - task:
14
+ name: Causal Language Modeling
15
+ type: text-generation
16
+ dataset:
17
+ name: tyzhu/lmind_hotpot_train8000_eval7405_v1_docidx
18
+ type: tyzhu/lmind_hotpot_train8000_eval7405_v1_docidx
19
+ metrics:
20
+ - name: Accuracy
21
+ type: accuracy
22
+ value: 0.7691922246220302
23
  library_name: peft
24
  ---
25
 
 
28
 
29
  # lmind_hotpot_train8000_eval7405_v1_docidx_Qwen_Qwen1.5-4B_lora2
30
 
31
+ This model is a fine-tuned version of [Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B) on the tyzhu/lmind_hotpot_train8000_eval7405_v1_docidx dataset.
32
  It achieves the following results on the evaluation set:
33
  - Loss: 1.0623
34
  - Accuracy: 0.7692
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.997021149836163,
3
+ "eval_accuracy": 0.7691922246220302,
4
+ "eval_loss": 1.0623269081115723,
5
+ "eval_runtime": 7.649,
6
+ "eval_samples": 500,
7
+ "eval_samples_per_second": 65.368,
8
+ "eval_steps_per_second": 8.236,
9
+ "perplexity": 2.8930951295301637,
10
+ "total_flos": 6.866381543623885e+17,
11
+ "train_loss": 1.1092717030903723,
12
+ "train_runtime": 19337.1025,
13
+ "train_samples": 26854,
14
+ "train_samples_per_second": 13.887,
15
+ "train_steps_per_second": 0.434
16
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.997021149836163,
3
+ "eval_accuracy": 0.7691922246220302,
4
+ "eval_loss": 1.0623269081115723,
5
+ "eval_runtime": 7.649,
6
+ "eval_samples": 500,
7
+ "eval_samples_per_second": 65.368,
8
+ "eval_steps_per_second": 8.236,
9
+ "perplexity": 2.8930951295301637
10
+ }
tokenizer.json CHANGED
@@ -1,11 +1,6 @@
1
  {
2
  "version": "1.0",
3
- "truncation": {
4
- "direction": "Right",
5
- "max_length": 1024,
6
- "strategy": "LongestFirst",
7
- "stride": 0
8
- },
9
  "padding": null,
10
  "added_tokens": [
11
  {
 
1
  {
2
  "version": "1.0",
3
+ "truncation": null,
 
 
 
 
 
4
  "padding": null,
5
  "added_tokens": [
6
  {
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.997021149836163,
3
+ "total_flos": 6.866381543623885e+17,
4
+ "train_loss": 1.1092717030903723,
5
+ "train_runtime": 19337.1025,
6
+ "train_samples": 26854,
7
+ "train_samples_per_second": 13.887,
8
+ "train_steps_per_second": 0.434
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,701 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.997021149836163,
5
+ "eval_steps": 500,
6
+ "global_step": 8390,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.11915400655347036,
13
+ "grad_norm": 0.24564065039157867,
14
+ "learning_rate": 0.0001,
15
+ "loss": 1.6916,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.23830801310694072,
20
+ "grad_norm": 0.2549617886543274,
21
+ "learning_rate": 0.0001,
22
+ "loss": 1.6213,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.3574620196604111,
27
+ "grad_norm": 0.26466843485832214,
28
+ "learning_rate": 0.0001,
29
+ "loss": 1.6443,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.47661602621388144,
34
+ "grad_norm": 0.2397240847349167,
35
+ "learning_rate": 0.0001,
36
+ "loss": 1.6157,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.5957700327673519,
41
+ "grad_norm": 0.24300818145275116,
42
+ "learning_rate": 0.0001,
43
+ "loss": 1.6087,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.7149240393208222,
48
+ "grad_norm": 0.2128152847290039,
49
+ "learning_rate": 0.0001,
50
+ "loss": 1.6163,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.8340780458742926,
55
+ "grad_norm": 0.24853849411010742,
56
+ "learning_rate": 0.0001,
57
+ "loss": 1.609,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.9532320524277629,
62
+ "grad_norm": 0.26104700565338135,
63
+ "learning_rate": 0.0001,
64
+ "loss": 1.6067,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.9997021149836163,
69
+ "eval_accuracy": 0.7197408207343412,
70
+ "eval_loss": 1.8439931869506836,
71
+ "eval_runtime": 7.8863,
72
+ "eval_samples_per_second": 63.401,
73
+ "eval_steps_per_second": 7.989,
74
+ "step": 839
75
+ },
76
+ {
77
+ "epoch": 1.0723860589812333,
78
+ "grad_norm": 0.26654207706451416,
79
+ "learning_rate": 0.0001,
80
+ "loss": 1.5707,
81
+ "step": 900
82
+ },
83
+ {
84
+ "epoch": 1.1915400655347037,
85
+ "grad_norm": 0.3084248900413513,
86
+ "learning_rate": 0.0001,
87
+ "loss": 1.546,
88
+ "step": 1000
89
+ },
90
+ {
91
+ "epoch": 1.310694072088174,
92
+ "grad_norm": 0.35963907837867737,
93
+ "learning_rate": 0.0001,
94
+ "loss": 1.5456,
95
+ "step": 1100
96
+ },
97
+ {
98
+ "epoch": 1.4298480786416443,
99
+ "grad_norm": 0.32188501954078674,
100
+ "learning_rate": 0.0001,
101
+ "loss": 1.5333,
102
+ "step": 1200
103
+ },
104
+ {
105
+ "epoch": 1.5490020851951147,
106
+ "grad_norm": 0.32053473591804504,
107
+ "learning_rate": 0.0001,
108
+ "loss": 1.5506,
109
+ "step": 1300
110
+ },
111
+ {
112
+ "epoch": 1.668156091748585,
113
+ "grad_norm": 0.35408514738082886,
114
+ "learning_rate": 0.0001,
115
+ "loss": 1.5256,
116
+ "step": 1400
117
+ },
118
+ {
119
+ "epoch": 1.7873100983020556,
120
+ "grad_norm": 0.3224356770515442,
121
+ "learning_rate": 0.0001,
122
+ "loss": 1.5349,
123
+ "step": 1500
124
+ },
125
+ {
126
+ "epoch": 1.9064641048555258,
127
+ "grad_norm": 0.3134737014770508,
128
+ "learning_rate": 0.0001,
129
+ "loss": 1.5433,
130
+ "step": 1600
131
+ },
132
+ {
133
+ "epoch": 1.9994042299672325,
134
+ "eval_accuracy": 0.7246911447084233,
135
+ "eval_loss": 1.7660729885101318,
136
+ "eval_runtime": 7.8239,
137
+ "eval_samples_per_second": 63.907,
138
+ "eval_steps_per_second": 8.052,
139
+ "step": 1678
140
+ },
141
+ {
142
+ "epoch": 2.025618111408996,
143
+ "grad_norm": 0.32698601484298706,
144
+ "learning_rate": 0.0001,
145
+ "loss": 1.5237,
146
+ "step": 1700
147
+ },
148
+ {
149
+ "epoch": 2.1447721179624666,
150
+ "grad_norm": 0.4822945296764374,
151
+ "learning_rate": 0.0001,
152
+ "loss": 1.4085,
153
+ "step": 1800
154
+ },
155
+ {
156
+ "epoch": 2.2639261245159368,
157
+ "grad_norm": 0.43997159600257874,
158
+ "learning_rate": 0.0001,
159
+ "loss": 1.4227,
160
+ "step": 1900
161
+ },
162
+ {
163
+ "epoch": 2.3830801310694074,
164
+ "grad_norm": 0.4469866156578064,
165
+ "learning_rate": 0.0001,
166
+ "loss": 1.4325,
167
+ "step": 2000
168
+ },
169
+ {
170
+ "epoch": 2.5022341376228776,
171
+ "grad_norm": 0.44795843958854675,
172
+ "learning_rate": 0.0001,
173
+ "loss": 1.4266,
174
+ "step": 2100
175
+ },
176
+ {
177
+ "epoch": 2.621388144176348,
178
+ "grad_norm": 0.41753217577934265,
179
+ "learning_rate": 0.0001,
180
+ "loss": 1.4235,
181
+ "step": 2200
182
+ },
183
+ {
184
+ "epoch": 2.7405421507298184,
185
+ "grad_norm": 0.45843449234962463,
186
+ "learning_rate": 0.0001,
187
+ "loss": 1.4123,
188
+ "step": 2300
189
+ },
190
+ {
191
+ "epoch": 2.8596961572832886,
192
+ "grad_norm": 0.5280742049217224,
193
+ "learning_rate": 0.0001,
194
+ "loss": 1.4162,
195
+ "step": 2400
196
+ },
197
+ {
198
+ "epoch": 2.978850163836759,
199
+ "grad_norm": 0.5003345012664795,
200
+ "learning_rate": 0.0001,
201
+ "loss": 1.4167,
202
+ "step": 2500
203
+ },
204
+ {
205
+ "epoch": 2.999106344950849,
206
+ "eval_accuracy": 0.7309719222462203,
207
+ "eval_loss": 1.6454776525497437,
208
+ "eval_runtime": 7.8863,
209
+ "eval_samples_per_second": 63.401,
210
+ "eval_steps_per_second": 7.989,
211
+ "step": 2517
212
+ },
213
+ {
214
+ "epoch": 3.0980041703902295,
215
+ "grad_norm": 0.5184714794158936,
216
+ "learning_rate": 0.0001,
217
+ "loss": 1.3029,
218
+ "step": 2600
219
+ },
220
+ {
221
+ "epoch": 3.2171581769436997,
222
+ "grad_norm": 0.611228883266449,
223
+ "learning_rate": 0.0001,
224
+ "loss": 1.2788,
225
+ "step": 2700
226
+ },
227
+ {
228
+ "epoch": 3.33631218349717,
229
+ "grad_norm": 0.538593590259552,
230
+ "learning_rate": 0.0001,
231
+ "loss": 1.2949,
232
+ "step": 2800
233
+ },
234
+ {
235
+ "epoch": 3.4554661900506405,
236
+ "grad_norm": 0.5769683122634888,
237
+ "learning_rate": 0.0001,
238
+ "loss": 1.2816,
239
+ "step": 2900
240
+ },
241
+ {
242
+ "epoch": 3.5746201966041107,
243
+ "grad_norm": 0.541022002696991,
244
+ "learning_rate": 0.0001,
245
+ "loss": 1.2787,
246
+ "step": 3000
247
+ },
248
+ {
249
+ "epoch": 3.6937742031575813,
250
+ "grad_norm": 0.5838562250137329,
251
+ "learning_rate": 0.0001,
252
+ "loss": 1.299,
253
+ "step": 3100
254
+ },
255
+ {
256
+ "epoch": 3.8129282097110515,
257
+ "grad_norm": 0.583423912525177,
258
+ "learning_rate": 0.0001,
259
+ "loss": 1.2967,
260
+ "step": 3200
261
+ },
262
+ {
263
+ "epoch": 3.932082216264522,
264
+ "grad_norm": 0.5817753672599792,
265
+ "learning_rate": 0.0001,
266
+ "loss": 1.2948,
267
+ "step": 3300
268
+ },
269
+ {
270
+ "epoch": 4.0,
271
+ "eval_accuracy": 0.736622030237581,
272
+ "eval_loss": 1.5393506288528442,
273
+ "eval_runtime": 7.705,
274
+ "eval_samples_per_second": 64.893,
275
+ "eval_steps_per_second": 8.177,
276
+ "step": 3357
277
+ },
278
+ {
279
+ "epoch": 4.051236222817992,
280
+ "grad_norm": 0.661818265914917,
281
+ "learning_rate": 0.0001,
282
+ "loss": 1.2282,
283
+ "step": 3400
284
+ },
285
+ {
286
+ "epoch": 4.1703902293714625,
287
+ "grad_norm": 0.6072232723236084,
288
+ "learning_rate": 0.0001,
289
+ "loss": 1.1362,
290
+ "step": 3500
291
+ },
292
+ {
293
+ "epoch": 4.289544235924933,
294
+ "grad_norm": 0.6120555996894836,
295
+ "learning_rate": 0.0001,
296
+ "loss": 1.1453,
297
+ "step": 3600
298
+ },
299
+ {
300
+ "epoch": 4.408698242478403,
301
+ "grad_norm": 0.698825478553772,
302
+ "learning_rate": 0.0001,
303
+ "loss": 1.1467,
304
+ "step": 3700
305
+ },
306
+ {
307
+ "epoch": 4.5278522490318736,
308
+ "grad_norm": 0.6432230472564697,
309
+ "learning_rate": 0.0001,
310
+ "loss": 1.1593,
311
+ "step": 3800
312
+ },
313
+ {
314
+ "epoch": 4.647006255585344,
315
+ "grad_norm": 0.9182979464530945,
316
+ "learning_rate": 0.0001,
317
+ "loss": 1.1709,
318
+ "step": 3900
319
+ },
320
+ {
321
+ "epoch": 4.766160262138815,
322
+ "grad_norm": 0.6992977857589722,
323
+ "learning_rate": 0.0001,
324
+ "loss": 1.1674,
325
+ "step": 4000
326
+ },
327
+ {
328
+ "epoch": 4.885314268692285,
329
+ "grad_norm": 0.6830582022666931,
330
+ "learning_rate": 0.0001,
331
+ "loss": 1.1715,
332
+ "step": 4100
333
+ },
334
+ {
335
+ "epoch": 4.9997021149836165,
336
+ "eval_accuracy": 0.7422375809935206,
337
+ "eval_loss": 1.4463233947753906,
338
+ "eval_runtime": 7.7567,
339
+ "eval_samples_per_second": 64.461,
340
+ "eval_steps_per_second": 8.122,
341
+ "step": 4196
342
+ },
343
+ {
344
+ "epoch": 5.004468275245755,
345
+ "grad_norm": 0.5924903750419617,
346
+ "learning_rate": 0.0001,
347
+ "loss": 1.1816,
348
+ "step": 4200
349
+ },
350
+ {
351
+ "epoch": 5.123622281799226,
352
+ "grad_norm": 0.8615913987159729,
353
+ "learning_rate": 0.0001,
354
+ "loss": 1.0065,
355
+ "step": 4300
356
+ },
357
+ {
358
+ "epoch": 5.242776288352696,
359
+ "grad_norm": 0.8048379421234131,
360
+ "learning_rate": 0.0001,
361
+ "loss": 1.0146,
362
+ "step": 4400
363
+ },
364
+ {
365
+ "epoch": 5.361930294906166,
366
+ "grad_norm": 0.7855103015899658,
367
+ "learning_rate": 0.0001,
368
+ "loss": 1.0196,
369
+ "step": 4500
370
+ },
371
+ {
372
+ "epoch": 5.481084301459637,
373
+ "grad_norm": 0.8682273626327515,
374
+ "learning_rate": 0.0001,
375
+ "loss": 1.0334,
376
+ "step": 4600
377
+ },
378
+ {
379
+ "epoch": 5.600238308013107,
380
+ "grad_norm": 0.8534315228462219,
381
+ "learning_rate": 0.0001,
382
+ "loss": 1.0415,
383
+ "step": 4700
384
+ },
385
+ {
386
+ "epoch": 5.719392314566577,
387
+ "grad_norm": 0.7905874252319336,
388
+ "learning_rate": 0.0001,
389
+ "loss": 1.0472,
390
+ "step": 4800
391
+ },
392
+ {
393
+ "epoch": 5.838546321120048,
394
+ "grad_norm": 0.7829225659370422,
395
+ "learning_rate": 0.0001,
396
+ "loss": 1.0353,
397
+ "step": 4900
398
+ },
399
+ {
400
+ "epoch": 5.957700327673518,
401
+ "grad_norm": 0.7980929613113403,
402
+ "learning_rate": 0.0001,
403
+ "loss": 1.0458,
404
+ "step": 5000
405
+ },
406
+ {
407
+ "epoch": 5.999404229967233,
408
+ "eval_accuracy": 0.7483671706263499,
409
+ "eval_loss": 1.353654384613037,
410
+ "eval_runtime": 7.9987,
411
+ "eval_samples_per_second": 62.51,
412
+ "eval_steps_per_second": 7.876,
413
+ "step": 5035
414
+ },
415
+ {
416
+ "epoch": 6.076854334226988,
417
+ "grad_norm": 0.7904637455940247,
418
+ "learning_rate": 0.0001,
419
+ "loss": 0.9339,
420
+ "step": 5100
421
+ },
422
+ {
423
+ "epoch": 6.196008340780459,
424
+ "grad_norm": 1.046057105064392,
425
+ "learning_rate": 0.0001,
426
+ "loss": 0.8833,
427
+ "step": 5200
428
+ },
429
+ {
430
+ "epoch": 6.31516234733393,
431
+ "grad_norm": 0.8678649663925171,
432
+ "learning_rate": 0.0001,
433
+ "loss": 0.8899,
434
+ "step": 5300
435
+ },
436
+ {
437
+ "epoch": 6.434316353887399,
438
+ "grad_norm": 0.9677824378013611,
439
+ "learning_rate": 0.0001,
440
+ "loss": 0.9211,
441
+ "step": 5400
442
+ },
443
+ {
444
+ "epoch": 6.55347036044087,
445
+ "grad_norm": 0.9737918376922607,
446
+ "learning_rate": 0.0001,
447
+ "loss": 0.9206,
448
+ "step": 5500
449
+ },
450
+ {
451
+ "epoch": 6.67262436699434,
452
+ "grad_norm": 0.8853780627250671,
453
+ "learning_rate": 0.0001,
454
+ "loss": 0.9225,
455
+ "step": 5600
456
+ },
457
+ {
458
+ "epoch": 6.79177837354781,
459
+ "grad_norm": 0.9523513913154602,
460
+ "learning_rate": 0.0001,
461
+ "loss": 0.9163,
462
+ "step": 5700
463
+ },
464
+ {
465
+ "epoch": 6.910932380101281,
466
+ "grad_norm": 0.9134466648101807,
467
+ "learning_rate": 0.0001,
468
+ "loss": 0.9357,
469
+ "step": 5800
470
+ },
471
+ {
472
+ "epoch": 6.999106344950849,
473
+ "eval_accuracy": 0.7545961123110151,
474
+ "eval_loss": 1.2455778121948242,
475
+ "eval_runtime": 7.6574,
476
+ "eval_samples_per_second": 65.296,
477
+ "eval_steps_per_second": 8.227,
478
+ "step": 5874
479
+ },
480
+ {
481
+ "epoch": 7.030086386654752,
482
+ "grad_norm": 1.020137906074524,
483
+ "learning_rate": 0.0001,
484
+ "loss": 0.8909,
485
+ "step": 5900
486
+ },
487
+ {
488
+ "epoch": 7.149240393208221,
489
+ "grad_norm": 0.9680564403533936,
490
+ "learning_rate": 0.0001,
491
+ "loss": 0.7683,
492
+ "step": 6000
493
+ },
494
+ {
495
+ "epoch": 7.268394399761692,
496
+ "grad_norm": 0.9959320425987244,
497
+ "learning_rate": 0.0001,
498
+ "loss": 0.7802,
499
+ "step": 6100
500
+ },
501
+ {
502
+ "epoch": 7.387548406315163,
503
+ "grad_norm": 1.0101680755615234,
504
+ "learning_rate": 0.0001,
505
+ "loss": 0.7758,
506
+ "step": 6200
507
+ },
508
+ {
509
+ "epoch": 7.506702412868632,
510
+ "grad_norm": 0.929568886756897,
511
+ "learning_rate": 0.0001,
512
+ "loss": 0.8111,
513
+ "step": 6300
514
+ },
515
+ {
516
+ "epoch": 7.625856419422103,
517
+ "grad_norm": 1.101192593574524,
518
+ "learning_rate": 0.0001,
519
+ "loss": 0.8047,
520
+ "step": 6400
521
+ },
522
+ {
523
+ "epoch": 7.745010425975574,
524
+ "grad_norm": 1.0534611940383911,
525
+ "learning_rate": 0.0001,
526
+ "loss": 0.8168,
527
+ "step": 6500
528
+ },
529
+ {
530
+ "epoch": 7.864164432529043,
531
+ "grad_norm": 1.08072829246521,
532
+ "learning_rate": 0.0001,
533
+ "loss": 0.8178,
534
+ "step": 6600
535
+ },
536
+ {
537
+ "epoch": 7.983318439082514,
538
+ "grad_norm": 1.2470301389694214,
539
+ "learning_rate": 0.0001,
540
+ "loss": 0.8269,
541
+ "step": 6700
542
+ },
543
+ {
544
+ "epoch": 8.0,
545
+ "eval_accuracy": 0.7598142548596112,
546
+ "eval_loss": 1.1735292673110962,
547
+ "eval_runtime": 7.6157,
548
+ "eval_samples_per_second": 65.654,
549
+ "eval_steps_per_second": 8.272,
550
+ "step": 6714
551
+ },
552
+ {
553
+ "epoch": 8.102472445635984,
554
+ "grad_norm": 1.0507800579071045,
555
+ "learning_rate": 0.0001,
556
+ "loss": 0.6826,
557
+ "step": 6800
558
+ },
559
+ {
560
+ "epoch": 8.221626452189454,
561
+ "grad_norm": 0.966494083404541,
562
+ "learning_rate": 0.0001,
563
+ "loss": 0.6672,
564
+ "step": 6900
565
+ },
566
+ {
567
+ "epoch": 8.340780458742925,
568
+ "grad_norm": 1.0983446836471558,
569
+ "learning_rate": 0.0001,
570
+ "loss": 0.6883,
571
+ "step": 7000
572
+ },
573
+ {
574
+ "epoch": 8.459934465296396,
575
+ "grad_norm": 1.1256661415100098,
576
+ "learning_rate": 0.0001,
577
+ "loss": 0.6951,
578
+ "step": 7100
579
+ },
580
+ {
581
+ "epoch": 8.579088471849866,
582
+ "grad_norm": 1.2311198711395264,
583
+ "learning_rate": 0.0001,
584
+ "loss": 0.7196,
585
+ "step": 7200
586
+ },
587
+ {
588
+ "epoch": 8.698242478403337,
589
+ "grad_norm": 1.108267068862915,
590
+ "learning_rate": 0.0001,
591
+ "loss": 0.7081,
592
+ "step": 7300
593
+ },
594
+ {
595
+ "epoch": 8.817396484956806,
596
+ "grad_norm": 1.1294405460357666,
597
+ "learning_rate": 0.0001,
598
+ "loss": 0.7131,
599
+ "step": 7400
600
+ },
601
+ {
602
+ "epoch": 8.936550491510276,
603
+ "grad_norm": 1.301544189453125,
604
+ "learning_rate": 0.0001,
605
+ "loss": 0.7262,
606
+ "step": 7500
607
+ },
608
+ {
609
+ "epoch": 8.999702114983616,
610
+ "eval_accuracy": 0.7649460043196544,
611
+ "eval_loss": 1.0966144800186157,
612
+ "eval_runtime": 7.7663,
613
+ "eval_samples_per_second": 64.381,
614
+ "eval_steps_per_second": 8.112,
615
+ "step": 7553
616
+ },
617
+ {
618
+ "epoch": 9.055704498063747,
619
+ "grad_norm": 1.3238133192062378,
620
+ "learning_rate": 0.0001,
621
+ "loss": 0.6592,
622
+ "step": 7600
623
+ },
624
+ {
625
+ "epoch": 9.174858504617218,
626
+ "grad_norm": 0.9897598028182983,
627
+ "learning_rate": 0.0001,
628
+ "loss": 0.5839,
629
+ "step": 7700
630
+ },
631
+ {
632
+ "epoch": 9.294012511170688,
633
+ "grad_norm": 1.2646971940994263,
634
+ "learning_rate": 0.0001,
635
+ "loss": 0.6008,
636
+ "step": 7800
637
+ },
638
+ {
639
+ "epoch": 9.413166517724159,
640
+ "grad_norm": 1.1842299699783325,
641
+ "learning_rate": 0.0001,
642
+ "loss": 0.6062,
643
+ "step": 7900
644
+ },
645
+ {
646
+ "epoch": 9.53232052427763,
647
+ "grad_norm": 1.2886223793029785,
648
+ "learning_rate": 0.0001,
649
+ "loss": 0.6136,
650
+ "step": 8000
651
+ },
652
+ {
653
+ "epoch": 9.651474530831099,
654
+ "grad_norm": 1.2469590902328491,
655
+ "learning_rate": 0.0001,
656
+ "loss": 0.6217,
657
+ "step": 8100
658
+ },
659
+ {
660
+ "epoch": 9.77062853738457,
661
+ "grad_norm": 1.202868103981018,
662
+ "learning_rate": 0.0001,
663
+ "loss": 0.6322,
664
+ "step": 8200
665
+ },
666
+ {
667
+ "epoch": 9.88978254393804,
668
+ "grad_norm": 1.3775478601455688,
669
+ "learning_rate": 0.0001,
670
+ "loss": 0.6381,
671
+ "step": 8300
672
+ },
673
+ {
674
+ "epoch": 9.997021149836163,
675
+ "eval_accuracy": 0.7691922246220302,
676
+ "eval_loss": 1.0623269081115723,
677
+ "eval_runtime": 7.6207,
678
+ "eval_samples_per_second": 65.611,
679
+ "eval_steps_per_second": 8.267,
680
+ "step": 8390
681
+ },
682
+ {
683
+ "epoch": 9.997021149836163,
684
+ "step": 8390,
685
+ "total_flos": 6.866381543623885e+17,
686
+ "train_loss": 1.1092717030903723,
687
+ "train_runtime": 19337.1025,
688
+ "train_samples_per_second": 13.887,
689
+ "train_steps_per_second": 0.434
690
+ }
691
+ ],
692
+ "logging_steps": 100,
693
+ "max_steps": 8390,
694
+ "num_input_tokens_seen": 0,
695
+ "num_train_epochs": 10,
696
+ "save_steps": 500,
697
+ "total_flos": 6.866381543623885e+17,
698
+ "train_batch_size": 1,
699
+ "trial_name": null,
700
+ "trial_params": null
701
+ }