robertou2 committed on
Commit
12af06e
·
verified ·
1 Parent(s): f54b544

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -199,4 +199,4 @@ Carbon emissions can be estimated using the [Machine Learning Impact calculator]
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
- - PEFT 0.12.0
 
199
  [More Information Needed]
200
  ### Framework versions
201
 
202
+ - PEFT 0.13.2
adapter_config.json CHANGED
@@ -10,20 +10,20 @@
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
- "lora_alpha": 64,
14
- "lora_dropout": 0.0001,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
- "r": 32,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "down_proj",
24
  "qkv_proj",
25
- "gate_up_proj",
26
- "o_proj"
 
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
+ "lora_alpha": 128,
14
+ "lora_dropout": 0.05,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
+ "r": 64,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "qkv_proj",
24
+ "down_proj",
25
+ "o_proj",
26
+ "gate_up_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34e8c063268c82d446f081987dac5fd9c69282ecfad89abd0570dc93517cdbc9
3
- size 201361312
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d39d4b71c2d9c958752b7019b0481033ab8d7caa096419fe04a39f1e2c03e5f
3
+ size 402688040
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0157da31bb4062434f031ff2dd7c51f693e094db4fe85815de38edaefd40b9fa
3
- size 402868986
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2214c2d7be4e7002e6b458c215e56dc3cc1231d71e76dcf574cfefeb1df1f14
3
+ size 805522170
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed70ecedcd9a62bbb04bf9838304aced41ca983de90cea5987c3cff1d4f80fe3
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96a39edec8fd0ca2c66adccb7ddca2a246727221a5cedfcaa945c37683bd0907
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f94793ec3497737749203684f2a64875f06eeb7a4781950315fb5cb4ec740a8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db20a34ad6b350b7c1ce1bf536f3e5516e15fa5f9d629c0ece20011d12bce789
3
  size 1064
trainer_state.json CHANGED
@@ -1,408 +1,235 @@
1
  {
2
- "best_metric": 0.4687739610671997,
3
- "best_model_checkpoint": "//outputs/task7_microsoft/Phi-3.5-mini-instruct/checkpoint-250",
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 250,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.1,
13
- "grad_norm": 1.4559898376464844,
14
- "learning_rate": 3.75e-06,
15
- "loss": 1.8397,
16
- "step": 5
17
- },
18
- {
19
- "epoch": 0.2,
20
- "grad_norm": 1.011980414390564,
21
- "learning_rate": 7.5e-06,
22
- "loss": 1.4061,
23
  "step": 10
24
  },
25
  {
26
- "epoch": 0.3,
27
- "grad_norm": 0.8619025945663452,
28
- "learning_rate": 1.125e-05,
29
- "loss": 1.037,
30
- "step": 15
31
- },
32
- {
33
- "epoch": 0.4,
34
- "grad_norm": 1.4890649318695068,
35
- "learning_rate": 1.5e-05,
36
- "loss": 1.2559,
37
  "step": 20
38
  },
39
  {
40
- "epoch": 0.5,
41
- "grad_norm": 0.3758047819137573,
42
- "learning_rate": 1.8750000000000002e-05,
43
- "loss": 1.0541,
44
- "step": 25
45
- },
46
- {
47
- "epoch": 0.6,
48
- "grad_norm": 1.0797535181045532,
49
- "learning_rate": 2.25e-05,
50
- "loss": 1.2359,
51
  "step": 30
52
  },
53
  {
54
- "epoch": 0.7,
55
- "grad_norm": 0.32953447103500366,
56
- "learning_rate": 2.625e-05,
57
- "loss": 0.8877,
58
- "step": 35
 
59
  },
60
  {
61
- "epoch": 0.8,
62
- "grad_norm": 0.319231241941452,
63
- "learning_rate": 3e-05,
64
- "loss": 1.0191,
65
  "step": 40
66
  },
67
  {
68
- "epoch": 0.9,
69
- "grad_norm": 0.319320410490036,
70
- "learning_rate": 2.9996796251818968e-05,
71
- "loss": 0.8399,
72
- "step": 45
73
- },
74
- {
75
- "epoch": 1.0,
76
- "grad_norm": 1.2043859958648682,
77
- "learning_rate": 2.9987186375809513e-05,
78
- "loss": 0.9834,
79
- "step": 50
80
- },
81
- {
82
- "epoch": 1.0,
83
- "eval_loss": 0.6599301099777222,
84
- "eval_runtime": 52.5278,
85
- "eval_samples_per_second": 3.808,
86
- "eval_steps_per_second": 0.476,
87
  "step": 50
88
  },
89
  {
90
- "epoch": 1.1,
91
- "grad_norm": 0.39200976490974426,
92
- "learning_rate": 2.997117447698802e-05,
93
- "loss": 0.8063,
94
- "step": 55
95
- },
96
- {
97
- "epoch": 1.2,
98
- "grad_norm": 0.42485809326171875,
99
- "learning_rate": 2.994876739510005e-05,
100
- "loss": 0.5906,
101
  "step": 60
102
  },
103
  {
104
- "epoch": 1.3,
105
- "grad_norm": 0.5581662654876709,
106
- "learning_rate": 2.9919974701698638e-05,
107
- "loss": 0.7749,
108
- "step": 65
 
109
  },
110
  {
111
- "epoch": 1.4,
112
- "grad_norm": 0.8188683390617371,
113
- "learning_rate": 2.9884808696055675e-05,
114
- "loss": 0.7623,
115
  "step": 70
116
  },
117
  {
118
- "epoch": 1.5,
119
- "grad_norm": 0.4976309537887573,
120
- "learning_rate": 2.984328439990804e-05,
121
- "loss": 0.7587,
122
- "step": 75
123
- },
124
- {
125
- "epoch": 1.6,
126
- "grad_norm": 0.515602171421051,
127
- "learning_rate": 2.9795419551040836e-05,
128
- "loss": 0.6395,
129
  "step": 80
130
  },
131
  {
132
- "epoch": 1.7,
133
- "grad_norm": 0.8577103018760681,
134
- "learning_rate": 2.9741234595710393e-05,
135
- "loss": 0.5315,
136
- "step": 85
137
- },
138
- {
139
- "epoch": 1.8,
140
- "grad_norm": 0.6678707599639893,
141
- "learning_rate": 2.968075267991032e-05,
142
- "loss": 0.6739,
143
  "step": 90
144
  },
145
  {
146
- "epoch": 1.9,
147
- "grad_norm": 0.3638306260108948,
148
- "learning_rate": 2.9613999639484314e-05,
149
- "loss": 0.6927,
150
- "step": 95
151
- },
152
- {
153
- "epoch": 2.0,
154
- "grad_norm": 0.8823966383934021,
155
- "learning_rate": 2.9541003989089956e-05,
156
- "loss": 0.6094,
157
- "step": 100
158
- },
159
- {
160
- "epoch": 2.0,
161
- "eval_loss": 0.5690982341766357,
162
- "eval_runtime": 52.1876,
163
- "eval_samples_per_second": 3.832,
164
- "eval_steps_per_second": 0.479,
165
  "step": 100
166
  },
167
  {
168
- "epoch": 2.1,
169
- "grad_norm": 0.5922141671180725,
170
- "learning_rate": 2.9461796910018204e-05,
171
- "loss": 0.6031,
172
- "step": 105
 
173
  },
174
  {
175
- "epoch": 2.2,
176
- "grad_norm": 0.5325513482093811,
177
- "learning_rate": 2.9376412236873792e-05,
178
- "loss": 0.493,
179
  "step": 110
180
  },
181
  {
182
- "epoch": 2.3,
183
- "grad_norm": 1.020575761795044,
184
- "learning_rate": 2.928488644312222e-05,
185
- "loss": 0.4483,
186
- "step": 115
187
- },
188
- {
189
- "epoch": 2.4,
190
- "grad_norm": 0.9036449790000916,
191
- "learning_rate": 2.9187258625509518e-05,
192
- "loss": 0.5766,
193
  "step": 120
194
  },
195
  {
196
- "epoch": 2.5,
197
- "grad_norm": 1.0615090131759644,
198
- "learning_rate": 2.9083570487361445e-05,
199
- "loss": 0.4717,
200
- "step": 125
201
- },
202
- {
203
- "epoch": 2.6,
204
- "grad_norm": 0.638048529624939,
205
- "learning_rate": 2.8973866320769186e-05,
206
- "loss": 0.3577,
207
  "step": 130
208
  },
209
  {
210
- "epoch": 2.7,
211
- "grad_norm": 1.1508071422576904,
212
- "learning_rate": 2.8858192987669303e-05,
213
- "loss": 0.5615,
214
- "step": 135
 
215
  },
216
  {
217
- "epoch": 2.8,
218
- "grad_norm": 0.6334187984466553,
219
- "learning_rate": 2.873659989982586e-05,
220
- "loss": 0.3704,
221
  "step": 140
222
  },
223
  {
224
- "epoch": 2.9,
225
- "grad_norm": 0.53675377368927,
226
- "learning_rate": 2.86091389977234e-05,
227
- "loss": 0.3623,
228
- "step": 145
229
- },
230
- {
231
- "epoch": 3.0,
232
- "grad_norm": 0.5917493104934692,
233
- "learning_rate": 2.8475864728379682e-05,
234
- "loss": 0.3345,
235
- "step": 150
236
- },
237
- {
238
- "epoch": 3.0,
239
- "eval_loss": 0.5363968014717102,
240
- "eval_runtime": 52.2028,
241
- "eval_samples_per_second": 3.831,
242
- "eval_steps_per_second": 0.479,
243
  "step": 150
244
  },
245
  {
246
- "epoch": 3.1,
247
- "grad_norm": 1.654146671295166,
248
- "learning_rate": 2.8336834022087776e-05,
249
- "loss": 0.3779,
250
- "step": 155
251
- },
252
- {
253
- "epoch": 3.2,
254
- "grad_norm": 0.9066053032875061,
255
- "learning_rate": 2.8192106268097336e-05,
256
- "loss": 0.2994,
257
  "step": 160
258
  },
259
  {
260
- "epoch": 3.3,
261
- "grad_norm": 0.5281007289886475,
262
- "learning_rate": 2.8041743289245503e-05,
263
- "loss": 0.4545,
264
- "step": 165
265
- },
266
- {
267
- "epoch": 3.4,
268
- "grad_norm": 0.8571799397468567,
269
- "learning_rate": 2.788580931554828e-05,
270
- "loss": 0.3399,
271
  "step": 170
272
  },
273
  {
274
- "epoch": 3.5,
275
- "grad_norm": 0.43631649017333984,
276
- "learning_rate": 2.7724370956763605e-05,
277
- "loss": 0.2589,
278
- "step": 175
 
279
  },
280
  {
281
- "epoch": 3.6,
282
- "grad_norm": 0.7908278107643127,
283
- "learning_rate": 2.7557497173937928e-05,
284
- "loss": 0.3241,
285
  "step": 180
286
  },
287
  {
288
- "epoch": 3.7,
289
- "grad_norm": 1.0415078401565552,
290
- "learning_rate": 2.7385259249948338e-05,
291
- "loss": 0.3205,
292
- "step": 185
293
- },
294
- {
295
- "epoch": 3.8,
296
- "grad_norm": 0.5231990218162537,
297
- "learning_rate": 2.7207730759052925e-05,
298
- "loss": 0.1806,
299
  "step": 190
300
  },
301
  {
302
- "epoch": 3.9,
303
- "grad_norm": 0.48716872930526733,
304
- "learning_rate": 2.7024987535462327e-05,
305
- "loss": 0.172,
306
- "step": 195
307
- },
308
- {
309
- "epoch": 4.0,
310
- "grad_norm": 0.6646760702133179,
311
- "learning_rate": 2.6837107640945904e-05,
312
- "loss": 0.2291,
313
  "step": 200
314
  },
315
  {
316
- "epoch": 4.0,
317
- "eval_loss": 0.5222796201705933,
318
- "eval_runtime": 52.1967,
319
- "eval_samples_per_second": 3.832,
320
- "eval_steps_per_second": 0.479,
321
- "step": 200
322
- },
323
- {
324
- "epoch": 4.1,
325
- "grad_norm": 1.3394831418991089,
326
- "learning_rate": 2.6644171331486363e-05,
327
- "loss": 0.2097,
328
- "step": 205
329
  },
330
  {
331
- "epoch": 4.2,
332
- "grad_norm": 0.6753952503204346,
333
- "learning_rate": 2.6446261022997098e-05,
334
- "loss": 0.2552,
335
  "step": 210
336
  },
337
  {
338
- "epoch": 4.3,
339
- "grad_norm": 0.5856276750564575,
340
- "learning_rate": 2.6243461256116892e-05,
341
- "loss": 0.1606,
342
- "step": 215
343
- },
344
- {
345
- "epoch": 4.4,
346
- "grad_norm": 0.695767879486084,
347
- "learning_rate": 2.6035858660096975e-05,
348
- "loss": 0.2958,
349
  "step": 220
350
  },
351
  {
352
- "epoch": 4.5,
353
- "grad_norm": 0.6565276980400085,
354
- "learning_rate": 2.5823541915795932e-05,
355
- "loss": 0.1491,
356
- "step": 225
357
- },
358
- {
359
- "epoch": 4.6,
360
- "grad_norm": 0.497454971075058,
361
- "learning_rate": 2.5606601717798212e-05,
362
- "loss": 0.1945,
363
  "step": 230
364
  },
365
  {
366
- "epoch": 4.7,
367
- "grad_norm": 0.7928630709648132,
368
- "learning_rate": 2.5385130735672442e-05,
369
- "loss": 0.1197,
370
- "step": 235
371
- },
372
- {
373
- "epoch": 4.8,
374
- "grad_norm": 0.9403858780860901,
375
- "learning_rate": 2.5159223574386117e-05,
376
- "loss": 0.2448,
377
- "step": 240
378
- },
379
- {
380
- "epoch": 4.9,
381
- "grad_norm": 0.41166239976882935,
382
- "learning_rate": 2.49289767338935e-05,
383
- "loss": 0.2321,
384
- "step": 245
385
- },
386
- {
387
- "epoch": 5.0,
388
- "grad_norm": 0.4782765805721283,
389
- "learning_rate": 2.469448856791411e-05,
390
- "loss": 0.1126,
391
- "step": 250
392
- },
393
- {
394
- "epoch": 5.0,
395
- "eval_loss": 0.4687739610671997,
396
- "eval_runtime": 52.1992,
397
- "eval_samples_per_second": 3.831,
398
- "eval_steps_per_second": 0.479,
399
- "step": 250
400
  }
401
  ],
402
- "logging_steps": 5,
403
- "max_steps": 800,
404
  "num_input_tokens_seen": 0,
405
- "num_train_epochs": 16,
406
  "save_steps": 500,
407
  "stateful_callbacks": {
408
  "TrainerControl": {
@@ -416,8 +243,8 @@
416
  "attributes": {}
417
  }
418
  },
419
- "total_flos": 4.646855528216986e+16,
420
- "train_batch_size": 2,
421
  "trial_name": null,
422
  "trial_params": null
423
  }
 
1
  {
2
+ "best_metric": 0.5998682379722595,
3
+ "best_model_checkpoint": "//outputs/task7_microsoft/Phi-3.5-mini-instruct/checkpoint-238",
4
+ "epoch": 7.0,
5
  "eval_steps": 500,
6
+ "global_step": 238,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.2962962962962963,
13
+ "grad_norm": 0.869780957698822,
14
+ "learning_rate": 7.692307692307694e-06,
15
+ "loss": 0.9461,
 
 
 
 
 
 
 
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.5925925925925926,
20
+ "grad_norm": 0.5883250832557678,
21
+ "learning_rate": 9.978490638616671e-06,
22
+ "loss": 0.6991,
 
 
 
 
 
 
 
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.8888888888888888,
27
+ "grad_norm": 0.922535240650177,
28
+ "learning_rate": 9.873583924954152e-06,
29
+ "loss": 0.7785,
 
 
 
 
 
 
 
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 1.0,
34
+ "eval_loss": 0.6846582889556885,
35
+ "eval_runtime": 3.3866,
36
+ "eval_samples_per_second": 4.429,
37
+ "eval_steps_per_second": 0.591,
38
+ "step": 34
39
  },
40
  {
41
+ "epoch": 1.1777777777777778,
42
+ "grad_norm": 0.3914264738559723,
43
+ "learning_rate": 9.68316749134364e-06,
44
+ "loss": 0.6765,
45
  "step": 40
46
  },
47
  {
48
+ "epoch": 1.474074074074074,
49
+ "grad_norm": 0.5533085465431213,
50
+ "learning_rate": 9.410582299213574e-06,
51
+ "loss": 0.799,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  "step": 50
53
  },
54
  {
55
+ "epoch": 1.7703703703703704,
56
+ "grad_norm": 0.3141545355319977,
57
+ "learning_rate": 9.060611006213833e-06,
58
+ "loss": 0.5998,
 
 
 
 
 
 
 
59
  "step": 60
60
  },
61
  {
62
+ "epoch": 2.0,
63
+ "eval_loss": 0.6442674994468689,
64
+ "eval_runtime": 3.3729,
65
+ "eval_samples_per_second": 4.447,
66
+ "eval_steps_per_second": 0.593,
67
+ "step": 68
68
  },
69
  {
70
+ "epoch": 2.0592592592592593,
71
+ "grad_norm": 0.6881595849990845,
72
+ "learning_rate": 8.639394051847472e-06,
73
+ "loss": 0.6565,
74
  "step": 70
75
  },
76
  {
77
+ "epoch": 2.3555555555555556,
78
+ "grad_norm": 0.817984402179718,
79
+ "learning_rate": 8.154321920070415e-06,
80
+ "loss": 0.6779,
 
 
 
 
 
 
 
81
  "step": 80
82
  },
83
  {
84
+ "epoch": 2.651851851851852,
85
+ "grad_norm": 0.585773229598999,
86
+ "learning_rate": 7.613905469171247e-06,
87
+ "loss": 0.5443,
 
 
 
 
 
 
 
88
  "step": 90
89
  },
90
  {
91
+ "epoch": 2.948148148148148,
92
+ "grad_norm": 0.5395255088806152,
93
+ "learning_rate": 7.02762660406497e-06,
94
+ "loss": 0.6243,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  "step": 100
96
  },
97
  {
98
+ "epoch": 3.0,
99
+ "eval_loss": 0.6209592223167419,
100
+ "eval_runtime": 3.3724,
101
+ "eval_samples_per_second": 4.448,
102
+ "eval_steps_per_second": 0.593,
103
+ "step": 102
104
  },
105
  {
106
+ "epoch": 3.237037037037037,
107
+ "grad_norm": 0.7252342104911804,
108
+ "learning_rate": 6.405771911037698e-06,
109
+ "loss": 0.6189,
110
  "step": 110
111
  },
112
  {
113
+ "epoch": 3.533333333333333,
114
+ "grad_norm": 0.5255349278450012,
115
+ "learning_rate": 5.759252173912573e-06,
116
+ "loss": 0.5914,
 
 
 
 
 
 
 
117
  "step": 120
118
  },
119
  {
120
+ "epoch": 3.8296296296296295,
121
+ "grad_norm": 0.5693609118461609,
122
+ "learning_rate": 5.099410938325351e-06,
123
+ "loss": 0.5872,
 
 
 
 
 
 
 
124
  "step": 130
125
  },
126
  {
127
+ "epoch": 4.0,
128
+ "eval_loss": 0.6094754934310913,
129
+ "eval_runtime": 3.3715,
130
+ "eval_samples_per_second": 4.449,
131
+ "eval_steps_per_second": 0.593,
132
+ "step": 136
133
  },
134
  {
135
+ "epoch": 4.118518518518519,
136
+ "grad_norm": 0.38578182458877563,
137
+ "learning_rate": 4.43782548295514e-06,
138
+ "loss": 0.574,
139
  "step": 140
140
  },
141
  {
142
+ "epoch": 4.4148148148148145,
143
+ "grad_norm": 0.5012251138687134,
144
+ "learning_rate": 3.786103689779861e-06,
145
+ "loss": 0.5227,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  "step": 150
147
  },
148
  {
149
+ "epoch": 4.711111111111111,
150
+ "grad_norm": 0.5396020412445068,
151
+ "learning_rate": 3.1556803773799616e-06,
152
+ "loss": 0.5366,
 
 
 
 
 
 
 
153
  "step": 160
154
  },
155
  {
156
+ "epoch": 5.0,
157
+ "grad_norm": 0.5771762728691101,
158
+ "learning_rate": 2.5576166707349387e-06,
159
+ "loss": 0.6322,
 
 
 
 
 
 
 
160
  "step": 170
161
  },
162
  {
163
+ "epoch": 5.0,
164
+ "eval_loss": 0.6032379269599915,
165
+ "eval_runtime": 3.372,
166
+ "eval_samples_per_second": 4.448,
167
+ "eval_steps_per_second": 0.593,
168
+ "step": 170
169
  },
170
  {
171
+ "epoch": 5.296296296296296,
172
+ "grad_norm": 0.5676046013832092,
173
+ "learning_rate": 2.0024059276803742e-06,
174
+ "loss": 0.5883,
175
  "step": 180
176
  },
177
  {
178
+ "epoch": 5.592592592592593,
179
+ "grad_norm": 0.4530661702156067,
180
+ "learning_rate": 1.499789627152874e-06,
181
+ "loss": 0.5619,
 
 
 
 
 
 
 
182
  "step": 190
183
  },
184
  {
185
+ "epoch": 5.888888888888889,
186
+ "grad_norm": 0.5783275365829468,
187
+ "learning_rate": 1.0585864495652899e-06,
188
+ "loss": 0.4661,
 
 
 
 
 
 
 
189
  "step": 200
190
  },
191
  {
192
+ "epoch": 6.0,
193
+ "eval_loss": 0.6005836129188538,
194
+ "eval_runtime": 3.3722,
195
+ "eval_samples_per_second": 4.448,
196
+ "eval_steps_per_second": 0.593,
197
+ "step": 204
 
 
 
 
 
 
 
198
  },
199
  {
200
+ "epoch": 6.177777777777778,
201
+ "grad_norm": 0.5576639175415039,
202
+ "learning_rate": 6.865375481914017e-07,
203
+ "loss": 0.5346,
204
  "step": 210
205
  },
206
  {
207
+ "epoch": 6.474074074074074,
208
+ "grad_norm": 0.5694870948791504,
209
+ "learning_rate": 3.9017072635896716e-07,
210
+ "loss": 0.5697,
 
 
 
 
 
 
 
211
  "step": 220
212
  },
213
  {
214
+ "epoch": 6.770370370370371,
215
+ "grad_norm": 0.35573288798332214,
216
+ "learning_rate": 1.7468590353731495e-07,
217
+ "loss": 0.5585,
 
 
 
 
 
 
 
218
  "step": 230
219
  },
220
  {
221
+ "epoch": 7.0,
222
+ "eval_loss": 0.5998682379722595,
223
+ "eval_runtime": 3.3736,
224
+ "eval_samples_per_second": 4.446,
225
+ "eval_steps_per_second": 0.593,
226
+ "step": 238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  }
228
  ],
229
+ "logging_steps": 10,
230
+ "max_steps": 250,
231
  "num_input_tokens_seen": 0,
232
+ "num_train_epochs": 8,
233
  "save_steps": 500,
234
  "stateful_callbacks": {
235
  "TrainerControl": {
 
243
  "attributes": {}
244
  }
245
  },
246
+ "total_flos": 2.291052678921216e+16,
247
+ "train_batch_size": 1,
248
  "trial_name": null,
249
  "trial_params": null
250
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:44a91374d47e061d44848107bfc25ebd1ed4e3cf32bfc6349d577cac835076d2
3
  size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:057e998df0396c8c0743c2e8486036bb54b886294b8c5da9a7b7083bcb4e9d62
3
  size 5624