chansung committed
Commit b2cbe68
1 Parent(s): 9494e6d

Model save

README.md CHANGED
@@ -2,13 +2,12 @@
 license: gemma
 library_name: peft
 tags:
-- alignment-handbook
 - trl
 - sft
 - generated_from_trainer
 base_model: google/gemma-7b
 datasets:
-- llama-duo/synth_summarize_dataset
+- generator
 model-index:
 - name: gemma7b-summarize-gpt4o-30k
   results: []
@@ -17,12 +16,12 @@ model-index:
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/chansung18/huggingface/runs/ddvw2m8z)
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/chansung18/huggingface/runs/gtgsbwvu)
 # gemma7b-summarize-gpt4o-30k
 
-This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset dataset.
+This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.3811
+- Loss: 3.2430
 
 ## Model description
 
@@ -53,17 +52,22 @@ The following hyperparameters were used during training:
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
-- num_epochs: 5
+- num_epochs: 10
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 0.9712        | 1.0   | 137  | 2.3077          |
-| 0.8675        | 2.0   | 274  | 2.2479          |
-| 0.7623        | 3.0   | 411  | 2.2756          |
-| 0.709         | 4.0   | 548  | 2.3417          |
-| 0.6601        | 5.0   | 685  | 2.3811          |
+| 1.1572        | 1.0   | 111  | 2.3072          |
+| 0.9296        | 2.0   | 222  | 2.1789          |
+| 0.8273        | 3.0   | 333  | 2.1709          |
+| 0.7586        | 4.0   | 444  | 2.2164          |
+| 0.6613        | 5.0   | 555  | 2.3182          |
+| 0.577         | 6.0   | 666  | 2.4774          |
+| 0.4958        | 7.0   | 777  | 2.7036          |
+| 0.4205        | 8.0   | 888  | 2.9689          |
+| 0.382         | 9.0   | 999  | 3.2252          |
+| 0.372         | 10.0  | 1110 | 3.2430          |
 
 
 ### Framework versions
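The updated card describes a PEFT adapter for [google/gemma-7b](https://huggingface.co/google/gemma-7b); the adapter weights themselves are the `adapter_model.safetensors` file changed below. A minimal loading sketch, assuming a standard PEFT layout and a hypothetical hub id `llama-duo/gemma7b-summarize-gpt4o-30k` (the actual repo id is not stated in this commit):

```python
# Sketch only: attach the saved PEFT adapter to the base model for inference.
# "llama-duo/gemma7b-summarize-gpt4o-30k" is an assumed adapter repo id.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "google/gemma-7b"                            # base model named in the card
adapter_id = "llama-duo/gemma7b-summarize-gpt4o-30k"   # assumption, not confirmed here

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16, device_map="auto")
model = PeftModel.from_pretrained(base, adapter_id)    # loads adapter_model.safetensors

inputs = tokenizer("Summarize the following text:\n...", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```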
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1578c9aa1d019a32f4f14559226badaa97bb33080e583876456fc629835a8cb8
+oid sha256:32b4355c727acdb0c6029f34cd21b7f1e40baf4881b93221c8019898e95b873f
 size 50056096
all_results.json CHANGED
@@ -1,14 +1,9 @@
 {
-    "epoch": 5.0,
-    "eval_loss": 2.3811252117156982,
-    "eval_runtime": 1.024,
-    "eval_samples": 25,
-    "eval_samples_per_second": 4.883,
-    "eval_steps_per_second": 1.953,
-    "total_flos": 1.0472781231601746e+18,
-    "train_loss": 2.151051264783762,
-    "train_runtime": 5341.9856,
-    "train_samples": 29787,
-    "train_samples_per_second": 2.052,
-    "train_steps_per_second": 0.128
+    "epoch": 10.0,
+    "total_flos": 1.697049221804327e+18,
+    "train_loss": 1.8630313719715084,
+    "train_runtime": 9058.6901,
+    "train_samples": 32782,
+    "train_samples_per_second": 1.957,
+    "train_steps_per_second": 0.123
 }
runs/May21_04-02-00_deep-diver-main-tough-snake-1-0-0/events.out.tfevents.1716278664.deep-diver-main-tough-snake-1-0-0.385.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b3a789b1bcb5c469f2d0771d5392b716d51c212add7fcc76c2f0831c7fa8c9fd
-size 54552
+oid sha256:7fb3394383ff9d99394f680955b2c1d92f3e2570009d97a580da145d58da55e7
+size 55599
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 5.0,
-    "total_flos": 1.0472781231601746e+18,
-    "train_loss": 2.151051264783762,
-    "train_runtime": 5341.9856,
-    "train_samples": 29787,
-    "train_samples_per_second": 2.052,
-    "train_steps_per_second": 0.128
+    "epoch": 10.0,
+    "total_flos": 1.697049221804327e+18,
+    "train_loss": 1.8630313719715084,
+    "train_runtime": 9058.6901,
+    "train_samples": 32782,
+    "train_samples_per_second": 1.957,
+    "train_steps_per_second": 0.123
 }
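The learning-rate values logged in the `trainer_state.json` diff below follow the schedule declared in the model card: cosine decay with a 0.1 warmup ratio over 1110 optimizer steps, peaking near 2e-4 (the peak rate is inferred from the logged values, not stated explicitly). A minimal sketch of that schedule with `transformers`, using a placeholder parameter in place of the real model:

```python
# Sketch: cosine LR schedule with 10% warmup over the 1110 steps recorded in
# trainer_state.json. Peak LR of 2e-4 is an inference from the logged values.
import torch
from transformers import get_cosine_schedule_with_warmup

params = [torch.nn.Parameter(torch.zeros(1))]   # placeholder instead of the real model
optimizer = torch.optim.AdamW(params, lr=2e-4)
total_steps = 1110
warmup_steps = int(0.1 * total_steps)           # = 111, matching the warmup ratio

scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
)

for step in range(total_steps):
    optimizer.step()
    scheduler.step()
    if step in (0, warmup_steps - 1, total_steps - 1):
        print(step, scheduler.get_last_lr()[0])  # ramps up to ~2e-4, then decays toward 0
```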
trainer_state.json CHANGED
@@ -1,1033 +1,1668 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 685,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0072992700729927005,
13
- "grad_norm": 708.0,
14
- "learning_rate": 2.898550724637681e-06,
15
- "loss": 56.8346,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.0364963503649635,
20
- "grad_norm": 604.0,
21
- "learning_rate": 1.4492753623188407e-05,
22
- "loss": 52.9742,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.072992700729927,
27
- "grad_norm": 340.0,
28
- "learning_rate": 2.8985507246376814e-05,
29
- "loss": 39.0746,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.10948905109489052,
34
- "grad_norm": 40.25,
35
- "learning_rate": 4.347826086956522e-05,
36
- "loss": 20.8099,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.145985401459854,
41
- "grad_norm": 25.5,
42
- "learning_rate": 5.797101449275363e-05,
43
- "loss": 17.6144,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.18248175182481752,
48
- "grad_norm": 7.78125,
49
- "learning_rate": 7.246376811594203e-05,
50
- "loss": 15.3803,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.21897810218978103,
55
- "grad_norm": 6.40625,
56
- "learning_rate": 8.695652173913044e-05,
57
- "loss": 14.0798,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.25547445255474455,
62
- "grad_norm": 13.4375,
63
- "learning_rate": 0.00010144927536231885,
64
- "loss": 13.4032,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.291970802919708,
69
- "grad_norm": 41.0,
70
- "learning_rate": 0.00011594202898550725,
71
- "loss": 10.8827,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.3284671532846715,
76
- "grad_norm": 13.1875,
77
- "learning_rate": 0.00013043478260869567,
78
- "loss": 4.5915,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.36496350364963503,
83
- "grad_norm": 4.09375,
84
- "learning_rate": 0.00014492753623188405,
85
- "loss": 1.9,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.40145985401459855,
90
- "grad_norm": 2.28125,
91
- "learning_rate": 0.00015942028985507247,
92
- "loss": 1.6474,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.43795620437956206,
97
- "grad_norm": 3.5,
98
- "learning_rate": 0.00017391304347826088,
99
- "loss": 1.477,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.4744525547445255,
104
- "grad_norm": 2.28125,
105
- "learning_rate": 0.00018840579710144927,
106
- "loss": 1.3309,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.5109489051094891,
111
- "grad_norm": 1.6171875,
112
- "learning_rate": 0.00019999869950890106,
113
- "loss": 1.2538,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.5474452554744526,
118
- "grad_norm": 5.9375,
119
- "learning_rate": 0.0001999531858720213,
120
- "loss": 1.224,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.583941605839416,
125
- "grad_norm": 2.25,
126
- "learning_rate": 0.00019984268150178167,
127
- "loss": 1.1823,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.6204379562043796,
132
- "grad_norm": 2.078125,
133
- "learning_rate": 0.00019966725824941932,
134
- "loss": 1.1279,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.656934306569343,
139
- "grad_norm": 3.0625,
140
- "learning_rate": 0.00019942703017718975,
141
- "loss": 1.127,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.6934306569343066,
146
- "grad_norm": 1.75,
147
- "learning_rate": 0.000199122153484202,
148
- "loss": 1.1284,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.7299270072992701,
153
- "grad_norm": 1.5625,
154
- "learning_rate": 0.00019875282640485645,
155
- "loss": 1.0566,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.7664233576642335,
160
- "grad_norm": 4.53125,
161
- "learning_rate": 0.0001983192890799503,
162
- "loss": 1.0361,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.8029197080291971,
167
- "grad_norm": 2.5,
168
- "learning_rate": 0.0001978218234005352,
169
- "loss": 1.0371,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.8394160583941606,
174
- "grad_norm": 1.890625,
175
- "learning_rate": 0.00019726075282462845,
176
- "loss": 1.0235,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.8759124087591241,
181
- "grad_norm": 0.67578125,
182
- "learning_rate": 0.00019663644216689683,
183
- "loss": 0.996,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.9124087591240876,
188
- "grad_norm": 1.2421875,
189
- "learning_rate": 0.00019594929736144976,
190
- "loss": 0.9734,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.948905109489051,
195
- "grad_norm": 1.5625,
196
- "learning_rate": 0.00019519976519789616,
197
- "loss": 0.978,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.9854014598540146,
202
- "grad_norm": 0.95703125,
203
- "learning_rate": 0.00019438833303083678,
204
- "loss": 0.9712,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 1.0,
209
- "eval_loss": 2.307734489440918,
210
- "eval_runtime": 0.9962,
211
- "eval_samples_per_second": 5.019,
212
- "eval_steps_per_second": 2.008,
213
- "step": 137
214
- },
215
- {
216
- "epoch": 1.0218978102189782,
217
- "grad_norm": 2.125,
218
- "learning_rate": 0.00019351552846298025,
219
- "loss": 0.9374,
220
  "step": 140
221
  },
222
  {
223
- "epoch": 1.0583941605839415,
224
- "grad_norm": 2.265625,
225
- "learning_rate": 0.0001925819190020898,
226
- "loss": 0.9173,
227
  "step": 145
228
  },
229
  {
230
- "epoch": 1.094890510948905,
231
- "grad_norm": 0.828125,
232
- "learning_rate": 0.00019158811169198313,
233
- "loss": 0.8916,
234
  "step": 150
235
  },
236
  {
237
- "epoch": 1.1313868613138687,
238
- "grad_norm": 1.0703125,
239
- "learning_rate": 0.0001905347527178252,
240
- "loss": 0.9418,
241
  "step": 155
242
  },
243
  {
244
- "epoch": 1.167883211678832,
245
- "grad_norm": 0.9140625,
246
- "learning_rate": 0.00018942252698597113,
247
- "loss": 0.9054,
248
  "step": 160
249
  },
250
  {
251
- "epoch": 1.2043795620437956,
252
- "grad_norm": 2.0625,
253
- "learning_rate": 0.00018825215767863214,
254
- "loss": 0.9039,
255
  "step": 165
256
  },
257
  {
258
- "epoch": 1.2408759124087592,
259
- "grad_norm": 1.5859375,
260
- "learning_rate": 0.00018702440578365387,
261
- "loss": 0.9146,
262
  "step": 170
263
  },
264
  {
265
- "epoch": 1.2773722627737225,
266
- "grad_norm": 1.3515625,
267
- "learning_rate": 0.00018574006959971333,
268
- "loss": 0.8896,
269
  "step": 175
270
  },
271
  {
272
- "epoch": 1.313868613138686,
273
- "grad_norm": 2.09375,
274
- "learning_rate": 0.00018439998421725554,
275
- "loss": 0.8947,
276
  "step": 180
277
  },
278
  {
279
- "epoch": 1.3503649635036497,
280
- "grad_norm": 0.80078125,
281
- "learning_rate": 0.00018300502097550806,
282
- "loss": 0.881,
283
  "step": 185
284
  },
285
  {
286
- "epoch": 1.3868613138686132,
287
- "grad_norm": 0.80078125,
288
- "learning_rate": 0.00018155608689592604,
289
- "loss": 0.8906,
290
  "step": 190
291
  },
292
  {
293
- "epoch": 1.4233576642335766,
294
- "grad_norm": 0.80859375,
295
- "learning_rate": 0.00018005412409243606,
296
- "loss": 0.8939,
297
  "step": 195
298
  },
299
  {
300
- "epoch": 1.4598540145985401,
301
- "grad_norm": 1.0234375,
302
- "learning_rate": 0.0001785001091588628,
303
- "loss": 0.9016,
304
  "step": 200
305
  },
306
  {
307
- "epoch": 1.4963503649635037,
308
- "grad_norm": 0.70703125,
309
- "learning_rate": 0.0001768950525339362,
310
- "loss": 0.8943,
311
  "step": 205
312
  },
313
  {
314
- "epoch": 1.5328467153284673,
315
- "grad_norm": 1.2109375,
316
- "learning_rate": 0.00017523999784429238,
317
- "loss": 0.8614,
318
  "step": 210
319
  },
320
  {
321
- "epoch": 1.5693430656934306,
322
- "grad_norm": 0.7734375,
323
- "learning_rate": 0.00017353602122589527,
324
- "loss": 0.8788,
325
  "step": 215
326
  },
327
  {
328
- "epoch": 1.6058394160583942,
329
- "grad_norm": 0.82421875,
330
- "learning_rate": 0.0001717842306243205,
331
- "loss": 0.8833,
332
  "step": 220
333
  },
334
  {
335
- "epoch": 1.6423357664233578,
336
- "grad_norm": 0.84765625,
337
- "learning_rate": 0.00016998576507435618,
338
- "loss": 0.8713,
339
  "step": 225
340
  },
341
  {
342
- "epoch": 1.6788321167883211,
343
- "grad_norm": 1.234375,
344
- "learning_rate": 0.00016814179395938913,
345
- "loss": 0.8661,
346
  "step": 230
347
  },
348
  {
349
- "epoch": 1.7153284671532847,
350
- "grad_norm": 0.91015625,
351
- "learning_rate": 0.00016625351625105796,
352
- "loss": 0.8413,
353
  "step": 235
354
  },
355
  {
356
- "epoch": 1.7518248175182483,
357
- "grad_norm": 0.63671875,
358
- "learning_rate": 0.0001643221597296679,
359
- "loss": 0.8741,
360
  "step": 240
361
  },
362
  {
363
- "epoch": 1.7883211678832116,
364
- "grad_norm": 0.73046875,
365
- "learning_rate": 0.00016234898018587337,
366
- "loss": 0.8744,
367
  "step": 245
368
  },
369
  {
370
- "epoch": 1.8248175182481752,
371
- "grad_norm": 0.671875,
372
- "learning_rate": 0.00016033526060414842,
373
- "loss": 0.8517,
374
  "step": 250
375
  },
376
  {
377
- "epoch": 1.8613138686131387,
378
- "grad_norm": 1.0234375,
379
- "learning_rate": 0.00015828231032857503,
380
- "loss": 0.8899,
381
  "step": 255
382
  },
383
  {
384
- "epoch": 1.897810218978102,
385
- "grad_norm": 0.66796875,
386
- "learning_rate": 0.00015619146421149232,
387
- "loss": 0.8537,
388
  "step": 260
389
  },
390
  {
391
- "epoch": 1.9343065693430657,
392
- "grad_norm": 0.7109375,
393
- "learning_rate": 0.00015406408174555976,
394
- "loss": 0.8329,
395
  "step": 265
396
  },
397
  {
398
- "epoch": 1.9708029197080292,
399
- "grad_norm": 0.71875,
400
- "learning_rate": 0.00015190154617979938,
401
- "loss": 0.8675,
402
  "step": 270
403
  },
404
  {
405
- "epoch": 2.0,
406
- "eval_loss": 2.247941017150879,
407
- "eval_runtime": 0.9979,
408
- "eval_samples_per_second": 5.01,
409
- "eval_steps_per_second": 2.004,
410
- "step": 274
411
- },
412
- {
413
- "epoch": 2.0072992700729926,
414
- "grad_norm": 0.80859375,
415
- "learning_rate": 0.00014970526362019079,
416
- "loss": 0.8435,
417
  "step": 275
418
  },
419
  {
420
- "epoch": 2.0437956204379564,
421
- "grad_norm": 1.515625,
422
- "learning_rate": 0.00014747666211540459,
423
- "loss": 0.7774,
424
  "step": 280
425
  },
426
  {
427
- "epoch": 2.0802919708029197,
428
- "grad_norm": 1.0859375,
429
- "learning_rate": 0.00014521719072826858,
430
- "loss": 0.79,
431
  "step": 285
432
  },
433
  {
434
- "epoch": 2.116788321167883,
435
- "grad_norm": 0.498046875,
436
- "learning_rate": 0.00014292831859356997,
437
- "loss": 0.7929,
438
  "step": 290
439
  },
440
  {
441
- "epoch": 2.153284671532847,
442
- "grad_norm": 1.59375,
443
- "learning_rate": 0.00014061153396280674,
444
- "loss": 0.8032,
445
  "step": 295
446
  },
447
  {
448
- "epoch": 2.18978102189781,
449
- "grad_norm": 0.83203125,
450
- "learning_rate": 0.000138268343236509,
451
- "loss": 0.7932,
452
  "step": 300
453
  },
454
  {
455
- "epoch": 2.2262773722627736,
456
- "grad_norm": 0.734375,
457
- "learning_rate": 0.00013590026998475986,
458
- "loss": 0.7657,
459
  "step": 305
460
  },
461
  {
462
- "epoch": 2.2627737226277373,
463
- "grad_norm": 0.609375,
464
- "learning_rate": 0.0001335088539565523,
465
- "loss": 0.783,
466
  "step": 310
467
  },
468
  {
469
- "epoch": 2.2992700729927007,
470
- "grad_norm": 0.71484375,
471
- "learning_rate": 0.00013109565007862596,
472
- "loss": 0.7755,
473
  "step": 315
474
  },
475
  {
476
- "epoch": 2.335766423357664,
477
- "grad_norm": 0.609375,
478
- "learning_rate": 0.0001286622274444361,
479
- "loss": 0.7723,
480
  "step": 320
481
  },
482
  {
483
- "epoch": 2.372262773722628,
484
- "grad_norm": 1.3359375,
485
- "learning_rate": 0.00012621016829391022,
486
- "loss": 0.7739,
487
  "step": 325
488
  },
489
  {
490
- "epoch": 2.408759124087591,
491
- "grad_norm": 1.1328125,
492
- "learning_rate": 0.00012374106698465732,
493
- "loss": 0.7821,
494
  "step": 330
495
  },
496
  {
497
- "epoch": 2.445255474452555,
498
- "grad_norm": 0.91015625,
499
- "learning_rate": 0.00012125652895529766,
500
- "loss": 0.7852,
501
  "step": 335
502
  },
503
  {
504
- "epoch": 2.4817518248175183,
505
- "grad_norm": 0.74609375,
506
- "learning_rate": 0.00011875816968158815,
507
- "loss": 0.7792,
508
  "step": 340
509
  },
510
  {
511
- "epoch": 2.5182481751824817,
512
- "grad_norm": 0.625,
513
- "learning_rate": 0.00011624761362602061,
514
- "loss": 0.7799,
515
  "step": 345
516
  },
517
  {
518
- "epoch": 2.554744525547445,
519
- "grad_norm": 0.81640625,
520
- "learning_rate": 0.00011372649318157749,
521
- "loss": 0.7914,
522
  "step": 350
523
  },
524
  {
525
- "epoch": 2.591240875912409,
526
- "grad_norm": 0.80078125,
527
- "learning_rate": 0.00011119644761033078,
528
- "loss": 0.7847,
529
  "step": 355
530
  },
531
  {
532
- "epoch": 2.627737226277372,
533
- "grad_norm": 0.984375,
534
- "learning_rate": 0.0001086591219775746,
535
- "loss": 0.8049,
536
  "step": 360
537
  },
538
  {
539
- "epoch": 2.664233576642336,
540
- "grad_norm": 0.81640625,
541
- "learning_rate": 0.00010611616608218429,
542
- "loss": 0.7865,
543
  "step": 365
544
  },
545
  {
546
- "epoch": 2.7007299270072993,
547
- "grad_norm": 0.51953125,
548
- "learning_rate": 0.00010356923338389806,
549
- "loss": 0.7908,
550
  "step": 370
551
  },
552
  {
553
- "epoch": 2.7372262773722627,
554
- "grad_norm": 0.53125,
555
- "learning_rate": 0.00010101997992821797,
556
- "loss": 0.7925,
557
  "step": 375
558
  },
559
  {
560
- "epoch": 2.7737226277372264,
561
- "grad_norm": 0.49609375,
562
- "learning_rate": 9.847006326962974e-05,
563
- "loss": 0.799,
564
  "step": 380
565
  },
566
  {
567
- "epoch": 2.81021897810219,
568
- "grad_norm": 0.51171875,
569
- "learning_rate": 9.592114139384145e-05,
570
- "loss": 0.7832,
571
  "step": 385
572
  },
573
  {
574
- "epoch": 2.846715328467153,
575
- "grad_norm": 0.7109375,
576
- "learning_rate": 9.337487163974164e-05,
577
- "loss": 0.7796,
578
  "step": 390
579
  },
580
  {
581
- "epoch": 2.883211678832117,
582
- "grad_norm": 0.6328125,
583
- "learning_rate": 9.083290962177828e-05,
584
- "loss": 0.7839,
585
  "step": 395
586
  },
587
  {
588
- "epoch": 2.9197080291970803,
589
- "grad_norm": 0.59765625,
590
- "learning_rate": 8.829690815345886e-05,
591
- "loss": 0.7781,
592
  "step": 400
593
  },
594
  {
595
- "epoch": 2.9562043795620436,
596
- "grad_norm": 0.58203125,
597
- "learning_rate": 8.57685161726715e-05,
598
- "loss": 0.7457,
599
  "step": 405
600
  },
601
  {
602
- "epoch": 2.9927007299270074,
603
- "grad_norm": 0.6171875,
604
- "learning_rate": 8.324937766952638e-05,
605
- "loss": 0.7623,
606
  "step": 410
607
  },
608
  {
609
- "epoch": 3.0,
610
- "eval_loss": 2.275648355484009,
611
- "eval_runtime": 0.9945,
612
- "eval_samples_per_second": 5.028,
613
- "eval_steps_per_second": 2.011,
614
- "step": 411
615
- },
616
- {
617
- "epoch": 3.0291970802919708,
618
- "grad_norm": 0.8359375,
619
- "learning_rate": 8.074113061741397e-05,
620
- "loss": 0.7329,
621
  "step": 415
622
  },
623
  {
624
- "epoch": 3.065693430656934,
625
- "grad_norm": 0.50390625,
626
- "learning_rate": 7.824540590797568e-05,
627
- "loss": 0.7052,
628
  "step": 420
629
  },
630
  {
631
- "epoch": 3.102189781021898,
632
- "grad_norm": 0.5703125,
633
- "learning_rate": 7.576382629067877e-05,
634
- "loss": 0.7015,
635
  "step": 425
636
  },
637
  {
638
- "epoch": 3.1386861313868613,
639
- "grad_norm": 0.6015625,
640
- "learning_rate": 7.329800531768584e-05,
641
- "loss": 0.696,
642
  "step": 430
643
  },
644
  {
645
- "epoch": 3.1751824817518246,
646
- "grad_norm": 0.55078125,
647
- "learning_rate": 7.084954629470417e-05,
648
- "loss": 0.7154,
649
  "step": 435
650
  },
651
  {
652
- "epoch": 3.2116788321167884,
653
- "grad_norm": 0.59765625,
654
- "learning_rate": 6.842004123849752e-05,
655
- "loss": 0.7113,
656
  "step": 440
657
  },
658
  {
659
- "epoch": 3.2481751824817517,
660
- "grad_norm": 0.5625,
661
- "learning_rate": 6.601106984173835e-05,
662
- "loss": 0.7139,
663
  "step": 445
664
  },
665
  {
666
- "epoch": 3.2846715328467155,
667
- "grad_norm": 0.59765625,
668
- "learning_rate": 6.362419844587287e-05,
669
- "loss": 0.6967,
670
  "step": 450
671
  },
672
  {
673
- "epoch": 3.321167883211679,
674
- "grad_norm": 0.52734375,
675
- "learning_rate": 6.126097902266772e-05,
676
- "loss": 0.7073,
677
  "step": 455
678
  },
679
  {
680
- "epoch": 3.3576642335766422,
681
- "grad_norm": 0.5625,
682
- "learning_rate": 5.8922948165099524e-05,
683
- "loss": 0.6857,
684
  "step": 460
685
  },
686
  {
687
- "epoch": 3.394160583941606,
688
- "grad_norm": 0.55859375,
689
- "learning_rate": 5.6611626088244194e-05,
690
- "loss": 0.7199,
691
  "step": 465
692
  },
693
  {
694
- "epoch": 3.4306569343065694,
695
- "grad_norm": 0.58203125,
696
- "learning_rate": 5.432851564081534e-05,
697
- "loss": 0.7075,
698
  "step": 470
699
  },
700
  {
701
- "epoch": 3.4671532846715327,
702
- "grad_norm": 0.52734375,
703
- "learning_rate": 5.207510132799436e-05,
704
- "loss": 0.7006,
705
  "step": 475
706
  },
707
  {
708
- "epoch": 3.5036496350364965,
709
- "grad_norm": 0.53515625,
710
- "learning_rate": 4.9852848346187566e-05,
711
- "loss": 0.7151,
712
  "step": 480
713
  },
714
  {
715
- "epoch": 3.54014598540146,
716
- "grad_norm": 0.546875,
717
- "learning_rate": 4.7663201630338816e-05,
718
- "loss": 0.7129,
719
  "step": 485
720
  },
721
  {
722
- "epoch": 3.576642335766423,
723
- "grad_norm": 0.5859375,
724
- "learning_rate": 4.550758491441526e-05,
725
- "loss": 0.7139,
726
  "step": 490
727
  },
728
  {
729
- "epoch": 3.613138686131387,
730
- "grad_norm": 0.51953125,
731
- "learning_rate": 4.3387399805679255e-05,
732
- "loss": 0.7162,
733
  "step": 495
734
  },
735
  {
736
- "epoch": 3.6496350364963503,
737
- "grad_norm": 0.55859375,
738
- "learning_rate": 4.1304024873346705e-05,
739
- "loss": 0.7132,
740
  "step": 500
741
  },
742
  {
743
- "epoch": 3.686131386861314,
744
- "grad_norm": 0.57421875,
745
- "learning_rate": 3.9258814752225284e-05,
746
- "loss": 0.7007,
747
  "step": 505
748
  },
749
  {
750
- "epoch": 3.7226277372262775,
751
- "grad_norm": 0.546875,
752
- "learning_rate": 3.725309926191479e-05,
753
- "loss": 0.7037,
754
  "step": 510
755
  },
756
  {
757
- "epoch": 3.759124087591241,
758
- "grad_norm": 0.73828125,
759
- "learning_rate": 3.528818254214329e-05,
760
- "loss": 0.7255,
761
  "step": 515
762
  },
763
  {
764
- "epoch": 3.795620437956204,
765
- "grad_norm": 0.52734375,
766
- "learning_rate": 3.336534220479961e-05,
767
- "loss": 0.6966,
768
  "step": 520
769
  },
770
  {
771
- "epoch": 3.832116788321168,
772
- "grad_norm": 0.5078125,
773
- "learning_rate": 3.1485828503215585e-05,
774
- "loss": 0.7143,
775
  "step": 525
776
  },
777
  {
778
- "epoch": 3.8686131386861313,
779
- "grad_norm": 0.6328125,
780
- "learning_rate": 2.9650863519236418e-05,
781
- "loss": 0.7005,
782
  "step": 530
783
  },
784
  {
785
- "epoch": 3.905109489051095,
786
- "grad_norm": 0.5703125,
787
- "learning_rate": 2.7861640368608844e-05,
788
- "loss": 0.7005,
789
  "step": 535
790
  },
791
  {
792
- "epoch": 3.9416058394160585,
793
- "grad_norm": 0.53125,
794
- "learning_rate": 2.6119322425203197e-05,
795
- "loss": 0.7139,
796
  "step": 540
797
  },
798
  {
799
- "epoch": 3.978102189781022,
800
- "grad_norm": 0.51953125,
801
- "learning_rate": 2.4425042564574184e-05,
802
- "loss": 0.709,
803
  "step": 545
804
  },
805
  {
806
- "epoch": 4.0,
807
- "eval_loss": 2.341665267944336,
808
- "eval_runtime": 0.9977,
809
- "eval_samples_per_second": 5.012,
810
- "eval_steps_per_second": 2.005,
811
- "step": 548
812
  },
813
  {
814
- "epoch": 4.014598540145985,
815
- "grad_norm": 0.53515625,
816
- "learning_rate": 2.277990242735185e-05,
817
- "loss": 0.6801,
818
- "step": 550
819
  },
820
  {
821
- "epoch": 4.0510948905109485,
822
- "grad_norm": 0.52734375,
823
- "learning_rate": 2.118497170294195e-05,
824
- "loss": 0.6495,
 
825
  "step": 555
826
  },
827
  {
828
- "epoch": 4.087591240875913,
829
- "grad_norm": 0.5625,
830
- "learning_rate": 1.9641287434001355e-05,
831
- "loss": 0.672,
832
  "step": 560
833
  },
834
  {
835
- "epoch": 4.124087591240876,
836
- "grad_norm": 0.55078125,
837
- "learning_rate": 1.8149853342140645e-05,
838
- "loss": 0.6611,
839
  "step": 565
840
  },
841
  {
842
- "epoch": 4.160583941605839,
843
- "grad_norm": 0.59375,
844
- "learning_rate": 1.671163917529285e-05,
845
- "loss": 0.662,
846
  "step": 570
847
  },
848
  {
849
- "epoch": 4.197080291970803,
850
- "grad_norm": 0.51171875,
851
- "learning_rate": 1.5327580077171587e-05,
852
- "loss": 0.6635,
853
  "step": 575
854
  },
855
  {
856
- "epoch": 4.233576642335766,
857
- "grad_norm": 0.54296875,
858
- "learning_rate": 1.3998575979229944e-05,
859
- "loss": 0.6624,
860
  "step": 580
861
  },
862
  {
863
- "epoch": 4.2700729927007295,
864
- "grad_norm": 0.50390625,
865
- "learning_rate": 1.272549101551438e-05,
866
- "loss": 0.6523,
867
  "step": 585
868
  },
869
  {
870
- "epoch": 4.306569343065694,
871
- "grad_norm": 0.51171875,
872
- "learning_rate": 1.1509152960794666e-05,
873
- "loss": 0.6607,
874
  "step": 590
875
  },
876
  {
877
- "epoch": 4.343065693430657,
878
- "grad_norm": 0.546875,
879
- "learning_rate": 1.035035269233493e-05,
880
- "loss": 0.6626,
881
  "step": 595
882
  },
883
  {
884
- "epoch": 4.37956204379562,
885
- "grad_norm": 0.54296875,
886
- "learning_rate": 9.249843675656212e-06,
887
- "loss": 0.678,
888
  "step": 600
889
  },
890
  {
891
- "epoch": 4.416058394160584,
892
- "grad_norm": 0.5234375,
893
- "learning_rate": 8.208341474624071e-06,
894
- "loss": 0.6783,
895
  "step": 605
896
  },
897
  {
898
- "epoch": 4.452554744525547,
899
- "grad_norm": 0.53515625,
900
- "learning_rate": 7.226523286180776e-06,
901
- "loss": 0.6699,
902
  "step": 610
903
  },
904
  {
905
- "epoch": 4.489051094890511,
906
- "grad_norm": 0.5703125,
907
- "learning_rate": 6.3050275000238414e-06,
908
- "loss": 0.6607,
909
  "step": 615
910
  },
911
  {
912
- "epoch": 4.525547445255475,
913
- "grad_norm": 0.5234375,
914
- "learning_rate": 5.4444532835175144e-06,
915
- "loss": 0.6702,
916
  "step": 620
917
  },
918
  {
919
- "epoch": 4.562043795620438,
920
- "grad_norm": 0.5234375,
921
- "learning_rate": 4.6453601921072395e-06,
922
- "loss": 0.6793,
923
  "step": 625
924
  },
925
  {
926
- "epoch": 4.598540145985401,
927
- "grad_norm": 0.5234375,
928
- "learning_rate": 3.908267805490051e-06,
929
- "loss": 0.6622,
930
  "step": 630
931
  },
932
  {
933
- "epoch": 4.635036496350365,
934
- "grad_norm": 0.54296875,
935
- "learning_rate": 3.233655389777801e-06,
936
- "loss": 0.677,
937
  "step": 635
938
  },
939
  {
940
- "epoch": 4.671532846715328,
941
- "grad_norm": 0.5234375,
942
- "learning_rate": 2.62196158587269e-06,
943
- "loss": 0.6588,
944
  "step": 640
945
  },
946
  {
947
- "epoch": 4.708029197080292,
948
- "grad_norm": 0.5234375,
949
- "learning_rate": 2.073584124257899e-06,
950
- "loss": 0.6621,
951
  "step": 645
952
  },
953
  {
954
- "epoch": 4.744525547445256,
955
- "grad_norm": 0.53515625,
956
- "learning_rate": 1.5888795663883904e-06,
957
- "loss": 0.6655,
958
  "step": 650
959
  },
960
  {
961
- "epoch": 4.781021897810219,
962
- "grad_norm": 0.515625,
963
- "learning_rate": 1.1681630728506699e-06,
964
- "loss": 0.6653,
965
  "step": 655
966
  },
967
  {
968
- "epoch": 4.817518248175182,
969
- "grad_norm": 0.52734375,
970
- "learning_rate": 8.117081984415298e-07,
971
- "loss": 0.6734,
972
  "step": 660
973
  },
974
  {
975
- "epoch": 4.854014598540146,
976
- "grad_norm": 0.5390625,
977
- "learning_rate": 5.19746714299596e-07,
978
- "loss": 0.6541,
979
  "step": 665
980
  },
981
  {
982
- "epoch": 4.89051094890511,
983
- "grad_norm": 0.5390625,
984
- "learning_rate": 2.9246845720496407e-07,
985
- "loss": 0.6722,
986
  "step": 670
987
  },
988
  {
989
- "epoch": 4.927007299270073,
990
- "grad_norm": 0.55859375,
991
- "learning_rate": 1.300212061451367e-07,
992
- "loss": 0.6472,
993
  "step": 675
994
  },
995
  {
996
- "epoch": 4.963503649635037,
997
- "grad_norm": 0.51953125,
998
- "learning_rate": 3.251058622737446e-08,
999
- "loss": 0.667,
1000
  "step": 680
1001
  },
1002
  {
1003
- "epoch": 5.0,
1004
- "grad_norm": 0.52734375,
1005
- "learning_rate": 0.0,
1006
- "loss": 0.6601,
1007
  "step": 685
1008
  },
1009
  {
1010
- "epoch": 5.0,
1011
- "eval_loss": 2.3811252117156982,
1012
- "eval_runtime": 0.9953,
1013
- "eval_samples_per_second": 5.024,
1014
- "eval_steps_per_second": 2.01,
1015
- "step": 685
1016
  },
1017
  {
1018
- "epoch": 5.0,
1019
- "step": 685,
1020
- "total_flos": 1.0472781231601746e+18,
1021
- "train_loss": 2.151051264783762,
1022
- "train_runtime": 5341.9856,
1023
- "train_samples_per_second": 2.052,
1024
- "train_steps_per_second": 0.128
1025
  }
1026
  ],
1027
  "logging_steps": 5,
1028
- "max_steps": 685,
1029
  "num_input_tokens_seen": 0,
1030
- "num_train_epochs": 5,
1031
  "save_steps": 100,
1032
  "stateful_callbacks": {
1033
  "TrainerControl": {
@@ -1041,7 +1676,7 @@
1041
  "attributes": {}
1042
  }
1043
  },
1044
- "total_flos": 1.0472781231601746e+18,
1045
  "train_batch_size": 4,
1046
  "trial_name": null,
1047
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 1110,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.009009009009009009,
13
+ "grad_norm": 608.0,
14
+ "learning_rate": 1.801801801801802e-06,
15
+ "loss": 58.5641,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.04504504504504504,
20
+ "grad_norm": 532.0,
21
+ "learning_rate": 9.00900900900901e-06,
22
+ "loss": 54.6181,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.09009009009009009,
27
+ "grad_norm": 446.0,
28
+ "learning_rate": 1.801801801801802e-05,
29
+ "loss": 50.0236,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.13513513513513514,
34
+ "grad_norm": 193.0,
35
+ "learning_rate": 2.702702702702703e-05,
36
+ "loss": 33.1549,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.18018018018018017,
41
+ "grad_norm": 44.5,
42
+ "learning_rate": 3.603603603603604e-05,
43
+ "loss": 25.2428,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.22522522522522523,
48
+ "grad_norm": 26.625,
49
+ "learning_rate": 4.5045045045045046e-05,
50
+ "loss": 22.4735,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.2702702702702703,
55
+ "grad_norm": 17.25,
56
+ "learning_rate": 5.405405405405406e-05,
57
+ "loss": 20.4661,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.3153153153153153,
62
+ "grad_norm": 7.6875,
63
+ "learning_rate": 6.306306306306306e-05,
64
+ "loss": 19.1401,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.36036036036036034,
69
+ "grad_norm": 11.6875,
70
+ "learning_rate": 7.207207207207208e-05,
71
+ "loss": 18.3188,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.40540540540540543,
76
+ "grad_norm": 23.5,
77
+ "learning_rate": 8.108108108108109e-05,
78
+ "loss": 16.7622,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.45045045045045046,
83
+ "grad_norm": 56.25,
84
+ "learning_rate": 9.009009009009009e-05,
85
+ "loss": 12.6183,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.4954954954954955,
90
+ "grad_norm": 13.0,
91
+ "learning_rate": 9.90990990990991e-05,
92
+ "loss": 4.3593,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.5405405405405406,
97
+ "grad_norm": 3.875,
98
+ "learning_rate": 0.00010810810810810812,
99
+ "loss": 2.18,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.5855855855855856,
104
+ "grad_norm": 2.421875,
105
+ "learning_rate": 0.00011711711711711712,
106
+ "loss": 1.8179,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.6306306306306306,
111
+ "grad_norm": 3.265625,
112
+ "learning_rate": 0.00012612612612612612,
113
+ "loss": 1.5974,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.6756756756756757,
118
+ "grad_norm": 2.375,
119
+ "learning_rate": 0.00013513513513513514,
120
+ "loss": 1.486,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.7207207207207207,
125
+ "grad_norm": 1.5078125,
126
+ "learning_rate": 0.00014414414414414415,
127
+ "loss": 1.361,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.7657657657657657,
132
+ "grad_norm": 3.890625,
133
+ "learning_rate": 0.00015315315315315314,
134
+ "loss": 1.3001,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.8108108108108109,
139
+ "grad_norm": 4.46875,
140
+ "learning_rate": 0.00016216216216216218,
141
+ "loss": 1.261,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.8558558558558559,
146
+ "grad_norm": 5.03125,
147
+ "learning_rate": 0.0001711711711711712,
148
+ "loss": 1.2015,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.9009009009009009,
153
+ "grad_norm": 32.25,
154
+ "learning_rate": 0.00018018018018018018,
155
+ "loss": 1.1886,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.9459459459459459,
160
+ "grad_norm": 1.703125,
161
+ "learning_rate": 0.0001891891891891892,
162
+ "loss": 1.1679,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.990990990990991,
167
+ "grad_norm": 2.984375,
168
+ "learning_rate": 0.0001981981981981982,
169
+ "loss": 1.1572,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 1.0,
174
+ "eval_loss": 2.307225465774536,
175
+ "eval_runtime": 1.0056,
176
+ "eval_samples_per_second": 4.972,
177
+ "eval_steps_per_second": 1.989,
178
+ "step": 111
179
+ },
180
+ {
181
+ "epoch": 1.0360360360360361,
182
+ "grad_norm": 1.546875,
183
+ "learning_rate": 0.00019999208860571255,
184
+ "loss": 1.0473,
185
  "step": 115
186
  },
187
  {
188
+ "epoch": 1.0810810810810811,
189
+ "grad_norm": 1.546875,
190
+ "learning_rate": 0.0001999599507118322,
191
+ "loss": 1.0618,
192
  "step": 120
193
  },
194
  {
195
+ "epoch": 1.1261261261261262,
196
+ "grad_norm": 10.0,
197
+ "learning_rate": 0.00019990309979553045,
198
+ "loss": 1.0458,
199
  "step": 125
200
  },
201
  {
202
+ "epoch": 1.1711711711711712,
203
+ "grad_norm": 8.4375,
204
+ "learning_rate": 0.00019982154991201608,
205
+ "loss": 1.0364,
206
  "step": 130
207
  },
208
  {
209
+ "epoch": 1.2162162162162162,
210
+ "grad_norm": 2.0,
211
+ "learning_rate": 0.00019971532122280464,
212
+ "loss": 1.0457,
213
  "step": 135
214
  },
215
  {
216
+ "epoch": 1.2612612612612613,
217
+ "grad_norm": 1.4453125,
218
+ "learning_rate": 0.00019958443999073397,
219
+ "loss": 0.9906,
220
  "step": 140
221
  },
222
  {
223
+ "epoch": 1.3063063063063063,
224
+ "grad_norm": 18.25,
225
+ "learning_rate": 0.00019942893857347128,
226
+ "loss": 0.9911,
227
  "step": 145
228
  },
229
  {
230
+ "epoch": 1.3513513513513513,
231
+ "grad_norm": 2.578125,
232
+ "learning_rate": 0.0001992488554155135,
233
+ "loss": 0.9996,
234
  "step": 150
235
  },
236
  {
237
+ "epoch": 1.3963963963963963,
238
+ "grad_norm": 1.7734375,
239
+ "learning_rate": 0.00019904423503868247,
240
+ "loss": 0.9656,
241
  "step": 155
242
  },
243
  {
244
+ "epoch": 1.4414414414414414,
245
+ "grad_norm": 5.65625,
246
+ "learning_rate": 0.00019881512803111796,
247
+ "loss": 0.9753,
248
  "step": 160
249
  },
250
  {
251
+ "epoch": 1.4864864864864864,
252
+ "grad_norm": 3.78125,
253
+ "learning_rate": 0.00019856159103477086,
254
+ "loss": 0.9239,
255
  "step": 165
256
  },
257
  {
258
+ "epoch": 1.5315315315315314,
259
+ "grad_norm": 0.86328125,
260
+ "learning_rate": 0.00019828368673139947,
261
+ "loss": 0.9428,
262
  "step": 170
263
  },
264
  {
265
+ "epoch": 1.5765765765765765,
266
+ "grad_norm": 0.7265625,
267
+ "learning_rate": 0.00019798148382707296,
268
+ "loss": 0.9455,
269
  "step": 175
270
  },
271
  {
272
+ "epoch": 1.6216216216216215,
273
+ "grad_norm": 1.8125,
274
+ "learning_rate": 0.00019765505703518496,
275
+ "loss": 0.9373,
276
  "step": 180
277
  },
278
  {
279
+ "epoch": 1.6666666666666665,
280
+ "grad_norm": 1.0078125,
281
+ "learning_rate": 0.00019730448705798239,
282
+ "loss": 0.9659,
283
  "step": 185
284
  },
285
  {
286
+ "epoch": 1.7117117117117115,
287
+ "grad_norm": 3.859375,
288
+ "learning_rate": 0.00019692986056661356,
289
+ "loss": 0.9271,
290
  "step": 190
291
  },
292
  {
293
+ "epoch": 1.7567567567567568,
294
+ "grad_norm": 3.34375,
295
+ "learning_rate": 0.00019653127017970034,
296
+ "loss": 0.9303,
297
  "step": 195
298
  },
299
  {
300
+ "epoch": 1.8018018018018018,
301
+ "grad_norm": 0.91796875,
302
+ "learning_rate": 0.0001961088144404403,
303
+ "loss": 0.9333,
304
  "step": 200
305
  },
306
  {
307
+ "epoch": 1.8468468468468469,
308
+ "grad_norm": 1.4453125,
309
+ "learning_rate": 0.00019566259779224378,
310
+ "loss": 0.8923,
311
  "step": 205
312
  },
313
  {
314
+ "epoch": 1.8918918918918919,
315
+ "grad_norm": 3.171875,
316
+ "learning_rate": 0.00019519273055291266,
317
+ "loss": 0.9,
318
  "step": 210
319
  },
320
  {
321
+ "epoch": 1.936936936936937,
322
+ "grad_norm": 14.3125,
323
+ "learning_rate": 0.00019469932888736632,
324
+ "loss": 0.8988,
325
  "step": 215
326
  },
327
  {
328
+ "epoch": 1.981981981981982,
329
+ "grad_norm": 3.46875,
330
+ "learning_rate": 0.0001941825147789225,
331
+ "loss": 0.9296,
332
  "step": 220
333
  },
334
  {
335
+ "epoch": 2.0,
336
+ "eval_loss": 2.178852081298828,
337
+ "eval_runtime": 1.0053,
338
+ "eval_samples_per_second": 4.973,
339
+ "eval_steps_per_second": 1.989,
340
+ "step": 222
341
+ },
342
+ {
343
+ "epoch": 2.027027027027027,
344
+ "grad_norm": 1.4921875,
345
+ "learning_rate": 0.00019364241599913924,
346
+ "loss": 0.8696,
347
  "step": 225
348
  },
349
  {
350
+ "epoch": 2.0720720720720722,
351
+ "grad_norm": 2.65625,
352
+ "learning_rate": 0.0001930791660762262,
353
+ "loss": 0.8363,
354
  "step": 230
355
  },
356
  {
357
+ "epoch": 2.1171171171171173,
358
+ "grad_norm": 1.265625,
359
+ "learning_rate": 0.00019249290426203252,
360
+ "loss": 0.821,
361
  "step": 235
362
  },
363
  {
364
+ "epoch": 2.1621621621621623,
365
+ "grad_norm": 2.546875,
366
+ "learning_rate": 0.00019188377549761963,
367
+ "loss": 0.8511,
368
  "step": 240
369
  },
370
  {
371
+ "epoch": 2.2072072072072073,
372
+ "grad_norm": 0.828125,
373
+ "learning_rate": 0.0001912519303774276,
374
+ "loss": 0.8231,
375
  "step": 245
376
  },
377
  {
378
+ "epoch": 2.2522522522522523,
379
+ "grad_norm": 0.73046875,
380
+ "learning_rate": 0.000190597525112044,
381
+ "loss": 0.8496,
382
  "step": 250
383
  },
384
  {
385
+ "epoch": 2.2972972972972974,
386
+ "grad_norm": 1.2421875,
387
+ "learning_rate": 0.00018992072148958368,
388
+ "loss": 0.852,
389
  "step": 255
390
  },
391
  {
392
+ "epoch": 2.3423423423423424,
393
+ "grad_norm": 1.578125,
394
+ "learning_rate": 0.0001892216868356904,
395
+ "loss": 0.8131,
396
  "step": 260
397
  },
398
  {
399
+ "epoch": 2.3873873873873874,
400
+ "grad_norm": 1.5078125,
401
+ "learning_rate": 0.00018850059397216876,
402
+ "loss": 0.8483,
403
  "step": 265
404
  },
405
  {
406
+ "epoch": 2.4324324324324325,
407
+ "grad_norm": 1.125,
408
+ "learning_rate": 0.00018775762117425777,
409
+ "loss": 0.8432,
410
  "step": 270
411
  },
412
  {
413
+ "epoch": 2.4774774774774775,
414
+ "grad_norm": 0.6015625,
415
+ "learning_rate": 0.00018699295212655596,
416
+ "loss": 0.8493,
417
  "step": 275
418
  },
419
  {
420
+ "epoch": 2.5225225225225225,
421
+ "grad_norm": 0.8828125,
422
+ "learning_rate": 0.00018620677587760916,
423
+ "loss": 0.7998,
424
  "step": 280
425
  },
426
  {
427
+ "epoch": 2.5675675675675675,
428
+ "grad_norm": 0.73046875,
429
+ "learning_rate": 0.0001853992867931721,
430
+ "loss": 0.8256,
431
  "step": 285
432
  },
433
  {
434
+ "epoch": 2.6126126126126126,
435
+ "grad_norm": 0.6796875,
436
+ "learning_rate": 0.00018457068450815562,
437
+ "loss": 0.8162,
438
  "step": 290
439
  },
440
  {
441
+ "epoch": 2.6576576576576576,
442
+ "grad_norm": 0.671875,
443
+ "learning_rate": 0.0001837211738772711,
444
+ "loss": 0.8338,
445
  "step": 295
446
  },
447
  {
448
+ "epoch": 2.7027027027027026,
449
+ "grad_norm": 0.9140625,
450
+ "learning_rate": 0.00018285096492438424,
451
+ "loss": 0.8279,
452
  "step": 300
453
  },
454
  {
455
+ "epoch": 2.7477477477477477,
456
+ "grad_norm": 0.60546875,
457
+ "learning_rate": 0.00018196027279059117,
458
+ "loss": 0.7962,
459
  "step": 305
460
  },
461
  {
462
+ "epoch": 2.7927927927927927,
463
+ "grad_norm": 2.78125,
464
+ "learning_rate": 0.0001810493176810292,
465
+ "loss": 0.8192,
466
  "step": 310
467
  },
468
  {
469
+ "epoch": 2.8378378378378377,
470
+ "grad_norm": 0.63671875,
471
+ "learning_rate": 0.00018011832481043576,
472
+ "loss": 0.8147,
473
  "step": 315
474
  },
475
  {
476
+ "epoch": 2.8828828828828827,
477
+ "grad_norm": 0.56640625,
478
+ "learning_rate": 0.00017916752434746856,
479
+ "loss": 0.8255,
480
  "step": 320
481
  },
482
  {
483
+ "epoch": 2.9279279279279278,
484
+ "grad_norm": 1.8046875,
485
+ "learning_rate": 0.0001781971513578013,
486
+ "loss": 0.8059,
487
  "step": 325
488
  },
489
  {
490
+ "epoch": 2.972972972972973,
491
+ "grad_norm": 1.1640625,
492
+ "learning_rate": 0.00017720744574600863,
493
+ "loss": 0.8273,
494
  "step": 330
495
  },
496
  {
497
+ "epoch": 3.0,
498
+ "eval_loss": 2.1709225177764893,
499
+ "eval_runtime": 1.0054,
500
+ "eval_samples_per_second": 4.973,
501
+ "eval_steps_per_second": 1.989,
502
+ "step": 333
503
+ },
504
+ {
505
+ "epoch": 3.018018018018018,
506
+ "grad_norm": 0.83984375,
507
+ "learning_rate": 0.00017619865219625452,
508
+ "loss": 0.7934,
509
  "step": 335
510
  },
511
  {
512
+ "epoch": 3.063063063063063,
513
+ "grad_norm": 1.5859375,
514
+ "learning_rate": 0.00017517102011179933,
515
+ "loss": 0.7096,
516
  "step": 340
517
  },
518
  {
519
+ "epoch": 3.108108108108108,
520
+ "grad_norm": 0.91796875,
521
+ "learning_rate": 0.00017412480355334005,
522
+ "loss": 0.7203,
523
  "step": 345
524
  },
525
  {
526
+ "epoch": 3.153153153153153,
527
+ "grad_norm": 1.9296875,
528
+ "learning_rate": 0.00017306026117619889,
529
+ "loss": 0.7237,
530
  "step": 350
531
  },
532
  {
533
+ "epoch": 3.1981981981981984,
534
+ "grad_norm": 1.6328125,
535
+ "learning_rate": 0.00017197765616637636,
536
+ "loss": 0.738,
537
  "step": 355
538
  },
539
  {
540
+ "epoch": 3.2432432432432434,
541
+ "grad_norm": 2.0625,
542
+ "learning_rate": 0.00017087725617548385,
543
+ "loss": 0.7214,
544
  "step": 360
545
  },
546
  {
547
+ "epoch": 3.2882882882882885,
548
+ "grad_norm": 3.53125,
549
+ "learning_rate": 0.0001697593332545723,
550
+ "loss": 0.7549,
551
  "step": 365
552
  },
553
  {
554
+ "epoch": 3.3333333333333335,
555
+ "grad_norm": 4.25,
556
+ "learning_rate": 0.0001686241637868734,
557
+ "loss": 0.7575,
558
  "step": 370
559
  },
560
  {
561
+ "epoch": 3.3783783783783785,
562
+ "grad_norm": 1.6484375,
563
+ "learning_rate": 0.00016747202841946928,
564
+ "loss": 0.7392,
565
  "step": 375
566
  },
567
  {
568
+ "epoch": 3.4234234234234235,
569
+ "grad_norm": 1.6171875,
570
+ "learning_rate": 0.00016630321199390867,
571
+ "loss": 0.7251,
572
  "step": 380
573
  },
574
  {
575
+ "epoch": 3.4684684684684686,
576
+ "grad_norm": 1.5546875,
577
+ "learning_rate": 0.0001651180034757856,
578
+ "loss": 0.7285,
579
  "step": 385
580
  },
581
  {
582
+ "epoch": 3.5135135135135136,
583
+ "grad_norm": 1.0078125,
584
+ "learning_rate": 0.0001639166958832985,
585
+ "loss": 0.7114,
586
  "step": 390
587
  },
588
  {
589
+ "epoch": 3.5585585585585586,
590
+ "grad_norm": 1.2421875,
591
+ "learning_rate": 0.00016269958621480788,
592
+ "loss": 0.7223,
593
  "step": 395
594
  },
595
  {
596
+ "epoch": 3.6036036036036037,
597
+ "grad_norm": 0.61328125,
598
+ "learning_rate": 0.00016146697537540924,
599
+ "loss": 0.7273,
600
  "step": 400
601
  },
602
  {
603
+ "epoch": 3.6486486486486487,
604
+ "grad_norm": 0.67578125,
605
+ "learning_rate": 0.00016021916810254097,
606
+ "loss": 0.7328,
607
  "step": 405
608
  },
609
  {
610
+ "epoch": 3.6936936936936937,
611
+ "grad_norm": 1.1015625,
612
+ "learning_rate": 0.00015895647289064396,
613
+ "loss": 0.7409,
614
  "step": 410
615
  },
616
  {
617
+ "epoch": 3.7387387387387387,
618
+ "grad_norm": 0.7578125,
619
+ "learning_rate": 0.000157679201914893,
620
+ "loss": 0.7247,
621
  "step": 415
622
  },
623
  {
624
+ "epoch": 3.7837837837837838,
625
+ "grad_norm": 1.890625,
626
+ "learning_rate": 0.0001563876709540178,
627
+ "loss": 0.7446,
628
  "step": 420
629
  },
630
  {
631
+ "epoch": 3.828828828828829,
632
+ "grad_norm": 0.7109375,
633
+ "learning_rate": 0.0001550821993122334,
634
+ "loss": 0.7421,
635
  "step": 425
636
  },
637
  {
638
+ "epoch": 3.873873873873874,
639
+ "grad_norm": 0.73046875,
640
+ "learning_rate": 0.00015376310974029873,
641
+ "loss": 0.7362,
642
  "step": 430
643
  },
644
  {
645
+ "epoch": 3.918918918918919,
646
+ "grad_norm": 0.66015625,
647
+ "learning_rate": 0.00015243072835572318,
648
+ "loss": 0.7398,
649
  "step": 435
650
  },
651
  {
652
+ "epoch": 3.963963963963964,
653
+ "grad_norm": 0.69921875,
654
+ "learning_rate": 0.0001510853845621409,
655
+ "loss": 0.7586,
656
  "step": 440
657
  },
658
  {
659
+ "epoch": 4.0,
660
+ "eval_loss": 2.2163968086242676,
661
+ "eval_runtime": 1.0061,
662
+ "eval_samples_per_second": 4.97,
663
+ "eval_steps_per_second": 1.988,
664
+ "step": 444
665
+ },
666
+ {
667
+ "epoch": 4.009009009009009,
668
+ "grad_norm": 0.58203125,
669
+ "learning_rate": 0.00014972741096787242,
670
+ "loss": 0.7128,
671
  "step": 445
672
  },
673
  {
674
+ "epoch": 4.054054054054054,
675
+ "grad_norm": 0.75,
676
+ "learning_rate": 0.00014835714330369446,
677
+ "loss": 0.6463,
678
  "step": 450
679
  },
680
  {
681
+ "epoch": 4.099099099099099,
682
+ "grad_norm": 0.83203125,
683
+ "learning_rate": 0.00014697492033983707,
684
+ "loss": 0.6453,
685
  "step": 455
686
  },
687
  {
688
+ "epoch": 4.1441441441441444,
689
+ "grad_norm": 0.55859375,
690
+ "learning_rate": 0.00014558108380223012,
691
+ "loss": 0.647,
692
  "step": 460
693
  },
694
  {
695
+ "epoch": 4.1891891891891895,
696
+ "grad_norm": 1.3125,
697
+ "learning_rate": 0.00014417597828801832,
698
+ "loss": 0.626,
699
  "step": 465
700
  },
701
  {
702
+ "epoch": 4.2342342342342345,
703
+ "grad_norm": 0.85546875,
704
+ "learning_rate": 0.00014275995118036693,
705
+ "loss": 0.6334,
706
  "step": 470
707
  },
708
  {
709
+ "epoch": 4.2792792792792795,
710
+ "grad_norm": 0.69921875,
711
+ "learning_rate": 0.0001413333525625784,
712
+ "loss": 0.6435,
713
  "step": 475
714
  },
715
  {
716
+ "epoch": 4.324324324324325,
717
+ "grad_norm": 0.8046875,
718
+ "learning_rate": 0.00013989653513154165,
719
+ "loss": 0.6439,
720
  "step": 480
721
  },
722
  {
723
+ "epoch": 4.36936936936937,
724
+ "grad_norm": 1.0859375,
725
+ "learning_rate": 0.00013844985411053492,
726
+ "loss": 0.6559,
727
  "step": 485
728
  },
729
  {
730
+ "epoch": 4.414414414414415,
731
+ "grad_norm": 1.3359375,
732
+ "learning_rate": 0.00013699366716140435,
733
+ "loss": 0.6654,
734
  "step": 490
735
  },
736
  {
737
+ "epoch": 4.45945945945946,
738
+ "grad_norm": 0.80859375,
739
+ "learning_rate": 0.00013552833429613938,
740
+ "loss": 0.6783,
741
  "step": 495
742
  },
743
  {
744
+ "epoch": 4.504504504504505,
745
+ "grad_norm": 0.6875,
746
+ "learning_rate": 0.00013405421778786737,
747
+ "loss": 0.6543,
748
  "step": 500
749
  },
750
  {
751
+ "epoch": 4.54954954954955,
752
+ "grad_norm": 0.62890625,
753
+ "learning_rate": 0.00013257168208128908,
754
+ "loss": 0.6608,
755
  "step": 505
756
  },
757
  {
758
+ "epoch": 4.594594594594595,
759
+ "grad_norm": 0.60546875,
760
+ "learning_rate": 0.00013108109370257712,
761
+ "loss": 0.6621,
762
  "step": 510
763
  },
764
  {
765
+ "epoch": 4.63963963963964,
766
+ "grad_norm": 0.67578125,
767
+ "learning_rate": 0.00012958282116876026,
768
+ "loss": 0.656,
769
  "step": 515
770
  },
771
  {
772
+ "epoch": 4.684684684684685,
773
+ "grad_norm": 0.65234375,
774
+ "learning_rate": 0.00012807723489661495,
775
+ "loss": 0.6505,
776
  "step": 520
777
  },
778
  {
779
+ "epoch": 4.72972972972973,
780
+ "grad_norm": 0.921875,
781
+ "learning_rate": 0.00012656470711108764,
782
+ "loss": 0.6789,
783
  "step": 525
784
  },
785
  {
786
+ "epoch": 4.774774774774775,
787
+ "grad_norm": 0.61328125,
788
+ "learning_rate": 0.00012504561175326985,
789
+ "loss": 0.6588,
790
  "step": 530
791
  },
792
  {
793
+ "epoch": 4.81981981981982,
794
+ "grad_norm": 0.703125,
795
+ "learning_rate": 0.00012352032438794902,
796
+ "loss": 0.6534,
797
  "step": 535
798
  },
799
  {
800
+ "epoch": 4.864864864864865,
801
+ "grad_norm": 0.74609375,
802
+ "learning_rate": 0.00012198922211075778,
803
+ "loss": 0.6482,
804
  "step": 540
805
  },
806
  {
807
+ "epoch": 4.90990990990991,
808
+ "grad_norm": 0.94140625,
809
+ "learning_rate": 0.00012045268345494511,
810
+ "loss": 0.6595,
811
  "step": 545
812
  },
813
  {
814
+ "epoch": 4.954954954954955,
815
+ "grad_norm": 0.59765625,
816
+ "learning_rate": 0.00011891108829779165,
817
+ "loss": 0.6624,
818
+ "step": 550
 
819
  },
820
  {
821
+ "epoch": 5.0,
822
+ "grad_norm": 0.578125,
823
+ "learning_rate": 0.00011736481776669306,
824
+ "loss": 0.6613,
825
+ "step": 555
826
  },
827
  {
828
+ "epoch": 5.0,
829
+ "eval_loss": 2.3182225227355957,
830
+ "eval_runtime": 1.0028,
831
+ "eval_samples_per_second": 4.986,
832
+ "eval_steps_per_second": 1.994,
833
  "step": 555
834
  },
835
  {
836
+ "epoch": 5.045045045045045,
837
+ "grad_norm": 0.98046875,
838
+ "learning_rate": 0.0001158142541449341,
839
+ "loss": 0.5564,
840
  "step": 560
841
  },
842
  {
843
+ "epoch": 5.09009009009009,
844
+ "grad_norm": 0.69140625,
845
+ "learning_rate": 0.00011425978077717709,
846
+ "loss": 0.5273,
847
  "step": 565
848
  },
849
  {
850
+ "epoch": 5.135135135135135,
851
+ "grad_norm": 0.69921875,
852
+ "learning_rate": 0.00011270178197468789,
853
+ "loss": 0.5589,
854
  "step": 570
855
  },
856
  {
857
+ "epoch": 5.18018018018018,
858
+ "grad_norm": 0.6171875,
859
+ "learning_rate": 0.00011114064292032282,
860
+ "loss": 0.5593,
861
  "step": 575
862
  },
863
  {
864
+ "epoch": 5.225225225225225,
865
+ "grad_norm": 0.69921875,
866
+ "learning_rate": 0.00010957674957330042,
867
+ "loss": 0.5672,
868
  "step": 580
869
  },
870
  {
871
+ "epoch": 5.27027027027027,
872
+ "grad_norm": 0.69140625,
873
+ "learning_rate": 0.00010801048857378071,
874
+ "loss": 0.5444,
875
  "step": 585
876
  },
877
  {
878
+ "epoch": 5.315315315315315,
879
+ "grad_norm": 0.66796875,
880
+ "learning_rate": 0.00010644224714727681,
881
+ "loss": 0.5747,
882
  "step": 590
883
  },
884
  {
885
+ "epoch": 5.36036036036036,
886
+ "grad_norm": 0.68359375,
887
+ "learning_rate": 0.0001048724130089212,
888
+ "loss": 0.5609,
889
  "step": 595
890
  },
891
  {
892
+ "epoch": 5.405405405405405,
893
+ "grad_norm": 0.8984375,
894
+ "learning_rate": 0.00010330137426761135,
895
+ "loss": 0.5625,
896
  "step": 600
897
  },
898
  {
899
+ "epoch": 5.45045045045045,
900
+ "grad_norm": 0.76171875,
901
+ "learning_rate": 0.00010172951933005775,
902
+ "loss": 0.5671,
903
  "step": 605
904
  },
905
  {
906
+ "epoch": 5.495495495495495,
907
+ "grad_norm": 0.80859375,
908
+ "learning_rate": 0.00010015723680475846,
909
+ "loss": 0.564,
910
  "step": 610
911
  },
912
  {
913
+ "epoch": 5.54054054054054,
914
+ "grad_norm": 0.76171875,
915
+ "learning_rate": 9.858491540592382e-05,
916
+ "loss": 0.5784,
917
  "step": 615
918
  },
919
  {
920
+ "epoch": 5.585585585585585,
921
+ "grad_norm": 0.7265625,
922
+ "learning_rate": 9.70129438573747e-05,
923
+ "loss": 0.5672,
924
  "step": 620
925
  },
926
  {
927
+ "epoch": 5.63063063063063,
928
+ "grad_norm": 0.75390625,
929
+ "learning_rate": 9.54417107964389e-05,
930
+ "loss": 0.5592,
931
  "step": 625
932
  },
933
  {
934
+ "epoch": 5.675675675675675,
935
+ "grad_norm": 0.734375,
936
+ "learning_rate": 9.38716046778684e-05,
937
+ "loss": 0.5634,
938
  "step": 630
939
  },
940
  {
941
+ "epoch": 5.7207207207207205,
942
+ "grad_norm": 0.6640625,
943
+ "learning_rate": 9.230301367780208e-05,
944
+ "loss": 0.5691,
945
  "step": 635
946
  },
947
  {
948
+ "epoch": 5.7657657657657655,
949
+ "grad_norm": 0.6875,
950
+ "learning_rate": 9.07363255977973e-05,
951
+ "loss": 0.5722,
952
  "step": 640
953
  },
954
  {
955
+ "epoch": 5.8108108108108105,
956
+ "grad_norm": 0.76953125,
957
+ "learning_rate": 8.917192776895382e-05,
958
+ "loss": 0.5827,
959
  "step": 645
960
  },
961
  {
962
+ "epoch": 5.8558558558558556,
963
+ "grad_norm": 0.83203125,
964
+ "learning_rate": 8.76102069561545e-05,
965
+ "loss": 0.5745,
966
  "step": 650
967
  },
968
  {
969
+ "epoch": 5.900900900900901,
970
+ "grad_norm": 0.7265625,
971
+ "learning_rate": 8.605154926244543e-05,
972
+ "loss": 0.5614,
973
  "step": 655
974
  },
975
  {
976
+ "epoch": 5.945945945945946,
977
+ "grad_norm": 0.65625,
978
+ "learning_rate": 8.449634003358022e-05,
979
+ "loss": 0.5731,
980
  "step": 660
981
  },
982
  {
983
+ "epoch": 5.990990990990991,
984
+ "grad_norm": 0.8828125,
985
+ "learning_rate": 8.294496376275104e-05,
986
+ "loss": 0.577,
987
  "step": 665
988
  },
989
  {
990
+ "epoch": 6.0,
991
+ "eval_loss": 2.4773526191711426,
992
+ "eval_runtime": 1.0034,
993
+ "eval_samples_per_second": 4.983,
994
+ "eval_steps_per_second": 1.993,
995
+ "step": 666
996
+ },
997
+ {
998
+ "epoch": 6.036036036036036,
999
+ "grad_norm": 0.8984375,
1000
+ "learning_rate": 8.13978039955308e-05,
1001
+ "loss": 0.5142,
1002
  "step": 670
1003
  },
1004
  {
1005
+ "epoch": 6.081081081081081,
1006
+ "grad_norm": 0.8359375,
1007
+ "learning_rate": 7.985524323504948e-05,
1008
+ "loss": 0.4725,
1009
  "step": 675
1010
  },
1011
  {
1012
+ "epoch": 6.126126126126126,
1013
+ "grad_norm": 0.7734375,
1014
+ "learning_rate": 7.831766284742807e-05,
1015
+ "loss": 0.4671,
1016
  "step": 680
1017
  },
1018
  {
1019
+ "epoch": 6.171171171171171,
1020
+ "grad_norm": 0.7578125,
1021
+ "learning_rate": 7.678544296749384e-05,
1022
+ "loss": 0.4804,
1023
  "step": 685
1024
  },
1025
  {
1026
+ "epoch": 6.216216216216216,
1027
+ "grad_norm": 0.82421875,
1028
+ "learning_rate": 7.525896240479976e-05,
1029
+ "loss": 0.4704,
1030
+ "step": 690
 
1031
  },
1032
  {
+ "epoch": 6.261261261261261,
+ "grad_norm": 0.75,
+ "learning_rate": 7.37385985499718e-05,
+ "loss": 0.4659,
+ "step": 695
+ },
+ {
+ "epoch": 6.306306306306306,
+ "grad_norm": 0.71484375,
+ "learning_rate": 7.222472728140695e-05,
+ "loss": 0.4697,
+ "step": 700
+ },
+ {
+ "epoch": 6.351351351351352,
+ "grad_norm": 0.79296875,
+ "learning_rate": 7.071772287234497e-05,
+ "loss": 0.4912,
+ "step": 705
+ },
+ {
+ "epoch": 6.396396396396397,
+ "grad_norm": 0.76953125,
+ "learning_rate": 6.921795789833723e-05,
+ "loss": 0.4689,
+ "step": 710
+ },
+ {
+ "epoch": 6.441441441441442,
+ "grad_norm": 0.66796875,
+ "learning_rate": 6.772580314513508e-05,
+ "loss": 0.4753,
+ "step": 715
+ },
+ {
+ "epoch": 6.486486486486487,
+ "grad_norm": 0.75,
+ "learning_rate": 6.624162751702076e-05,
+ "loss": 0.4759,
+ "step": 720
+ },
+ {
+ "epoch": 6.531531531531532,
+ "grad_norm": 0.70703125,
+ "learning_rate": 6.476579794560356e-05,
+ "loss": 0.489,
+ "step": 725
+ },
+ {
+ "epoch": 6.576576576576577,
+ "grad_norm": 0.7265625,
+ "learning_rate": 6.329867929910347e-05,
+ "loss": 0.473,
+ "step": 730
+ },
+ {
+ "epoch": 6.621621621621622,
+ "grad_norm": 0.7109375,
+ "learning_rate": 6.184063429214515e-05,
+ "loss": 0.4793,
+ "step": 735
+ },
+ {
+ "epoch": 6.666666666666667,
+ "grad_norm": 0.76171875,
+ "learning_rate": 6.039202339608432e-05,
+ "loss": 0.5071,
+ "step": 740
+ },
+ {
+ "epoch": 6.711711711711712,
+ "grad_norm": 0.69921875,
+ "learning_rate": 5.895320474988864e-05,
+ "loss": 0.4741,
+ "step": 745
+ },
+ {
+ "epoch": 6.756756756756757,
+ "grad_norm": 0.69921875,
+ "learning_rate": 5.752453407159522e-05,
+ "loss": 0.4799,
+ "step": 750
+ },
+ {
+ "epoch": 6.801801801801802,
+ "grad_norm": 0.7578125,
+ "learning_rate": 5.610636457036693e-05,
+ "loss": 0.4901,
+ "step": 755
+ },
+ {
+ "epoch": 6.846846846846847,
+ "grad_norm": 0.6953125,
+ "learning_rate": 5.469904685916861e-05,
+ "loss": 0.4858,
+ "step": 760
+ },
+ {
+ "epoch": 6.891891891891892,
+ "grad_norm": 0.76953125,
+ "learning_rate": 5.33029288680852e-05,
+ "loss": 0.4895,
+ "step": 765
+ },
+ {
+ "epoch": 6.936936936936937,
+ "grad_norm": 0.70703125,
+ "learning_rate": 5.191835575830352e-05,
+ "loss": 0.4935,
+ "step": 770
+ },
+ {
+ "epoch": 6.981981981981982,
+ "grad_norm": 0.69921875,
+ "learning_rate": 5.0545669836778144e-05,
+ "loss": 0.4958,
+ "step": 775
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 2.7035882472991943,
+ "eval_runtime": 1.0058,
+ "eval_samples_per_second": 4.971,
+ "eval_steps_per_second": 1.988,
+ "step": 777
+ },
+ {
+ "epoch": 7.027027027027027,
+ "grad_norm": 0.6875,
+ "learning_rate": 4.918521047160308e-05,
+ "loss": 0.4443,
+ "step": 780
+ },
+ {
+ "epoch": 7.072072072072072,
+ "grad_norm": 0.7734375,
+ "learning_rate": 4.783731400811022e-05,
+ "loss": 0.4139,
+ "step": 785
+ },
+ {
+ "epoch": 7.117117117117117,
+ "grad_norm": 0.734375,
+ "learning_rate": 4.650231368571486e-05,
+ "loss": 0.41,
+ "step": 790
+ },
+ {
+ "epoch": 7.162162162162162,
+ "grad_norm": 0.90625,
+ "learning_rate": 4.518053955552903e-05,
+ "loss": 0.4291,
+ "step": 795
+ },
+ {
+ "epoch": 7.207207207207207,
+ "grad_norm": 0.71875,
+ "learning_rate": 4.387231839876349e-05,
+ "loss": 0.4141,
+ "step": 800
+ },
+ {
+ "epoch": 7.252252252252252,
+ "grad_norm": 0.7265625,
+ "learning_rate": 4.2577973645937674e-05,
+ "loss": 0.4139,
+ "step": 805
+ },
+ {
+ "epoch": 7.297297297297297,
+ "grad_norm": 0.76171875,
+ "learning_rate": 4.129782529691815e-05,
+ "loss": 0.4278,
+ "step": 810
+ },
+ {
+ "epoch": 7.342342342342342,
+ "grad_norm": 0.73046875,
+ "learning_rate": 4.003218984180552e-05,
+ "loss": 0.4148,
+ "step": 815
+ },
+ {
+ "epoch": 7.387387387387387,
+ "grad_norm": 0.79296875,
+ "learning_rate": 3.878138018268866e-05,
+ "loss": 0.4168,
+ "step": 820
+ },
+ {
+ "epoch": 7.4324324324324325,
+ "grad_norm": 0.82421875,
+ "learning_rate": 3.7545705556286126e-05,
+ "loss": 0.4182,
+ "step": 825
+ },
+ {
+ "epoch": 7.4774774774774775,
+ "grad_norm": 0.70703125,
+ "learning_rate": 3.632547145749395e-05,
+ "loss": 0.4239,
+ "step": 830
+ },
+ {
+ "epoch": 7.5225225225225225,
+ "grad_norm": 0.78515625,
+ "learning_rate": 3.5120979563858266e-05,
+ "loss": 0.4137,
+ "step": 835
+ },
+ {
+ "epoch": 7.5675675675675675,
+ "grad_norm": 0.73828125,
+ "learning_rate": 3.393252766099187e-05,
+ "loss": 0.4111,
+ "step": 840
+ },
+ {
+ "epoch": 7.612612612612613,
+ "grad_norm": 0.7421875,
+ "learning_rate": 3.2760409568952766e-05,
+ "loss": 0.4179,
+ "step": 845
+ },
+ {
+ "epoch": 7.657657657657658,
+ "grad_norm": 0.76171875,
+ "learning_rate": 3.1604915069603436e-05,
+ "loss": 0.429,
+ "step": 850
+ },
+ {
+ "epoch": 7.702702702702703,
+ "grad_norm": 0.75,
+ "learning_rate": 3.0466329834968233e-05,
+ "loss": 0.4118,
+ "step": 855
+ },
+ {
+ "epoch": 7.747747747747748,
+ "grad_norm": 0.71484375,
+ "learning_rate": 2.9344935356606773e-05,
+ "loss": 0.4049,
+ "step": 860
+ },
+ {
+ "epoch": 7.792792792792793,
+ "grad_norm": 0.74609375,
+ "learning_rate": 2.8241008876021215e-05,
+ "loss": 0.413,
+ "step": 865
+ },
+ {
+ "epoch": 7.837837837837838,
+ "grad_norm": 0.72265625,
+ "learning_rate": 2.7154823316113932e-05,
+ "loss": 0.4071,
+ "step": 870
+ },
+ {
+ "epoch": 7.882882882882883,
+ "grad_norm": 0.734375,
+ "learning_rate": 2.60866472137129e-05,
+ "loss": 0.4073,
+ "step": 875
+ },
+ {
+ "epoch": 7.927927927927928,
+ "grad_norm": 0.71875,
+ "learning_rate": 2.5036744653181753e-05,
+ "loss": 0.4124,
+ "step": 880
+ },
+ {
+ "epoch": 7.972972972972973,
+ "grad_norm": 0.7578125,
+ "learning_rate": 2.4005375201130274e-05,
+ "loss": 0.4205,
+ "step": 885
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 2.9689488410949707,
+ "eval_runtime": 1.0053,
+ "eval_samples_per_second": 4.973,
+ "eval_steps_per_second": 1.989,
+ "step": 888
+ },
+ {
+ "epoch": 8.018018018018019,
+ "grad_norm": 0.68359375,
+ "learning_rate": 2.29927938422419e-05,
+ "loss": 0.4012,
+ "step": 890
+ },
+ {
+ "epoch": 8.063063063063064,
+ "grad_norm": 1.0078125,
+ "learning_rate": 2.199925091623418e-05,
+ "loss": 0.3781,
+ "step": 895
+ },
+ {
+ "epoch": 8.108108108108109,
+ "grad_norm": 0.8671875,
+ "learning_rate": 2.102499205596743e-05,
+ "loss": 0.3809,
+ "step": 900
+ },
+ {
+ "epoch": 8.153153153153154,
+ "grad_norm": 0.70703125,
+ "learning_rate": 2.0070258126717e-05,
+ "loss": 0.3699,
+ "step": 905
+ },
+ {
+ "epoch": 8.198198198198199,
+ "grad_norm": 0.6875,
+ "learning_rate": 1.913528516662452e-05,
+ "loss": 0.3742,
+ "step": 910
+ },
+ {
+ "epoch": 8.243243243243244,
+ "grad_norm": 0.70703125,
+ "learning_rate": 1.8220304328342252e-05,
+ "loss": 0.378,
+ "step": 915
+ },
+ {
+ "epoch": 8.288288288288289,
+ "grad_norm": 0.70703125,
+ "learning_rate": 1.7325541821885384e-05,
+ "loss": 0.3842,
+ "step": 920
+ },
+ {
+ "epoch": 8.333333333333334,
+ "grad_norm": 0.75390625,
+ "learning_rate": 1.6451218858706374e-05,
+ "loss": 0.3894,
+ "step": 925
+ },
+ {
+ "epoch": 8.378378378378379,
+ "grad_norm": 0.71875,
+ "learning_rate": 1.5597551597004966e-05,
+ "loss": 0.3758,
+ "step": 930
+ },
+ {
+ "epoch": 8.423423423423424,
+ "grad_norm": 0.671875,
+ "learning_rate": 1.476475108828762e-05,
+ "loss": 0.3717,
+ "step": 935
+ },
+ {
+ "epoch": 8.468468468468469,
+ "grad_norm": 0.703125,
+ "learning_rate": 1.3953023225189243e-05,
+ "loss": 0.3771,
+ "step": 940
+ },
+ {
+ "epoch": 8.513513513513514,
+ "grad_norm": 0.71875,
+ "learning_rate": 1.3162568690570743e-05,
+ "loss": 0.3759,
+ "step": 945
+ },
+ {
+ "epoch": 8.558558558558559,
+ "grad_norm": 0.74609375,
+ "learning_rate": 1.23935829079042e-05,
+ "loss": 0.3786,
+ "step": 950
+ },
+ {
+ "epoch": 8.603603603603604,
+ "grad_norm": 0.7109375,
+ "learning_rate": 1.1646255992958466e-05,
+ "loss": 0.3734,
+ "step": 955
+ },
+ {
+ "epoch": 8.64864864864865,
+ "grad_norm": 0.7265625,
+ "learning_rate": 1.0920772706797167e-05,
+ "loss": 0.3809,
+ "step": 960
+ },
+ {
+ "epoch": 8.693693693693694,
+ "grad_norm": 0.7109375,
+ "learning_rate": 1.0217312410100089e-05,
+ "loss": 0.3767,
+ "step": 965
+ },
+ {
+ "epoch": 8.73873873873874,
+ "grad_norm": 0.68359375,
+ "learning_rate": 9.536049018820192e-06,
+ "loss": 0.3786,
+ "step": 970
+ },
+ {
+ "epoch": 8.783783783783784,
+ "grad_norm": 0.71875,
+ "learning_rate": 8.87715096118642e-06,
+ "loss": 0.3786,
+ "step": 975
+ },
+ {
+ "epoch": 8.82882882882883,
+ "grad_norm": 0.74609375,
+ "learning_rate": 8.240781136063346e-06,
+ "loss": 0.3868,
+ "step": 980
+ },
+ {
+ "epoch": 8.873873873873874,
+ "grad_norm": 0.72265625,
+ "learning_rate": 7.6270968726777414e-06,
+ "loss": 0.3767,
+ "step": 985
+ },
+ {
+ "epoch": 8.91891891891892,
+ "grad_norm": 0.7578125,
+ "learning_rate": 7.03624989172228e-06,
+ "loss": 0.3791,
+ "step": 990
+ },
+ {
+ "epoch": 8.963963963963964,
+ "grad_norm": 0.71875,
+ "learning_rate": 6.468386267845717e-06,
+ "loss": 0.382,
+ "step": 995
+ },
+ {
+ "epoch": 9.0,
+ "eval_loss": 3.2251663208007812,
+ "eval_runtime": 1.0069,
+ "eval_samples_per_second": 4.966,
+ "eval_steps_per_second": 1.986,
+ "step": 999
+ },
+ {
+ "epoch": 9.00900900900901,
+ "grad_norm": 0.71484375,
+ "learning_rate": 5.9236463935389065e-06,
+ "loss": 0.3794,
+ "step": 1000
+ },
+ {
+ "epoch": 9.054054054054054,
+ "grad_norm": 0.671875,
+ "learning_rate": 5.402164944425758e-06,
+ "loss": 0.3777,
+ "step": 1005
+ },
+ {
+ "epoch": 9.0990990990991,
+ "grad_norm": 0.734375,
+ "learning_rate": 4.904070845967468e-06,
+ "loss": 0.3779,
+ "step": 1010
+ },
+ {
+ "epoch": 9.144144144144144,
+ "grad_norm": 0.703125,
+ "learning_rate": 4.429487241588304e-06,
+ "loss": 0.3744,
+ "step": 1015
+ },
+ {
+ "epoch": 9.18918918918919,
+ "grad_norm": 0.6953125,
+ "learning_rate": 3.9785314622310495e-06,
+ "loss": 0.3694,
+ "step": 1020
+ },
+ {
+ "epoch": 9.234234234234235,
+ "grad_norm": 0.71875,
+ "learning_rate": 3.5513149973492976e-06,
+ "loss": 0.3751,
+ "step": 1025
+ },
+ {
+ "epoch": 9.27927927927928,
+ "grad_norm": 0.7265625,
+ "learning_rate": 3.1479434673440167e-06,
+ "loss": 0.3685,
+ "step": 1030
+ },
+ {
+ "epoch": 9.324324324324325,
+ "grad_norm": 0.68359375,
+ "learning_rate": 2.7685165974510986e-06,
+ "loss": 0.3653,
+ "step": 1035
+ },
+ {
+ "epoch": 9.36936936936937,
+ "grad_norm": 0.71484375,
+ "learning_rate": 2.4131281930864002e-06,
+ "loss": 0.3728,
+ "step": 1040
+ },
+ {
+ "epoch": 9.414414414414415,
+ "grad_norm": 0.734375,
+ "learning_rate": 2.0818661166542074e-06,
+ "loss": 0.3693,
+ "step": 1045
+ },
+ {
+ "epoch": 9.45945945945946,
+ "grad_norm": 0.6875,
+ "learning_rate": 1.7748122658251876e-06,
+ "loss": 0.3764,
+ "step": 1050
+ },
+ {
+ "epoch": 9.504504504504505,
+ "grad_norm": 0.7265625,
+ "learning_rate": 1.4920425532888526e-06,
+ "loss": 0.3654,
+ "step": 1055
+ },
+ {
+ "epoch": 9.54954954954955,
+ "grad_norm": 0.66796875,
+ "learning_rate": 1.2336268879856727e-06,
+ "loss": 0.3747,
+ "step": 1060
+ },
+ {
+ "epoch": 9.594594594594595,
+ "grad_norm": 0.69140625,
+ "learning_rate": 9.996291578236228e-07,
+ "loss": 0.3711,
+ "step": 1065
+ },
+ {
+ "epoch": 9.63963963963964,
+ "grad_norm": 0.71484375,
+ "learning_rate": 7.901072138831511e-07,
+ "loss": 0.3722,
+ "step": 1070
+ },
+ {
+ "epoch": 9.684684684684685,
+ "grad_norm": 0.7109375,
+ "learning_rate": 6.051128561147756e-07,
+ "loss": 0.3612,
+ "step": 1075
+ },
+ {
+ "epoch": 9.72972972972973,
+ "grad_norm": 0.74609375,
+ "learning_rate": 4.44691820532539e-07,
+ "loss": 0.3647,
+ "step": 1080
+ },
+ {
+ "epoch": 9.774774774774775,
+ "grad_norm": 0.6875,
+ "learning_rate": 3.0888376790679795e-07,
+ "loss": 0.3672,
+ "step": 1085
+ },
+ {
+ "epoch": 9.81981981981982,
+ "grad_norm": 0.6484375,
+ "learning_rate": 1.977222739588891e-07,
+ "loss": 0.3659,
+ "step": 1090
+ },
+ {
+ "epoch": 9.864864864864865,
+ "grad_norm": 0.67578125,
+ "learning_rate": 1.1123482106021322e-07,
+ "loss": 0.3692,
+ "step": 1095
+ },
+ {
+ "epoch": 9.90990990990991,
+ "grad_norm": 0.6875,
+ "learning_rate": 4.9442791437848136e-08,
+ "loss": 0.3663,
+ "step": 1100
+ },
+ {
+ "epoch": 9.954954954954955,
+ "grad_norm": 0.6875,
+ "learning_rate": 1.2361461888166226e-08,
+ "loss": 0.3673,
+ "step": 1105
+ },
+ {
+ "epoch": 10.0,
+ "grad_norm": 0.6640625,
+ "learning_rate": 0.0,
+ "loss": 0.372,
+ "step": 1110
+ },
+ {
+ "epoch": 10.0,
+ "eval_loss": 3.242992401123047,
+ "eval_runtime": 1.0031,
+ "eval_samples_per_second": 4.984,
+ "eval_steps_per_second": 1.994,
+ "step": 1110
+ },
+ {
+ "epoch": 10.0,
+ "step": 1110,
+ "total_flos": 1.697049221804327e+18,
+ "train_loss": 1.8630313719715084,
+ "train_runtime": 9058.6901,
+ "train_samples_per_second": 1.957,
+ "train_steps_per_second": 0.123
  }
  ],
  "logging_steps": 5,
+ "max_steps": 1110,
  "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
  "TrainerControl": {

  "attributes": {}
  }
  },
+ "total_flos": 1.697049221804327e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null