li-muyang committed on
Commit 5cf725b · verified · 1 Parent(s): 40d6bef

Model save

Files changed (4)
  1. README.md +26 -15
  2. all_results.json +6 -11
  3. train_results.json +6 -6
  4. trainer_state.json +1769 -886
README.md CHANGED
@@ -3,15 +3,11 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: mistralai/Mistral-7B-v0.1
5
  tags:
6
- - alignment-handbook
7
- - trl
8
- - sft
9
- - generated_from_trainer
10
  - trl
11
  - sft
12
  - generated_from_trainer
13
  datasets:
14
- - HuggingFaceH4/ultrachat_200k
15
  model-index:
16
  - name: zephyr-7b-sft-full
17
  results: []
@@ -22,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
22
 
23
  # zephyr-7b-sft-full
24
 
25
- This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the HuggingFaceH4/ultrachat_200k dataset.
26
  It achieves the following results on the evaluation set:
27
- - Loss: 0.9420
28
 
29
  ## Model description
30
 
@@ -45,22 +41,37 @@ More information needed
45
  The following hyperparameters were used during training:
46
  - learning_rate: 2e-05
47
  - train_batch_size: 16
48
- - eval_batch_size: 8
49
  - seed: 42
50
  - distributed_type: multi-GPU
51
- - num_devices: 8
52
- - total_train_batch_size: 128
53
- - total_eval_batch_size: 64
54
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
55
  - lr_scheduler_type: cosine
56
  - lr_scheduler_warmup_ratio: 0.1
57
- - num_epochs: 1
58
 
59
  ### Training results
60
 
61
- | Training Loss | Epoch | Step | Validation Loss |
62
- |:-------------:|:-----:|:----:|:---------------:|
63
- | 0.9183 | 1.0 | 1084 | 0.9420 |
64
 
65
 
66
  ### Framework versions
 
3
  license: apache-2.0
4
  base_model: mistralai/Mistral-7B-v0.1
5
  tags:
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
  datasets:
10
+ - generator
11
  model-index:
12
  - name: zephyr-7b-sft-full
13
  results: []
 
18
 
19
  # zephyr-7b-sft-full
20
 
21
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 0.9934
24
 
25
  ## Model description
26
 
 
41
  The following hyperparameters were used during training:
42
  - learning_rate: 2e-05
43
  - train_batch_size: 16
44
+ - eval_batch_size: 16
45
  - seed: 42
46
  - distributed_type: multi-GPU
47
+ - num_devices: 16
48
+ - total_train_batch_size: 256
49
+ - total_eval_batch_size: 256
50
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
51
  - lr_scheduler_type: cosine
52
  - lr_scheduler_warmup_ratio: 0.1
53
+ - num_epochs: 3.0
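The effective batch sizes above follow from the per-device settings and device count; a quick sanity check (a sketch only, assuming the usual Trainer arithmetic of per-device batch × number of devices, with gradient accumulation assumed to be 1 since none is listed) reproduces the reported totals:

```python
# Sketch: verify the effective batch sizes reported in the card.
per_device_train_batch_size = 16
per_device_eval_batch_size = 16
num_devices = 16
gradient_accumulation_steps = 1  # assumption: not listed in the card

total_train_batch_size = per_device_train_batch_size * num_devices * gradient_accumulation_steps
total_eval_batch_size = per_device_eval_batch_size * num_devices

print(total_train_batch_size)  # 256, as reported
print(total_eval_batch_size)   # 256, as reported
```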
54
 
55
  ### Training results
56
 
57
+ | Training Loss | Epoch | Step | Validation Loss |
58
+ |:-------------:|:------:|:----:|:---------------:|
59
+ | 0.9681 | 0.1845 | 100 | 0.9788 |
60
+ | 0.9962 | 0.3690 | 200 | 1.0030 |
61
+ | 0.9917 | 0.5535 | 300 | 1.0008 |
62
+ | 0.9652 | 0.7380 | 400 | 0.9939 |
63
+ | 0.9666 | 0.9225 | 500 | 0.9816 |
64
+ | 0.7366 | 1.1070 | 600 | 0.9852 |
65
+ | 0.7228 | 1.2915 | 700 | 0.9835 |
66
+ | 0.7319 | 1.4760 | 800 | 0.9644 |
67
+ | 0.7177 | 1.6605 | 900 | 0.9529 |
68
+ | 0.7095 | 1.8450 | 1000 | 0.9394 |
69
+ | 0.4465 | 2.0295 | 1100 | 0.9917 |
70
+ | 0.4341 | 2.2140 | 1200 | 0.9979 |
71
+ | 0.432 | 2.3985 | 1300 | 0.9954 |
72
+ | 0.4301 | 2.5830 | 1400 | 0.9943 |
73
+ | 0.4361 | 2.7675 | 1500 | 0.9931 |
74
+ | 0.4256 | 2.9520 | 1600 | 0.9934 |
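The Epoch column follows directly from the run length: 1,626 optimizer steps over 3 epochs is 542 steps per epoch, and evaluation fires every 100 steps, so step 100 lands at epoch ≈ 0.1845 and step 1600 at ≈ 2.952. A small illustrative sketch reproduces the column:

```python
# Sketch: reproduce the Epoch column of the results table.
total_steps, num_epochs, eval_every = 1626, 3, 100
steps_per_epoch = total_steps / num_epochs  # 542.0

for step in range(eval_every, total_steps + 1, eval_every):
    print(step, round(step / steps_per_epoch, 4))
# 100 0.1845, 200 0.369, ..., 1600 2.952 -- matching the table
```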
75
 
76
 
77
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_loss": 0.9419716000556946,
4
- "eval_runtime": 916.5084,
5
- "eval_samples": 23109,
6
- "eval_samples_per_second": 16.748,
7
- "eval_steps_per_second": 0.262,
8
- "total_flos": 453935093514240.0,
9
- "train_loss": 0.9848188322408613,
10
- "train_runtime": 36728.3484,
11
  "train_samples": 207864,
12
- "train_samples_per_second": 3.776,
13
- "train_steps_per_second": 0.03
14
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "total_flos": 1361805280542720.0,
4
+ "train_loss": 0.713569560815634,
5
+ "train_runtime": 59769.2599,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 6.961,
8
+ "train_steps_per_second": 0.027
9
  }
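The new throughput figures are consistent with the step count and effective batch size: 1,626 steps in 59,769 s gives the reported 0.027 steps/s, and 1,626 × 256 samples over the same runtime gives roughly the reported 6.961 samples/s (the small gap is expected from partially filled final batches). A sketch of the cross-check, using only values from this commit:

```python
# Sketch: cross-check the reported throughput against the run statistics.
global_steps = 1626
total_train_batch_size = 256
train_runtime_s = 59769.2599

steps_per_second = global_steps / train_runtime_s                             # ~0.0272 (reported 0.027)
samples_per_second = global_steps * total_train_batch_size / train_runtime_s  # ~6.96  (reported 6.961)
print(round(steps_per_second, 3), round(samples_per_second, 3))
```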
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "total_flos": 453935093514240.0,
4
- "train_loss": 0.9848188322408613,
5
- "train_runtime": 36728.3484,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 3.776,
8
- "train_steps_per_second": 0.03
9
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "total_flos": 1361805280542720.0,
4
+ "train_loss": 0.713569560815634,
5
+ "train_runtime": 59769.2599,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 6.961,
8
+ "train_steps_per_second": 0.027
9
  }
trainer_state.json CHANGED
@@ -1,1554 +1,2437 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
- "eval_steps": 100.0,
6
- "global_step": 1084,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0009225092250922509,
13
- "grad_norm": 9.199869276397154,
14
- "learning_rate": 1.8348623853211012e-07,
15
  "loss": 1.1392,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.004612546125461255,
20
- "grad_norm": 7.233201206284356,
21
- "learning_rate": 9.174311926605506e-07,
22
- "loss": 1.1291,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.00922509225092251,
27
- "grad_norm": 4.144472619198066,
28
- "learning_rate": 1.8348623853211011e-06,
29
- "loss": 1.0628,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.013837638376383764,
34
- "grad_norm": 2.6527022539947374,
35
- "learning_rate": 2.7522935779816517e-06,
36
- "loss": 1.0102,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.01845018450184502,
41
- "grad_norm": 2.7475157939980743,
42
- "learning_rate": 3.6697247706422022e-06,
43
- "loss": 1.0126,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.023062730627306273,
48
- "grad_norm": 2.3644745727694243,
49
- "learning_rate": 4.587155963302753e-06,
50
- "loss": 1.0031,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.027675276752767528,
55
- "grad_norm": 2.2459942557783146,
56
- "learning_rate": 5.504587155963303e-06,
57
- "loss": 0.9714,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.03228782287822878,
62
- "grad_norm": 2.8633038603162237,
63
- "learning_rate": 6.422018348623854e-06,
64
- "loss": 0.998,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.03690036900369004,
69
- "grad_norm": 3.2535687429097493,
70
- "learning_rate": 7.3394495412844045e-06,
71
- "loss": 1.0051,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.04151291512915129,
76
- "grad_norm": 3.2137144467730088,
77
- "learning_rate": 8.256880733944956e-06,
78
- "loss": 0.9837,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.046125461254612546,
83
- "grad_norm": 3.229126865982439,
84
- "learning_rate": 9.174311926605506e-06,
85
- "loss": 0.9808,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.0507380073800738,
90
- "grad_norm": 2.559458190466229,
91
- "learning_rate": 1.0091743119266055e-05,
92
- "loss": 0.9816,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.055350553505535055,
97
- "grad_norm": 2.2546043237809603,
98
- "learning_rate": 1.1009174311926607e-05,
99
- "loss": 0.9956,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.05996309963099631,
104
- "grad_norm": 1.9138564480742573,
105
- "learning_rate": 1.1926605504587156e-05,
106
- "loss": 0.9995,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.06457564575645756,
111
- "grad_norm": 2.5216610932942256,
112
- "learning_rate": 1.2844036697247708e-05,
113
- "loss": 0.9789,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.06918819188191883,
118
- "grad_norm": 2.3850758267877654,
119
- "learning_rate": 1.3761467889908258e-05,
120
- "loss": 0.9904,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.07380073800738007,
125
- "grad_norm": 2.4735172195856485,
126
- "learning_rate": 1.4678899082568809e-05,
127
- "loss": 0.9855,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.07841328413284133,
132
- "grad_norm": 2.534406745674543,
133
- "learning_rate": 1.559633027522936e-05,
134
- "loss": 1.0087,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.08302583025830258,
139
- "grad_norm": 2.0301929785147577,
140
- "learning_rate": 1.6513761467889912e-05,
141
- "loss": 1.0092,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.08763837638376384,
146
- "grad_norm": 2.440748895363732,
147
- "learning_rate": 1.743119266055046e-05,
148
- "loss": 1.0159,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.09225092250922509,
153
- "grad_norm": 1.8603714211581104,
154
- "learning_rate": 1.834862385321101e-05,
155
- "loss": 1.0225,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.09686346863468635,
160
- "grad_norm": 2.056122625713521,
161
- "learning_rate": 1.9266055045871563e-05,
162
- "loss": 1.0501,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.1014760147601476,
167
- "grad_norm": 1.8374568202095534,
168
- "learning_rate": 1.9999948088910656e-05,
169
- "loss": 1.0353,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.10608856088560886,
174
- "grad_norm": 2.1690401471869323,
175
- "learning_rate": 1.9998131257372878e-05,
176
- "loss": 1.0457,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.11070110701107011,
181
- "grad_norm": 2.333795049697145,
182
- "learning_rate": 1.999371941029485e-05,
183
- "loss": 1.0262,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.11531365313653137,
188
- "grad_norm": 2.193045375950034,
189
- "learning_rate": 1.9986713692771732e-05,
190
- "loss": 1.0522,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.11992619926199262,
195
- "grad_norm": 2.090844050250459,
196
- "learning_rate": 1.9977115923137912e-05,
197
- "loss": 1.0223,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.12453874538745388,
202
- "grad_norm": 1.9519590219761234,
203
- "learning_rate": 1.9964928592495046e-05,
204
- "loss": 1.0536,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 0.12915129151291513,
209
- "grad_norm": 1.9148433671554628,
210
- "learning_rate": 1.9950154864065497e-05,
211
- "loss": 1.0495,
212
  "step": 140
213
  },
214
  {
215
- "epoch": 0.13376383763837638,
216
- "grad_norm": 1.8328907186676047,
217
- "learning_rate": 1.993279857237133e-05,
218
- "loss": 1.029,
219
  "step": 145
220
  },
221
  {
222
- "epoch": 0.13837638376383765,
223
- "grad_norm": 1.890355857976019,
224
- "learning_rate": 1.9912864222239045e-05,
225
- "loss": 1.0171,
226
  "step": 150
227
  },
228
  {
229
- "epoch": 0.1429889298892989,
230
- "grad_norm": 1.883026281091422,
231
- "learning_rate": 1.9890356987630362e-05,
232
- "loss": 1.0687,
233
  "step": 155
234
  },
235
  {
236
- "epoch": 0.14760147601476015,
237
- "grad_norm": 2.0771293885484727,
238
- "learning_rate": 1.986528271029931e-05,
239
- "loss": 1.0274,
240
  "step": 160
241
  },
242
  {
243
- "epoch": 0.1522140221402214,
244
- "grad_norm": 2.359666912377762,
245
- "learning_rate": 1.9837647898276008e-05,
246
- "loss": 1.0406,
247
  "step": 165
248
  },
249
  {
250
- "epoch": 0.15682656826568267,
251
- "grad_norm": 2.3232785879151168,
252
- "learning_rate": 1.9807459724177497e-05,
253
- "loss": 1.0415,
254
  "step": 170
255
  },
256
  {
257
- "epoch": 0.16143911439114392,
258
- "grad_norm": 2.0936905877765186,
259
- "learning_rate": 1.977472602334609e-05,
260
- "loss": 1.033,
261
  "step": 175
262
  },
263
  {
264
- "epoch": 0.16605166051660517,
265
- "grad_norm": 1.8878158554896975,
266
- "learning_rate": 1.973945529181572e-05,
267
- "loss": 1.0364,
268
  "step": 180
269
  },
270
  {
271
- "epoch": 0.1706642066420664,
272
- "grad_norm": 1.9593900450172317,
273
- "learning_rate": 1.9701656684106764e-05,
274
- "loss": 1.0367,
275
  "step": 185
276
  },
277
  {
278
- "epoch": 0.1752767527675277,
279
- "grad_norm": 1.5858821175634874,
280
- "learning_rate": 1.9661340010850025e-05,
281
- "loss": 1.0214,
282
  "step": 190
283
  },
284
  {
285
- "epoch": 0.17988929889298894,
286
- "grad_norm": 1.6698918973649843,
287
- "learning_rate": 1.9618515736240353e-05,
288
- "loss": 1.0275,
289
  "step": 195
290
  },
291
  {
292
- "epoch": 0.18450184501845018,
293
- "grad_norm": 2.4565319786652853,
294
- "learning_rate": 1.9573194975320672e-05,
295
- "loss": 1.0387,
296
  "step": 200
297
  },
298
  {
299
- "epoch": 0.18911439114391143,
300
- "grad_norm": 1.8022901836910148,
301
- "learning_rate": 1.952538949109708e-05,
302
- "loss": 1.0283,
303
  "step": 205
304
  },
305
  {
306
- "epoch": 0.1937269372693727,
307
- "grad_norm": 1.8754516081940102,
308
- "learning_rate": 1.9475111691485737e-05,
309
- "loss": 1.0091,
310
  "step": 210
311
  },
312
  {
313
- "epoch": 0.19833948339483395,
314
- "grad_norm": 1.7377386091028952,
315
- "learning_rate": 1.9422374626092414e-05,
316
- "loss": 1.0196,
317
  "step": 215
318
  },
319
  {
320
- "epoch": 0.2029520295202952,
321
- "grad_norm": 1.6554468249518657,
322
- "learning_rate": 1.936719198282545e-05,
323
- "loss": 1.04,
324
  "step": 220
325
  },
326
  {
327
- "epoch": 0.20756457564575645,
328
- "grad_norm": 1.5846514749193226,
329
- "learning_rate": 1.930957808434307e-05,
330
- "loss": 1.0456,
331
  "step": 225
332
  },
333
  {
334
- "epoch": 0.21217712177121772,
335
- "grad_norm": 1.8827810885248515,
336
- "learning_rate": 1.9249547884335917e-05,
337
- "loss": 1.0264,
338
  "step": 230
339
  },
340
  {
341
- "epoch": 0.21678966789667897,
342
- "grad_norm": 1.7322775011210476,
343
- "learning_rate": 1.9187116963645845e-05,
344
- "loss": 1.0454,
345
  "step": 235
346
  },
347
  {
348
- "epoch": 0.22140221402214022,
349
- "grad_norm": 1.5658002998985245,
350
- "learning_rate": 1.912230152622189e-05,
351
- "loss": 1.0343,
352
  "step": 240
353
  },
354
  {
355
- "epoch": 0.22601476014760147,
356
- "grad_norm": 1.725101976536917,
357
- "learning_rate": 1.9055118394914545e-05,
358
- "loss": 1.0343,
359
  "step": 245
360
  },
361
  {
362
- "epoch": 0.23062730627306274,
363
- "grad_norm": 2.1904906773576087,
364
- "learning_rate": 1.898558500710939e-05,
365
- "loss": 1.0318,
366
  "step": 250
367
  },
368
  {
369
- "epoch": 0.235239852398524,
370
- "grad_norm": 1.8347760248935339,
371
- "learning_rate": 1.891371941020121e-05,
372
- "loss": 1.0389,
373
  "step": 255
374
  },
375
  {
376
- "epoch": 0.23985239852398524,
377
- "grad_norm": 1.857980454997456,
378
- "learning_rate": 1.88395402569098e-05,
379
- "loss": 1.0476,
380
  "step": 260
381
  },
382
  {
383
- "epoch": 0.2444649446494465,
384
- "grad_norm": 2.290390982017683,
385
- "learning_rate": 1.8763066800438638e-05,
386
- "loss": 1.0509,
387
  "step": 265
388
  },
389
  {
390
- "epoch": 0.24907749077490776,
391
- "grad_norm": 1.820494807037894,
392
- "learning_rate": 1.868431888947773e-05,
393
- "loss": 1.0473,
394
  "step": 270
395
  },
396
  {
397
- "epoch": 0.253690036900369,
398
- "grad_norm": 1.6939194646151865,
399
- "learning_rate": 1.860331696305188e-05,
400
- "loss": 1.0259,
401
  "step": 275
402
  },
403
  {
404
- "epoch": 0.25830258302583026,
405
- "grad_norm": 1.8537886490677649,
406
- "learning_rate": 1.852008204521572e-05,
407
- "loss": 1.0352,
408
  "step": 280
409
  },
410
  {
411
- "epoch": 0.2629151291512915,
412
- "grad_norm": 1.6085016709020972,
413
- "learning_rate": 1.8434635739596945e-05,
414
- "loss": 1.0253,
415
  "step": 285
416
  },
417
  {
418
- "epoch": 0.26752767527675275,
419
- "grad_norm": 1.6014838440490224,
420
- "learning_rate": 1.834700022378907e-05,
421
- "loss": 1.0361,
422
  "step": 290
423
  },
424
  {
425
- "epoch": 0.272140221402214,
426
- "grad_norm": 1.7338858274869917,
427
- "learning_rate": 1.825719824359524e-05,
428
- "loss": 1.0272,
429
  "step": 295
430
  },
431
  {
432
- "epoch": 0.2767527675276753,
433
- "grad_norm": 1.5899229118868345,
434
- "learning_rate": 1.816525310712456e-05,
435
- "loss": 1.0341,
436
  "step": 300
437
  },
438
  {
439
- "epoch": 0.28136531365313655,
440
- "grad_norm": 2.348086425746126,
441
- "learning_rate": 1.8071188678742457e-05,
442
- "loss": 1.0104,
443
  "step": 305
444
  },
445
  {
446
- "epoch": 0.2859778597785978,
447
- "grad_norm": 1.7668549809740794,
448
- "learning_rate": 1.7975029372876706e-05,
449
- "loss": 1.0333,
450
  "step": 310
451
  },
452
  {
453
- "epoch": 0.29059040590405905,
454
- "grad_norm": 1.6328725360023482,
455
- "learning_rate": 1.787680014768065e-05,
456
- "loss": 1.0221,
457
  "step": 315
458
  },
459
  {
460
- "epoch": 0.2952029520295203,
461
- "grad_norm": 1.6065778746306763,
462
- "learning_rate": 1.777652649855531e-05,
463
- "loss": 1.0219,
464
  "step": 320
465
  },
466
  {
467
- "epoch": 0.29981549815498154,
468
- "grad_norm": 1.6424952210499195,
469
- "learning_rate": 1.7674234451532065e-05,
470
- "loss": 1.0315,
471
  "step": 325
472
  },
473
  {
474
- "epoch": 0.3044280442804428,
475
- "grad_norm": 1.644109341241818,
476
- "learning_rate": 1.7569950556517566e-05,
477
- "loss": 1.0441,
478
  "step": 330
479
  },
480
  {
481
- "epoch": 0.30904059040590404,
482
- "grad_norm": 1.679640047213138,
483
- "learning_rate": 1.7463701880402738e-05,
484
- "loss": 1.0393,
485
  "step": 335
486
  },
487
  {
488
- "epoch": 0.31365313653136534,
489
- "grad_norm": 1.6684149890786646,
490
- "learning_rate": 1.7355516000037555e-05,
491
- "loss": 1.0293,
492
  "step": 340
493
  },
494
  {
495
- "epoch": 0.3182656826568266,
496
- "grad_norm": 1.5914184889923237,
497
- "learning_rate": 1.7245420995073453e-05,
498
- "loss": 1.0378,
499
  "step": 345
500
  },
501
  {
502
- "epoch": 0.32287822878228783,
503
- "grad_norm": 1.5692073860267395,
504
- "learning_rate": 1.7133445440675268e-05,
505
- "loss": 1.0143,
506
  "step": 350
507
  },
508
  {
509
- "epoch": 0.3274907749077491,
510
- "grad_norm": 1.5995342549553517,
511
- "learning_rate": 1.7019618400104572e-05,
512
- "loss": 1.0238,
513
  "step": 355
514
  },
515
  {
516
- "epoch": 0.33210332103321033,
517
- "grad_norm": 1.9464601435555215,
518
- "learning_rate": 1.6903969417176244e-05,
519
- "loss": 1.0288,
520
  "step": 360
521
  },
522
  {
523
- "epoch": 0.3367158671586716,
524
- "grad_norm": 1.57749745993734,
525
- "learning_rate": 1.6786528508590436e-05,
526
- "loss": 1.0185,
527
  "step": 365
528
  },
529
  {
530
- "epoch": 0.3413284132841328,
531
- "grad_norm": 3.1064222839234703,
532
- "learning_rate": 1.666732615614169e-05,
533
- "loss": 1.042,
534
  "step": 370
535
  },
536
  {
537
- "epoch": 0.3459409594095941,
538
- "grad_norm": 1.6220955769831102,
539
- "learning_rate": 1.6546393298807405e-05,
540
- "loss": 1.0267,
541
  "step": 375
542
  },
543
  {
544
- "epoch": 0.3505535055350554,
545
- "grad_norm": 1.4978378904701755,
546
- "learning_rate": 1.6423761324717636e-05,
547
- "loss": 1.0183,
548
  "step": 380
549
  },
550
  {
551
- "epoch": 0.3551660516605166,
552
- "grad_norm": 1.5484493050267714,
553
- "learning_rate": 1.6299462063008272e-05,
554
- "loss": 0.999,
555
  "step": 385
556
  },
557
  {
558
- "epoch": 0.35977859778597787,
559
- "grad_norm": 1.591209884115851,
560
- "learning_rate": 1.61735277755598e-05,
561
- "loss": 1.0099,
562
  "step": 390
563
  },
564
  {
565
- "epoch": 0.3643911439114391,
566
- "grad_norm": 1.5725700106534855,
567
- "learning_rate": 1.6045991148623752e-05,
568
- "loss": 1.03,
569
  "step": 395
570
  },
571
  {
572
- "epoch": 0.36900369003690037,
573
- "grad_norm": 1.605757495194556,
574
- "learning_rate": 1.5916885284338937e-05,
575
- "loss": 1.0104,
576
  "step": 400
577
  },
578
  {
579
- "epoch": 0.3736162361623616,
580
- "grad_norm": 1.5888640482959957,
581
- "learning_rate": 1.5786243692139826e-05,
582
- "loss": 1.0178,
583
  "step": 405
584
  },
585
  {
586
- "epoch": 0.37822878228782286,
587
- "grad_norm": 1.500246112968274,
588
- "learning_rate": 1.5654100280059155e-05,
589
- "loss": 1.0043,
590
  "step": 410
591
  },
592
  {
593
- "epoch": 0.3828413284132841,
594
- "grad_norm": 1.5809504761491522,
595
- "learning_rate": 1.5520489345927095e-05,
596
- "loss": 0.9976,
597
  "step": 415
598
  },
599
  {
600
- "epoch": 0.3874538745387454,
601
- "grad_norm": 1.8367787118836345,
602
- "learning_rate": 1.538544556846925e-05,
603
- "loss": 1.0417,
604
  "step": 420
605
  },
606
  {
607
- "epoch": 0.39206642066420666,
608
- "grad_norm": 1.492213176709814,
609
- "learning_rate": 1.5249003998305787e-05,
610
- "loss": 1.0099,
611
  "step": 425
612
  },
613
  {
614
- "epoch": 0.3966789667896679,
615
- "grad_norm": 1.5368549913004959,
616
- "learning_rate": 1.5111200048854055e-05,
617
- "loss": 1.0144,
618
  "step": 430
619
  },
620
  {
621
- "epoch": 0.40129151291512916,
622
- "grad_norm": 1.3456499423014299,
623
- "learning_rate": 1.4972069487137024e-05,
624
- "loss": 0.9951,
625
  "step": 435
626
  },
627
  {
628
- "epoch": 0.4059040590405904,
629
- "grad_norm": 1.4852289451988536,
630
- "learning_rate": 1.4831648424499953e-05,
631
- "loss": 1.0113,
632
  "step": 440
633
  },
634
  {
635
- "epoch": 0.41051660516605165,
636
- "grad_norm": 1.5328177858690297,
637
- "learning_rate": 1.4689973307237687e-05,
638
- "loss": 1.0115,
639
  "step": 445
640
  },
641
  {
642
- "epoch": 0.4151291512915129,
643
- "grad_norm": 1.5547653083379547,
644
- "learning_rate": 1.4547080907135024e-05,
645
- "loss": 1.0186,
646
  "step": 450
647
  },
648
  {
649
- "epoch": 0.41974169741697415,
650
- "grad_norm": 1.6012528016619334,
651
- "learning_rate": 1.4403008311922593e-05,
652
- "loss": 0.9945,
653
  "step": 455
654
  },
655
  {
656
- "epoch": 0.42435424354243545,
657
- "grad_norm": 1.455131706606304,
658
- "learning_rate": 1.4257792915650728e-05,
659
- "loss": 0.9964,
660
  "step": 460
661
  },
662
  {
663
- "epoch": 0.4289667896678967,
664
- "grad_norm": 1.487042234387078,
665
- "learning_rate": 1.4111472408983843e-05,
666
- "loss": 1.0065,
667
  "step": 465
668
  },
669
  {
670
- "epoch": 0.43357933579335795,
671
- "grad_norm": 1.5691743840811192,
672
- "learning_rate": 1.3964084769417823e-05,
673
- "loss": 1.02,
674
  "step": 470
675
  },
676
  {
677
- "epoch": 0.4381918819188192,
678
- "grad_norm": 1.5181505967295013,
679
- "learning_rate": 1.3815668251422953e-05,
680
- "loss": 1.0144,
681
  "step": 475
682
  },
683
  {
684
- "epoch": 0.44280442804428044,
685
- "grad_norm": 1.5130152153427416,
686
- "learning_rate": 1.3666261376514978e-05,
687
- "loss": 1.0013,
688
  "step": 480
689
  },
690
  {
691
- "epoch": 0.4474169741697417,
692
- "grad_norm": 1.4674728937674146,
693
- "learning_rate": 1.3515902923256832e-05,
694
- "loss": 1.0205,
695
  "step": 485
696
  },
697
  {
698
- "epoch": 0.45202952029520294,
699
- "grad_norm": 1.8760830941169657,
700
- "learning_rate": 1.3364631917193671e-05,
701
- "loss": 0.9969,
702
  "step": 490
703
  },
704
  {
705
- "epoch": 0.4566420664206642,
706
- "grad_norm": 1.4141733000626033,
707
- "learning_rate": 1.321248762072377e-05,
708
- "loss": 0.9836,
709
  "step": 495
710
  },
711
  {
712
- "epoch": 0.4612546125461255,
713
- "grad_norm": 1.6973965594511555,
714
- "learning_rate": 1.3059509522907998e-05,
715
- "loss": 1.0202,
716
  "step": 500
717
  },
718
  {
719
- "epoch": 0.46586715867158673,
720
- "grad_norm": 1.5269161840167254,
721
- "learning_rate": 1.2905737329220394e-05,
722
- "loss": 0.993,
723
  "step": 505
724
  },
725
  {
726
- "epoch": 0.470479704797048,
727
- "grad_norm": 1.4909374182409396,
728
- "learning_rate": 1.2751210951242636e-05,
729
- "loss": 1.0086,
730
  "step": 510
731
  },
732
  {
733
- "epoch": 0.47509225092250923,
734
- "grad_norm": 1.4153798189966285,
735
- "learning_rate": 1.2595970496304975e-05,
736
- "loss": 1.0111,
737
  "step": 515
738
  },
739
  {
740
- "epoch": 0.4797047970479705,
741
- "grad_norm": 1.5032582612787087,
742
- "learning_rate": 1.2440056257076376e-05,
743
- "loss": 1.0006,
744
  "step": 520
745
  },
746
  {
747
- "epoch": 0.4843173431734317,
748
- "grad_norm": 1.5509414258002427,
749
- "learning_rate": 1.2283508701106559e-05,
750
- "loss": 0.9802,
751
  "step": 525
752
  },
753
  {
754
- "epoch": 0.488929889298893,
755
- "grad_norm": 1.4222240180884669,
756
- "learning_rate": 1.2126368460322637e-05,
757
- "loss": 0.9947,
758
  "step": 530
759
  },
760
  {
761
- "epoch": 0.4935424354243542,
762
- "grad_norm": 1.4826692820488787,
763
- "learning_rate": 1.1968676320483103e-05,
764
- "loss": 0.9787,
765
  "step": 535
766
  },
767
  {
768
- "epoch": 0.4981549815498155,
769
- "grad_norm": 1.6379227632536115,
770
- "learning_rate": 1.1810473210591882e-05,
771
- "loss": 0.9932,
772
  "step": 540
773
  },
774
  {
775
- "epoch": 0.5027675276752768,
776
- "grad_norm": 1.4409220105845035,
777
- "learning_rate": 1.1651800192275197e-05,
778
- "loss": 0.9823,
779
  "step": 545
780
  },
781
  {
782
- "epoch": 0.507380073800738,
783
- "grad_norm": 1.5563205928244186,
784
- "learning_rate": 1.1492698449124042e-05,
785
- "loss": 0.9902,
786
  "step": 550
787
  },
788
  {
789
- "epoch": 0.5119926199261993,
790
- "grad_norm": 1.5195778886181963,
791
- "learning_rate": 1.1333209276004959e-05,
792
- "loss": 0.9963,
793
  "step": 555
794
  },
795
  {
796
- "epoch": 0.5166051660516605,
797
- "grad_norm": 1.5087175941528819,
798
- "learning_rate": 1.1173374068341962e-05,
799
- "loss": 0.9862,
800
  "step": 560
801
  },
802
  {
803
- "epoch": 0.5212177121771218,
804
- "grad_norm": 1.4673889879261985,
805
- "learning_rate": 1.1013234311372353e-05,
806
- "loss": 0.9816,
807
  "step": 565
808
  },
809
  {
810
- "epoch": 0.525830258302583,
811
- "grad_norm": 1.435236236989395,
812
- "learning_rate": 1.0852831569379217e-05,
813
- "loss": 0.9793,
814
  "step": 570
815
  },
816
  {
817
- "epoch": 0.5304428044280443,
818
- "grad_norm": 1.4114527764890656,
819
- "learning_rate": 1.0692207474903421e-05,
820
- "loss": 0.9791,
821
  "step": 575
822
  },
823
  {
824
- "epoch": 0.5350553505535055,
825
- "grad_norm": 1.420567074345227,
826
- "learning_rate": 1.0531403717937888e-05,
827
- "loss": 0.9773,
828
  "step": 580
829
  },
830
  {
831
- "epoch": 0.5396678966789668,
832
- "grad_norm": 1.4586967012029641,
833
- "learning_rate": 1.037046203510694e-05,
834
- "loss": 0.9769,
835
  "step": 585
836
  },
837
  {
838
- "epoch": 0.544280442804428,
839
- "grad_norm": 1.366063014582902,
840
- "learning_rate": 1.0209424198833571e-05,
841
- "loss": 0.9675,
842
  "step": 590
843
  },
844
  {
845
- "epoch": 0.5488929889298892,
846
- "grad_norm": 1.45679701107198,
847
- "learning_rate": 1.0048332006497406e-05,
848
- "loss": 0.9955,
849
  "step": 595
850
  },
851
  {
852
- "epoch": 0.5535055350553506,
853
- "grad_norm": 1.3265263093097257,
854
- "learning_rate": 9.887227269586184e-06,
855
- "loss": 0.9734,
856
  "step": 600
857
  },
858
  {
859
- "epoch": 0.5581180811808119,
860
- "grad_norm": 1.4378603761053164,
861
- "learning_rate": 9.7261518028436e-06,
862
- "loss": 0.9793,
863
  "step": 605
864
  },
865
  {
866
- "epoch": 0.5627306273062731,
867
- "grad_norm": 1.46938788585248,
868
- "learning_rate": 9.565147413416266e-06,
869
- "loss": 0.989,
870
  "step": 610
871
  },
872
  {
873
- "epoch": 0.5673431734317343,
874
- "grad_norm": 1.3602627962743308,
875
- "learning_rate": 9.404255890002677e-06,
876
- "loss": 0.9739,
877
  "step": 615
878
  },
879
  {
880
- "epoch": 0.5719557195571956,
881
- "grad_norm": 1.3509149354182792,
882
- "learning_rate": 9.243518992006944e-06,
883
- "loss": 0.9811,
884
  "step": 620
885
  },
886
  {
887
- "epoch": 0.5765682656826568,
888
- "grad_norm": 1.3289296656282001,
889
- "learning_rate": 9.082978438700138e-06,
890
- "loss": 0.969,
891
  "step": 625
892
  },
893
  {
894
- "epoch": 0.5811808118081181,
895
- "grad_norm": 1.3748603146537095,
896
- "learning_rate": 8.922675898392072e-06,
897
- "loss": 0.9783,
898
  "step": 630
899
  },
900
  {
901
- "epoch": 0.5857933579335793,
902
- "grad_norm": 1.4054407615389235,
903
- "learning_rate": 8.762652977616258e-06,
904
- "loss": 0.9872,
905
  "step": 635
906
  },
907
  {
908
- "epoch": 0.5904059040590406,
909
- "grad_norm": 1.4266146283393177,
910
- "learning_rate": 8.602951210330942e-06,
911
- "loss": 0.9875,
912
  "step": 640
913
  },
914
  {
915
- "epoch": 0.5950184501845018,
916
- "grad_norm": 1.3972501984892904,
917
- "learning_rate": 8.443612047138965e-06,
918
- "loss": 0.9622,
919
  "step": 645
920
  },
921
  {
922
- "epoch": 0.5996309963099631,
923
- "grad_norm": 1.3682517939432157,
924
- "learning_rate": 8.284676844529258e-06,
925
- "loss": 0.9803,
926
  "step": 650
927
  },
928
  {
929
- "epoch": 0.6042435424354243,
930
- "grad_norm": 1.3429050015886888,
931
- "learning_rate": 8.126186854142752e-06,
932
- "loss": 0.9712,
933
  "step": 655
934
  },
935
  {
936
- "epoch": 0.6088560885608856,
937
- "grad_norm": 1.4594824193372815,
938
- "learning_rate": 7.968183212065537e-06,
939
- "loss": 0.9622,
940
  "step": 660
941
  },
942
  {
943
- "epoch": 0.6134686346863468,
944
- "grad_norm": 1.3448226541798638,
945
- "learning_rate": 7.81070692815195e-06,
946
- "loss": 0.9722,
947
  "step": 665
948
  },
949
  {
950
- "epoch": 0.6180811808118081,
951
- "grad_norm": 1.3271486276830222,
952
- "learning_rate": 7.6537988753805e-06,
953
- "loss": 0.9757,
954
  "step": 670
955
  },
956
  {
957
- "epoch": 0.6226937269372693,
958
- "grad_norm": 1.3444906745257086,
959
- "learning_rate": 7.497499779245268e-06,
960
- "loss": 0.9727,
961
  "step": 675
962
  },
963
  {
964
- "epoch": 0.6273062730627307,
965
- "grad_norm": 1.3631286023111704,
966
- "learning_rate": 7.3418502071856004e-06,
967
- "loss": 0.966,
968
  "step": 680
969
  },
970
  {
971
- "epoch": 0.6319188191881919,
972
- "grad_norm": 1.4595389618690304,
973
- "learning_rate": 7.186890558056836e-06,
974
- "loss": 0.9646,
975
  "step": 685
976
  },
977
  {
978
- "epoch": 0.6365313653136532,
979
- "grad_norm": 1.3472290480867384,
980
- "learning_rate": 7.0326610516447825e-06,
981
- "loss": 0.9619,
982
  "step": 690
983
  },
984
  {
985
- "epoch": 0.6411439114391144,
986
- "grad_norm": 1.8007090081101473,
987
- "learning_rate": 6.879201718226658e-06,
988
- "loss": 0.9771,
989
  "step": 695
990
  },
991
  {
992
- "epoch": 0.6457564575645757,
993
- "grad_norm": 1.3774637805586714,
994
- "learning_rate": 6.7265523881812335e-06,
995
- "loss": 0.9421,
996
  "step": 700
997
  },
998
  {
999
- "epoch": 0.6503690036900369,
1000
- "grad_norm": 1.346269997716162,
1001
- "learning_rate": 6.574752681650864e-06,
1002
- "loss": 0.9418,
1003
  "step": 705
1004
  },
1005
  {
1006
- "epoch": 0.6549815498154982,
1007
- "grad_norm": 1.2742494405200788,
1008
- "learning_rate": 6.423841998258069e-06,
1009
- "loss": 0.9475,
1010
  "step": 710
1011
  },
1012
  {
1013
- "epoch": 0.6595940959409594,
1014
- "grad_norm": 1.3211417146326536,
1015
- "learning_rate": 6.273859506879365e-06,
1016
- "loss": 0.9624,
1017
  "step": 715
1018
  },
1019
  {
1020
- "epoch": 0.6642066420664207,
1021
- "grad_norm": 1.3166822514896896,
1022
- "learning_rate": 6.124844135478971e-06,
1023
- "loss": 0.9627,
1024
  "step": 720
1025
  },
1026
  {
1027
- "epoch": 0.6688191881918819,
1028
- "grad_norm": 1.3120951075995704,
1029
- "learning_rate": 5.976834561005069e-06,
1030
- "loss": 0.9508,
1031
  "step": 725
1032
  },
1033
  {
1034
- "epoch": 0.6734317343173432,
1035
- "grad_norm": 1.3290407324477753,
1036
- "learning_rate": 5.829869199351188e-06,
1037
- "loss": 0.9504,
1038
  "step": 730
1039
  },
1040
  {
1041
- "epoch": 0.6780442804428044,
1042
- "grad_norm": 1.297841981615856,
1043
- "learning_rate": 5.68398619538536e-06,
1044
- "loss": 0.9528,
1045
  "step": 735
1046
  },
1047
  {
1048
- "epoch": 0.6826568265682657,
1049
- "grad_norm": 1.3297326740637123,
1050
- "learning_rate": 5.53922341304961e-06,
1051
- "loss": 0.953,
1052
  "step": 740
1053
  },
1054
  {
1055
- "epoch": 0.6872693726937269,
1056
- "grad_norm": 1.270375299309487,
1057
- "learning_rate": 5.39561842553239e-06,
1058
- "loss": 0.9556,
1059
  "step": 745
1060
  },
1061
  {
1062
- "epoch": 0.6918819188191881,
1063
- "grad_norm": 1.3325965856351196,
1064
- "learning_rate": 5.2532085055164205e-06,
1065
- "loss": 0.9466,
1066
  "step": 750
1067
  },
1068
  {
1069
- "epoch": 0.6964944649446494,
1070
- "grad_norm": 1.384058443252621,
1071
- "learning_rate": 5.112030615504601e-06,
1072
- "loss": 0.9568,
1073
  "step": 755
1074
  },
1075
  {
1076
- "epoch": 0.7011070110701108,
1077
- "grad_norm": 1.3367666302254895,
1078
- "learning_rate": 4.972121398226371e-06,
1079
- "loss": 0.9515,
1080
  "step": 760
1081
  },
1082
  {
1083
- "epoch": 0.705719557195572,
1084
- "grad_norm": 1.2932861344083342,
1085
- "learning_rate": 4.833517167127077e-06,
1086
- "loss": 0.9542,
1087
  "step": 765
1088
  },
1089
  {
1090
- "epoch": 0.7103321033210332,
1091
- "grad_norm": 1.3002291268448138,
1092
- "learning_rate": 4.6962538969428416e-06,
1093
- "loss": 0.9493,
1094
  "step": 770
1095
  },
1096
  {
1097
- "epoch": 0.7149446494464945,
1098
- "grad_norm": 1.3319825264438672,
1099
- "learning_rate": 4.560367214363295e-06,
1100
- "loss": 0.9402,
1101
  "step": 775
1102
  },
1103
  {
1104
- "epoch": 0.7195571955719557,
1105
- "grad_norm": 1.3114581144726238,
1106
- "learning_rate": 4.425892388784681e-06,
1107
- "loss": 0.9418,
1108
  "step": 780
1109
  },
1110
  {
1111
- "epoch": 0.724169741697417,
1112
- "grad_norm": 1.2845316321533105,
1113
- "learning_rate": 4.292864323155684e-06,
1114
- "loss": 0.941,
1115
  "step": 785
1116
  },
1117
  {
1118
- "epoch": 0.7287822878228782,
1119
- "grad_norm": 1.3576670016030674,
1120
- "learning_rate": 4.161317544918345e-06,
1121
- "loss": 0.9514,
1122
  "step": 790
1123
  },
1124
  {
1125
- "epoch": 0.7333948339483395,
1126
- "grad_norm": 1.3650798187979218,
1127
- "learning_rate": 4.031286197046493e-06,
1128
- "loss": 0.9358,
1129
  "step": 795
1130
  },
1131
  {
1132
- "epoch": 0.7380073800738007,
1133
- "grad_norm": 1.2681901564602476,
1134
- "learning_rate": 3.902804029183907e-06,
1135
- "loss": 0.9258,
1136
  "step": 800
1137
  },
1138
  {
1139
- "epoch": 0.742619926199262,
1140
- "grad_norm": 1.3091859558198018,
1141
- "learning_rate": 3.775904388884618e-06,
1142
- "loss": 0.9597,
1143
  "step": 805
1144
  },
1145
  {
1146
- "epoch": 0.7472324723247232,
1147
- "grad_norm": 1.2731852848090845,
1148
- "learning_rate": 3.650620212957524e-06,
1149
- "loss": 0.9791,
1150
  "step": 810
1151
  },
1152
  {
1153
- "epoch": 0.7518450184501845,
1154
- "grad_norm": 1.4443358599132654,
1155
- "learning_rate": 3.5269840189176616e-06,
1156
- "loss": 0.9559,
1157
  "step": 815
1158
  },
1159
  {
1160
- "epoch": 0.7564575645756457,
1161
- "grad_norm": 1.2637245159696282,
1162
- "learning_rate": 3.405027896546277e-06,
1163
- "loss": 0.9522,
1164
  "step": 820
1165
  },
1166
  {
1167
- "epoch": 0.761070110701107,
1168
- "grad_norm": 1.2944529056122984,
1169
- "learning_rate": 3.2847834995619067e-06,
1170
- "loss": 0.9334,
1171
  "step": 825
1172
  },
1173
  {
1174
- "epoch": 0.7656826568265682,
1175
- "grad_norm": 1.2996833234879939,
1176
- "learning_rate": 3.1662820374046776e-06,
1177
- "loss": 0.9406,
1178
  "step": 830
1179
  },
1180
  {
1181
- "epoch": 0.7702952029520295,
1182
- "grad_norm": 1.3668529549836468,
1183
- "learning_rate": 3.0495542671358745e-06,
1184
- "loss": 0.9494,
1185
  "step": 835
1186
  },
1187
  {
1188
- "epoch": 0.7749077490774908,
1189
- "grad_norm": 1.271209707662383,
1190
- "learning_rate": 2.934630485454948e-06,
1191
- "loss": 0.9587,
1192
  "step": 840
1193
  },
1194
  {
1195
- "epoch": 0.7795202952029521,
1196
- "grad_norm": 1.373724729262656,
1197
- "learning_rate": 2.8215405208360237e-06,
1198
- "loss": 0.9267,
1199
  "step": 845
1200
  },
1201
  {
1202
- "epoch": 0.7841328413284133,
1203
- "grad_norm": 1.2858397081440855,
1204
- "learning_rate": 2.7103137257858867e-06,
1205
- "loss": 0.9368,
1206
  "step": 850
1207
  },
1208
  {
1209
- "epoch": 0.7887453874538746,
1210
- "grad_norm": 1.3265877626226328,
1211
- "learning_rate": 2.600978969225558e-06,
1212
- "loss": 0.9363,
1213
  "step": 855
1214
  },
1215
  {
1216
- "epoch": 0.7933579335793358,
1217
- "grad_norm": 1.3059330060904089,
1218
- "learning_rate": 2.493564628997369e-06,
1219
- "loss": 0.9331,
1220
  "step": 860
1221
  },
1222
  {
1223
- "epoch": 0.7979704797047971,
1224
- "grad_norm": 1.2933907163136256,
1225
- "learning_rate": 2.3880985844994674e-06,
1226
- "loss": 0.9315,
1227
  "step": 865
1228
  },
1229
  {
1230
- "epoch": 0.8025830258302583,
1231
- "grad_norm": 1.3825698113730438,
1232
- "learning_rate": 2.284608209449746e-06,
1233
- "loss": 0.9379,
1234
  "step": 870
1235
  },
1236
  {
1237
- "epoch": 0.8071955719557196,
1238
- "grad_norm": 1.2868105668821606,
1239
- "learning_rate": 2.183120364780975e-06,
1240
- "loss": 0.9371,
1241
  "step": 875
1242
  },
1243
  {
1244
- "epoch": 0.8118081180811808,
1245
- "grad_norm": 1.313632486404665,
1246
- "learning_rate": 2.083661391669043e-06,
1247
- "loss": 0.9338,
1248
  "step": 880
1249
  },
1250
  {
1251
- "epoch": 0.816420664206642,
1252
- "grad_norm": 1.2863066943500752,
1253
- "learning_rate": 1.986257104696121e-06,
1254
- "loss": 0.933,
1255
  "step": 885
1256
  },
1257
  {
1258
- "epoch": 0.8210332103321033,
1259
- "grad_norm": 1.3414781016896056,
1260
- "learning_rate": 1.8909327851504633e-06,
1261
- "loss": 0.9298,
1262
  "step": 890
1263
  },
1264
  {
1265
- "epoch": 0.8256457564575646,
1266
- "grad_norm": 1.3005106720291775,
1267
- "learning_rate": 1.7977131744646724e-06,
1268
- "loss": 0.949,
1269
  "step": 895
1270
  },
1271
  {
1272
- "epoch": 0.8302583025830258,
1273
- "grad_norm": 1.2541123587239678,
1274
- "learning_rate": 1.7066224677940313e-06,
1275
- "loss": 0.9364,
1276
  "step": 900
1277
  },
1278
  {
1279
- "epoch": 0.834870848708487,
1280
- "grad_norm": 1.3443378836164728,
1281
- "learning_rate": 1.6176843077366755e-06,
1282
- "loss": 0.9341,
1283
  "step": 905
1284
  },
1285
  {
1286
- "epoch": 0.8394833948339483,
1287
- "grad_norm": 1.3177278720111654,
1288
- "learning_rate": 1.5309217781971419e-06,
1289
- "loss": 0.9237,
1290
  "step": 910
1291
  },
1292
  {
1293
- "epoch": 0.8440959409594095,
1294
- "grad_norm": 1.2988630794091358,
1295
- "learning_rate": 1.446357398394934e-06,
1296
- "loss": 0.9375,
1297
  "step": 915
1298
  },
1299
  {
1300
- "epoch": 0.8487084870848709,
1301
- "grad_norm": 1.235110744813047,
1302
- "learning_rate": 1.3640131170196758e-06,
1303
- "loss": 0.9289,
1304
  "step": 920
1305
  },
1306
  {
1307
- "epoch": 0.8533210332103321,
1308
- "grad_norm": 1.2841260381250454,
1309
- "learning_rate": 1.2839103065343084e-06,
1310
- "loss": 0.9376,
1311
  "step": 925
1312
  },
1313
  {
1314
- "epoch": 0.8579335793357934,
1315
- "grad_norm": 1.3338182477923755,
1316
- "learning_rate": 1.2060697576278812e-06,
1317
- "loss": 0.9295,
1318
  "step": 930
1319
  },
1320
  {
1321
- "epoch": 0.8625461254612546,
1322
- "grad_norm": 1.2483887496253951,
1323
- "learning_rate": 1.1305116738193211e-06,
1324
- "loss": 0.9191,
1325
  "step": 935
1326
  },
1327
  {
1328
- "epoch": 0.8671586715867159,
1329
- "grad_norm": 1.3430815382294408,
1330
- "learning_rate": 1.0572556662136036e-06,
1331
- "loss": 0.9152,
1332
  "step": 940
1333
  },
1334
  {
1335
- "epoch": 0.8717712177121771,
1336
- "grad_norm": 1.2731245020834576,
1337
- "learning_rate": 9.863207484116987e-07,
1338
- "loss": 0.9396,
1339
  "step": 945
1340
  },
1341
  {
1342
- "epoch": 0.8763837638376384,
1343
- "grad_norm": 1.2635025334684167,
1344
- "learning_rate": 9.177253315755796e-07,
1345
- "loss": 0.9425,
1346
  "step": 950
1347
  },
1348
  {
1349
- "epoch": 0.8809963099630996,
1350
- "grad_norm": 1.2447698053780658,
1351
- "learning_rate": 8.514872196496182e-07,
1352
- "loss": 0.9144,
1353
  "step": 955
1354
  },
1355
  {
1356
- "epoch": 0.8856088560885609,
1357
- "grad_norm": 1.2339433320268876,
1358
- "learning_rate": 7.876236047395525e-07,
1359
- "loss": 0.9314,
1360
  "step": 960
1361
  },
1362
  {
1363
- "epoch": 0.8902214022140221,
1364
- "grad_norm": 1.2899193810519798,
1365
- "learning_rate": 7.26151062650291e-07,
1366
- "loss": 0.9339,
1367
  "step": 965
1368
  },
1369
  {
1370
- "epoch": 0.8948339483394834,
1371
- "grad_norm": 1.3045072453764868,
1372
- "learning_rate": 6.670855485836525e-07,
1373
- "loss": 0.9362,
1374
  "step": 970
1375
  },
1376
  {
1377
- "epoch": 0.8994464944649446,
1378
- "grad_norm": 1.2294171724462792,
1379
- "learning_rate": 6.104423929971948e-07,
1380
- "loss": 0.9179,
1381
  "step": 975
1382
  },
1383
  {
1384
- "epoch": 0.9040590405904059,
1385
- "grad_norm": 1.2969444296025832,
1386
- "learning_rate": 5.562362976251901e-07,
1387
- "loss": 0.9386,
1388
  "step": 980
1389
  },
1390
  {
1391
- "epoch": 0.9086715867158671,
1392
- "grad_norm": 1.3563048623700384,
1393
- "learning_rate": 5.044813316627994e-07,
1394
- "loss": 0.9293,
1395
  "step": 985
1396
  },
1397
  {
1398
- "epoch": 0.9132841328413284,
1399
- "grad_norm": 1.3602985297748236,
1400
- "learning_rate": 4.5519092811439627e-07,
1401
- "loss": 0.9325,
1402
  "step": 990
1403
  },
1404
  {
1405
- "epoch": 0.9178966789667896,
1406
- "grad_norm": 1.2576795558330498,
1407
- "learning_rate": 4.083778803070504e-07,
1408
- "loss": 0.9384,
1409
  "step": 995
1410
  },
1411
  {
1412
- "epoch": 0.922509225092251,
1413
- "grad_norm": 1.2395476578848061,
1414
- "learning_rate": 3.6405433856999684e-07,
1415
- "loss": 0.9195,
1416
  "step": 1000
1417
  },
1418
  {
1419
- "epoch": 0.9271217712177122,
1420
- "grad_norm": 1.2838613340485654,
1421
- "learning_rate": 3.2223180708102933e-07,
1422
- "loss": 0.9372,
1423
  "step": 1005
1424
  },
1425
  {
1426
- "epoch": 0.9317343173431735,
1427
- "grad_norm": 1.3034379499491984,
1428
- "learning_rate": 2.829211408805932e-07,
1429
- "loss": 0.9383,
1430
  "step": 1010
1431
  },
1432
  {
1433
- "epoch": 0.9363468634686347,
1434
- "grad_norm": 1.277753433076976,
1435
- "learning_rate": 2.461325430543482e-07,
1436
- "loss": 0.9203,
1437
  "step": 1015
1438
  },
1439
  {
1440
- "epoch": 0.940959409594096,
1441
- "grad_norm": 1.2622861558556866,
1442
- "learning_rate": 2.1187556208496885e-07,
1443
- "loss": 0.9231,
1444
  "step": 1020
1445
  },
1446
  {
1447
- "epoch": 0.9455719557195572,
1448
- "grad_norm": 1.26550227170926,
1449
- "learning_rate": 1.8015908937382587e-07,
1450
- "loss": 0.9314,
1451
  "step": 1025
1452
  },
1453
  {
1454
- "epoch": 0.9501845018450185,
1455
- "grad_norm": 1.293318597446816,
1456
- "learning_rate": 1.5099135693322776e-07,
1457
- "loss": 0.9153,
1458
  "step": 1030
1459
  },
1460
  {
1461
- "epoch": 0.9547970479704797,
1462
- "grad_norm": 1.5032902478334003,
1463
- "learning_rate": 1.2437993524979984e-07,
1464
- "loss": 0.9369,
1465
  "step": 1035
1466
  },
1467
  {
1468
- "epoch": 0.959409594095941,
1469
- "grad_norm": 1.2642728343874239,
1470
- "learning_rate": 1.0033173131956175e-07,
1471
- "loss": 0.9155,
1472
  "step": 1040
1473
  },
1474
  {
1475
- "epoch": 0.9640221402214022,
1476
- "grad_norm": 1.263191604346459,
1477
- "learning_rate": 7.885298685522235e-08,
1478
- "loss": 0.9309,
1479
  "step": 1045
1480
  },
1481
  {
1482
- "epoch": 0.9686346863468634,
1483
- "grad_norm": 1.2999256804979131,
1484
- "learning_rate": 5.99492766661347e-08,
1485
- "loss": 0.9399,
1486
  "step": 1050
1487
  },
1488
  {
1489
- "epoch": 0.9732472324723247,
1490
- "grad_norm": 1.25468458833374,
1491
- "learning_rate": 4.362550721136338e-08,
1492
- "loss": 0.9387,
1493
  "step": 1055
1494
  },
1495
  {
1496
- "epoch": 0.977859778597786,
1497
- "grad_norm": 1.2891226224049415,
1498
- "learning_rate": 2.988591532620322e-08,
1499
- "loss": 0.9259,
1500
  "step": 1060
1501
  },
1502
  {
1503
- "epoch": 0.9824723247232472,
1504
- "grad_norm": 1.2367296717631957,
1505
- "learning_rate": 1.8734067122514464e-08,
1506
- "loss": 0.9255,
1507
  "step": 1065
1508
  },
1509
  {
1510
- "epoch": 0.9870848708487084,
1511
- "grad_norm": 1.2584553913497565,
1512
- "learning_rate": 1.0172857063137643e-08,
1513
- "loss": 0.9337,
1514
  "step": 1070
1515
  },
1516
  {
1517
- "epoch": 0.9916974169741697,
1518
- "grad_norm": 1.2509146926461707,
1519
- "learning_rate": 4.204507210633368e-09,
1520
- "loss": 0.9295,
1521
  "step": 1075
1522
  },
1523
  {
1524
- "epoch": 0.996309963099631,
1525
- "grad_norm": 1.2915538030798408,
1526
- "learning_rate": 8.30566650548148e-10,
1527
- "loss": 0.9183,
1528
  "step": 1080
1529
  },
1530
  {
1531
- "epoch": 1.0,
1532
- "eval_loss": 0.9419716000556946,
1533
- "eval_runtime": 1011.6018,
1534
- "eval_samples_per_second": 15.174,
1535
- "eval_steps_per_second": 0.237,
1536
- "step": 1084
1537
  },
1538
  {
1539
- "epoch": 1.0,
1540
- "step": 1084,
1541
- "total_flos": 453935093514240.0,
1542
- "train_loss": 0.9848188322408613,
1543
- "train_runtime": 36728.3484,
1544
- "train_samples_per_second": 3.776,
1545
- "train_steps_per_second": 0.03
1546
  }
1547
  ],
1548
  "logging_steps": 5,
1549
- "max_steps": 1084,
1550
  "num_input_tokens_seen": 0,
1551
- "num_train_epochs": 1,
1552
  "save_steps": 100,
1553
  "stateful_callbacks": {
1554
  "TrainerControl": {
@@ -1562,7 +2445,7 @@
1562
  "attributes": {}
1563
  }
1564
  },
1565
- "total_flos": 453935093514240.0,
1566
  "train_batch_size": 16,
1567
  "trial_name": null,
1568
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1626,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0018450184501845018,
13
+ "grad_norm": 9.194052941983164,
14
+ "learning_rate": 1.226993865030675e-07,
15
  "loss": 1.1392,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.00922509225092251,
20
+ "grad_norm": 8.728469464225432,
21
+ "learning_rate": 6.134969325153375e-07,
22
+ "loss": 1.1321,
23
  "step": 5
24
  },
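The learning_rate values in the new log trace the configured schedule: linear warmup over roughly the first 10% of the 1,626 steps (163 steps, so step 1 logs 2e-5/163 ≈ 1.227e-7), then cosine decay. A minimal sketch, assuming the standard linear-warmup-plus-cosine formula, reproduces the early entries:

```python
import math

# Sketch: reproduce the logged learning rates
# (assumes linear warmup for warmup_ratio * max_steps steps, then cosine decay to 0).
peak_lr, max_steps, warmup_ratio = 2e-5, 1626, 0.1
warmup_steps = math.ceil(warmup_ratio * max_steps)  # 163

def lr_at(step):
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    progress = (step - warmup_steps) / (max_steps - warmup_steps)
    return peak_lr * 0.5 * (1 + math.cos(math.pi * progress))

print(lr_at(1))   # ~1.2270e-07, matching the first logged entry
print(lr_at(5))   # ~6.1350e-07
print(lr_at(50))  # ~6.1350e-06
```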
25
  {
26
+ "epoch": 0.01845018450184502,
27
+ "grad_norm": 5.066035045474869,
28
+ "learning_rate": 1.226993865030675e-06,
29
+ "loss": 1.0802,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.027675276752767528,
34
+ "grad_norm": 6.151048691792626,
35
+ "learning_rate": 1.8404907975460124e-06,
36
+ "loss": 1.0186,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.03690036900369004,
41
+ "grad_norm": 2.030218046940431,
42
+ "learning_rate": 2.45398773006135e-06,
43
+ "loss": 1.0181,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.046125461254612546,
48
+ "grad_norm": 1.7169054577646434,
49
+ "learning_rate": 3.0674846625766875e-06,
50
+ "loss": 0.9867,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.055350553505535055,
55
+ "grad_norm": 1.414702551784086,
56
+ "learning_rate": 3.680981595092025e-06,
57
+ "loss": 0.9848,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.06457564575645756,
62
+ "grad_norm": 1.471062511929668,
63
+ "learning_rate": 4.294478527607362e-06,
64
+ "loss": 0.975,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.07380073800738007,
69
+ "grad_norm": 1.9876641303020315,
70
+ "learning_rate": 4.9079754601227e-06,
71
+ "loss": 0.9616,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.08302583025830258,
76
+ "grad_norm": 1.7086605102377759,
77
+ "learning_rate": 5.521472392638038e-06,
78
+ "loss": 0.9716,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.09225092250922509,
83
+ "grad_norm": 2.202769359683669,
84
+ "learning_rate": 6.134969325153375e-06,
85
+ "loss": 0.9766,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.1014760147601476,
90
+ "grad_norm": 1.6222357117334487,
91
+ "learning_rate": 6.748466257668712e-06,
92
+ "loss": 0.9929,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.11070110701107011,
97
+ "grad_norm": 2.161648398755977,
98
+ "learning_rate": 7.36196319018405e-06,
99
+ "loss": 0.9774,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.11992619926199262,
104
+ "grad_norm": 1.7198404521131392,
105
+ "learning_rate": 7.975460122699386e-06,
106
+ "loss": 0.9743,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.12915129151291513,
111
+ "grad_norm": 2.5936580446065594,
112
+ "learning_rate": 8.588957055214725e-06,
113
+ "loss": 0.9878,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.13837638376383765,
118
+ "grad_norm": 2.188257188915145,
119
+ "learning_rate": 9.202453987730062e-06,
120
+ "loss": 0.9568,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.14760147601476015,
125
+ "grad_norm": 1.7531151641523148,
126
+ "learning_rate": 9.8159509202454e-06,
127
+ "loss": 0.9789,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.15682656826568267,
132
+ "grad_norm": 1.8091240872427208,
133
+ "learning_rate": 1.0429447852760737e-05,
134
+ "loss": 0.9678,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.16605166051660517,
139
+ "grad_norm": 2.098514635540621,
140
+ "learning_rate": 1.1042944785276076e-05,
141
+ "loss": 0.9617,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.1752767527675277,
146
+ "grad_norm": 2.4275494428488607,
147
+ "learning_rate": 1.1656441717791411e-05,
148
+ "loss": 0.9676,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.18450184501845018,
153
+ "grad_norm": 2.0637923302738095,
154
+ "learning_rate": 1.226993865030675e-05,
155
+ "loss": 0.9681,
156
+ "step": 100
157
+ },
158
+ {
159
+ "epoch": 0.18450184501845018,
160
+ "eval_loss": 0.9788174629211426,
161
+ "eval_runtime": 515.1712,
162
+ "eval_samples_per_second": 29.796,
163
+ "eval_steps_per_second": 0.116,
164
  "step": 100
165
  },
166
  {
167
+ "epoch": 0.1937269372693727,
168
+ "grad_norm": 2.069416549180579,
169
+ "learning_rate": 1.2883435582822085e-05,
170
+ "loss": 0.9528,
171
  "step": 105
172
  },
173
  {
174
+ "epoch": 0.2029520295202952,
175
+ "grad_norm": 2.2916715973700024,
176
+ "learning_rate": 1.3496932515337424e-05,
177
+ "loss": 0.9696,
178
  "step": 110
179
  },
180
  {
181
+ "epoch": 0.21217712177121772,
182
+ "grad_norm": 2.062468142825091,
183
+ "learning_rate": 1.4110429447852763e-05,
184
+ "loss": 0.9747,
185
  "step": 115
186
  },
187
  {
188
+ "epoch": 0.22140221402214022,
189
+ "grad_norm": 1.7271367882138293,
190
+ "learning_rate": 1.47239263803681e-05,
191
+ "loss": 0.9786,
192
  "step": 120
193
  },
194
  {
195
+ "epoch": 0.23062730627306274,
196
+ "grad_norm": 1.9545058702706481,
197
+ "learning_rate": 1.5337423312883436e-05,
198
+ "loss": 0.9758,
199
  "step": 125
200
  },
201
  {
202
+ "epoch": 0.23985239852398524,
203
+ "grad_norm": 1.9400595646067775,
204
+ "learning_rate": 1.5950920245398772e-05,
205
+ "loss": 0.9829,
206
  "step": 130
207
  },
208
  {
209
+ "epoch": 0.24907749077490776,
210
+ "grad_norm": 1.865861850010034,
211
+ "learning_rate": 1.656441717791411e-05,
212
+ "loss": 0.9915,
213
  "step": 135
214
  },
215
  {
216
+ "epoch": 0.25830258302583026,
217
+ "grad_norm": 1.9529698824708406,
218
+ "learning_rate": 1.717791411042945e-05,
219
+ "loss": 0.9831,
220
  "step": 140
221
  },
222
  {
223
+ "epoch": 0.26752767527675275,
224
+ "grad_norm": 1.8749039852563243,
225
+ "learning_rate": 1.7791411042944788e-05,
226
+ "loss": 0.9842,
227
  "step": 145
228
  },
229
  {
230
+ "epoch": 0.2767527675276753,
231
+ "grad_norm": 1.4867806820095497,
232
+ "learning_rate": 1.8404907975460123e-05,
233
+ "loss": 0.9859,
234
  "step": 150
235
  },
236
  {
237
+ "epoch": 0.2859778597785978,
238
+ "grad_norm": 2.1169911338934644,
239
+ "learning_rate": 1.9018404907975462e-05,
240
+ "loss": 0.9771,
241
  "step": 155
242
  },
243
  {
244
+ "epoch": 0.2952029520295203,
245
+ "grad_norm": 1.5398155481235816,
246
+ "learning_rate": 1.96319018404908e-05,
247
+ "loss": 0.9817,
248
  "step": 160
249
  },
250
  {
251
+ "epoch": 0.3044280442804428,
252
+ "grad_norm": 1.7130250807487832,
253
+ "learning_rate": 1.9999907776750355e-05,
254
+ "loss": 0.9997,
255
  "step": 165
256
  },
257
  {
258
+ "epoch": 0.31365313653136534,
259
+ "grad_norm": 2.1366171045520383,
260
+ "learning_rate": 1.9998870284726968e-05,
261
+ "loss": 1.0004,
262
  "step": 170
263
  },
264
  {
265
+ "epoch": 0.32287822878228783,
266
+ "grad_norm": 2.01400597362679,
267
+ "learning_rate": 1.9996680141616956e-05,
268
+ "loss": 0.9937,
269
  "step": 175
270
  },
271
  {
272
+ "epoch": 0.33210332103321033,
273
+ "grad_norm": 1.925569839756876,
274
+ "learning_rate": 1.9993337599895925e-05,
275
+ "loss": 0.9939,
276
  "step": 180
277
  },
278
  {
279
+ "epoch": 0.3413284132841328,
280
+ "grad_norm": 1.4590251035585917,
281
+ "learning_rate": 1.998884304488584e-05,
282
+ "loss": 0.9982,
283
  "step": 185
284
  },
285
  {
286
+ "epoch": 0.3505535055350554,
287
+ "grad_norm": 1.531094729781709,
288
+ "learning_rate": 1.998319699471061e-05,
289
+ "loss": 0.9925,
290
  "step": 190
291
  },
292
  {
293
+ "epoch": 0.35977859778597787,
294
+ "grad_norm": 1.9624667257758441,
295
+ "learning_rate": 1.997640010023634e-05,
296
+ "loss": 0.9765,
297
  "step": 195
298
  },
299
  {
300
+ "epoch": 0.36900369003690037,
301
+ "grad_norm": 1.6943282928766075,
302
+ "learning_rate": 1.9968453144996345e-05,
303
+ "loss": 0.9962,
304
+ "step": 200
305
+ },
306
+ {
307
+ "epoch": 0.36900369003690037,
308
+ "eval_loss": 1.0030262470245361,
309
+ "eval_runtime": 518.1283,
310
+ "eval_samples_per_second": 29.626,
311
+ "eval_steps_per_second": 0.116,
312
  "step": 200
313
  },
314
  {
315
+ "epoch": 0.37822878228782286,
316
+ "grad_norm": 3.0469311990676857,
317
+ "learning_rate": 1.9959357045100764e-05,
318
+ "loss": 0.9947,
319
  "step": 205
320
  },
321
  {
322
+ "epoch": 0.3874538745387454,
323
+ "grad_norm": 2.2551326104892864,
324
+ "learning_rate": 1.9949112849131005e-05,
325
+ "loss": 1.0023,
326
  "step": 210
327
  },
328
  {
329
+ "epoch": 0.3966789667896679,
330
+ "grad_norm": 1.8771683509279502,
331
+ "learning_rate": 1.993772173801884e-05,
332
+ "loss": 0.9934,
333
  "step": 215
334
  },
335
  {
336
+ "epoch": 0.4059040590405904,
337
+ "grad_norm": 1.8016877222967922,
338
+ "learning_rate": 1.992518502491028e-05,
339
+ "loss": 0.9807,
340
  "step": 220
341
  },
342
  {
343
+ "epoch": 0.4151291512915129,
344
+ "grad_norm": 1.4456497466009737,
345
+ "learning_rate": 1.9911504155014187e-05,
346
+ "loss": 0.9926,
347
  "step": 225
348
  },
349
  {
350
+ "epoch": 0.42435424354243545,
351
+ "grad_norm": 1.5156073716841811,
352
+ "learning_rate": 1.989668070543569e-05,
353
+ "loss": 0.9766,
354
  "step": 230
355
  },
356
  {
357
+ "epoch": 0.43357933579335795,
358
+ "grad_norm": 1.3959824735787207,
359
+ "learning_rate": 1.9880716384994355e-05,
360
+ "loss": 0.9964,
361
  "step": 235
362
  },
363
  {
364
+ "epoch": 0.44280442804428044,
365
+ "grad_norm": 1.4724192694561282,
366
+ "learning_rate": 1.9863613034027224e-05,
367
+ "loss": 0.9942,
368
  "step": 240
369
  },
370
  {
371
+ "epoch": 0.45202952029520294,
372
+ "grad_norm": 2.064409139190994,
373
+ "learning_rate": 1.9845372624176646e-05,
374
+ "loss": 1.0103,
375
  "step": 245
376
  },
377
  {
378
+ "epoch": 0.4612546125461255,
379
+ "grad_norm": 2.190902421105104,
380
+ "learning_rate": 1.982599725816299e-05,
381
+ "loss": 1.0075,
382
  "step": 250
383
  },
384
  {
385
+ "epoch": 0.470479704797048,
386
+ "grad_norm": 1.9443583169417478,
387
+ "learning_rate": 1.9805489169542245e-05,
388
+ "loss": 0.9971,
389
  "step": 255
390
  },
391
  {
392
+ "epoch": 0.4797047970479705,
393
+ "grad_norm": 1.553791831408308,
394
+ "learning_rate": 1.978385072244857e-05,
395
+ "loss": 0.9992,
396
  "step": 260
397
  },
398
  {
399
+ "epoch": 0.488929889298893,
400
+ "grad_norm": 1.4174068635451451,
401
+ "learning_rate": 1.9761084411321706e-05,
402
+ "loss": 0.9793,
403
  "step": 265
404
  },
405
  {
406
+ "epoch": 0.4981549815498155,
407
+ "grad_norm": 1.4969414214930414,
408
+ "learning_rate": 1.9737192860619477e-05,
409
+ "loss": 0.9791,
410
  "step": 270
411
  },
412
  {
413
+ "epoch": 0.507380073800738,
414
+ "grad_norm": 1.4025421975340602,
415
+ "learning_rate": 1.971217882451521e-05,
416
+ "loss": 0.9796,
417
  "step": 275
418
  },
419
  {
420
+ "epoch": 0.5166051660516605,
421
+ "grad_norm": 1.4448369862138994,
422
+ "learning_rate": 1.9686045186580258e-05,
423
+ "loss": 0.9884,
424
  "step": 280
425
  },
426
  {
427
+ "epoch": 0.525830258302583,
428
+ "grad_norm": 2.0639483249182464,
429
+ "learning_rate": 1.9658794959451583e-05,
430
+ "loss": 0.9831,
431
  "step": 285
432
  },
433
  {
434
+ "epoch": 0.5350553505535055,
435
+ "grad_norm": 1.6048970102781592,
436
+ "learning_rate": 1.9630431284484447e-05,
437
+ "loss": 0.9849,
438
  "step": 290
439
  },
440
  {
441
+ "epoch": 0.544280442804428,
442
+ "grad_norm": 1.4540480684938577,
443
+ "learning_rate": 1.960095743139033e-05,
444
+ "loss": 0.9796,
445
  "step": 295
446
  },
447
  {
448
+ "epoch": 0.5535055350553506,
449
+ "grad_norm": 1.424947900669971,
450
+ "learning_rate": 1.957037679785994e-05,
451
+ "loss": 0.9917,
452
+ "step": 300
453
+ },
454
+ {
+ "epoch": 0.5535055350553506,
+ "eval_loss": 1.0008341073989868,
+ "eval_runtime": 513.1068,
+ "eval_samples_per_second": 29.916,
+ "eval_steps_per_second": 0.117,
  "step": 300
  },
462
  {
463
+ "epoch": 0.5627306273062731,
464
+ "grad_norm": 1.2480517696242786,
465
+ "learning_rate": 1.953869290917158e-05,
466
+ "loss": 0.9943,
467
  "step": 305
468
  },
469
  {
470
+ "epoch": 0.5719557195571956,
471
+ "grad_norm": 1.191133450390735,
472
+ "learning_rate": 1.9505909417784758e-05,
473
+ "loss": 0.9899,
474
  "step": 310
475
  },
476
  {
477
+ "epoch": 0.5811808118081181,
478
+ "grad_norm": 1.1766418475997753,
479
+ "learning_rate": 1.9472030102919102e-05,
480
+ "loss": 0.9883,
481
  "step": 315
482
  },
483
  {
484
+ "epoch": 0.5904059040590406,
485
+ "grad_norm": 1.2121897211885717,
486
+ "learning_rate": 1.9437058870118745e-05,
487
+ "loss": 1.0037,
488
  "step": 320
489
  },
490
  {
491
+ "epoch": 0.5996309963099631,
492
+ "grad_norm": 1.2903187102851559,
493
+ "learning_rate": 1.940099975080207e-05,
494
+ "loss": 0.9892,
495
  "step": 325
496
  },
497
  {
498
+ "epoch": 0.6088560885608856,
499
+ "grad_norm": 1.4260318993897811,
500
+ "learning_rate": 1.9363856901796984e-05,
501
+ "loss": 0.9896,
502
  "step": 330
503
  },
504
  {
505
+ "epoch": 0.6180811808118081,
506
+ "grad_norm": 1.324489901337969,
507
+ "learning_rate": 1.9325634604861728e-05,
508
+ "loss": 0.9978,
509
  "step": 335
510
  },
511
  {
512
+ "epoch": 0.6273062730627307,
513
+ "grad_norm": 1.275426852454915,
514
+ "learning_rate": 1.9286337266191295e-05,
515
+ "loss": 0.993,
516
  "step": 340
517
  },
518
  {
519
+ "epoch": 0.6365313653136532,
520
+ "grad_norm": 1.329213272796139,
521
+ "learning_rate": 1.9245969415909464e-05,
522
+ "loss": 0.9879,
523
  "step": 345
524
  },
525
  {
526
+ "epoch": 0.6457564575645757,
527
+ "grad_norm": 1.4085398606096227,
528
+ "learning_rate": 1.9204535707546602e-05,
529
+ "loss": 0.9869,
530
  "step": 350
531
  },
532
  {
533
+ "epoch": 0.6549815498154982,
534
+ "grad_norm": 1.1848936755869721,
535
+ "learning_rate": 1.916204091750321e-05,
536
+ "loss": 0.9726,
537
  "step": 355
538
  },
539
  {
540
+ "epoch": 0.6642066420664207,
541
+ "grad_norm": 1.2968309154541056,
542
+ "learning_rate": 1.9118489944499287e-05,
543
+ "loss": 0.9902,
544
  "step": 360
545
  },
546
  {
547
+ "epoch": 0.6734317343173432,
548
+ "grad_norm": 1.2286246913756114,
549
+ "learning_rate": 1.907388780900964e-05,
550
+ "loss": 0.9811,
551
  "step": 365
552
  },
553
  {
554
+ "epoch": 0.6826568265682657,
555
+ "grad_norm": 1.2591567733071325,
556
+ "learning_rate": 1.902823965268513e-05,
557
+ "loss": 0.9858,
558
  "step": 370
559
  },
560
  {
561
+ "epoch": 0.6918819188191881,
562
+ "grad_norm": 1.4378514619406175,
563
+ "learning_rate": 1.8981550737759932e-05,
564
+ "loss": 0.9828,
565
  "step": 375
566
  },
567
  {
568
+ "epoch": 0.7011070110701108,
569
+ "grad_norm": 1.497308547977116,
570
+ "learning_rate": 1.8933826446444933e-05,
571
+ "loss": 0.9892,
572
  "step": 380
573
  },
574
  {
575
+ "epoch": 0.7103321033210332,
576
+ "grad_norm": 1.1745393096620436,
577
+ "learning_rate": 1.888507228030729e-05,
578
+ "loss": 0.9859,
579
  "step": 385
580
  },
581
  {
582
+ "epoch": 0.7195571955719557,
583
+ "grad_norm": 1.2233160586824499,
584
+ "learning_rate": 1.8835293859636177e-05,
585
+ "loss": 0.9763,
586
  "step": 390
587
  },
588
  {
589
+ "epoch": 0.7287822878228782,
590
+ "grad_norm": 1.3127902536541989,
591
+ "learning_rate": 1.8784496922794947e-05,
592
+ "loss": 0.981,
593
  "step": 395
594
  },
595
  {
596
+ "epoch": 0.7380073800738007,
597
+ "grad_norm": 1.3089866347753676,
598
+ "learning_rate": 1.873268732555957e-05,
599
+ "loss": 0.9652,
600
+ "step": 400
601
+ },
602
+ {
+ "epoch": 0.7380073800738007,
+ "eval_loss": 0.993894636631012,
+ "eval_runtime": 513.7147,
+ "eval_samples_per_second": 29.88,
+ "eval_steps_per_second": 0.117,
  "step": 400
  },
610
  {
611
+ "epoch": 0.7472324723247232,
612
+ "grad_norm": 1.3088189271034285,
613
+ "learning_rate": 1.8679871040443632e-05,
614
+ "loss": 1.0048,
615
  "step": 405
616
  },
617
  {
618
+ "epoch": 0.7564575645756457,
619
+ "grad_norm": 1.2954577066196238,
620
+ "learning_rate": 1.8626054156009807e-05,
621
+ "loss": 0.9927,
622
  "step": 410
623
  },
624
  {
625
+ "epoch": 0.7656826568265682,
626
+ "grad_norm": 1.317981053662398,
627
+ "learning_rate": 1.8571242876167995e-05,
628
+ "loss": 0.9752,
629
  "step": 415
630
  },
631
  {
632
+ "epoch": 0.7749077490774908,
633
+ "grad_norm": 1.4156756831610378,
634
+ "learning_rate": 1.851544351946014e-05,
635
+ "loss": 0.9945,
636
  "step": 420
637
  },
638
  {
639
+ "epoch": 0.7841328413284133,
640
+ "grad_norm": 1.1285773664771428,
641
+ "learning_rate": 1.845866251833183e-05,
642
+ "loss": 0.9708,
643
  "step": 425
644
  },
645
  {
646
+ "epoch": 0.7933579335793358,
647
+ "grad_norm": 1.2640468813011223,
648
+ "learning_rate": 1.8400906418390808e-05,
649
+ "loss": 0.9757,
650
  "step": 430
651
  },
652
  {
653
+ "epoch": 0.8025830258302583,
654
+ "grad_norm": 1.288546177133416,
655
+ "learning_rate": 1.834218187765237e-05,
656
+ "loss": 0.976,
657
  "step": 435
658
  },
659
  {
660
+ "epoch": 0.8118081180811808,
661
+ "grad_norm": 1.3086160465192265,
662
+ "learning_rate": 1.8282495665771864e-05,
663
+ "loss": 0.9761,
664
  "step": 440
665
  },
666
  {
667
+ "epoch": 0.8210332103321033,
668
+ "grad_norm": 1.1919282548241303,
669
+ "learning_rate": 1.8221854663264294e-05,
670
+ "loss": 0.9718,
671
  "step": 445
672
  },
673
  {
674
+ "epoch": 0.8302583025830258,
675
+ "grad_norm": 1.2454331164701038,
676
+ "learning_rate": 1.8160265860711134e-05,
677
+ "loss": 0.9842,
678
  "step": 450
679
  },
680
  {
681
+ "epoch": 0.8394833948339483,
682
+ "grad_norm": 1.183454477783249,
683
+ "learning_rate": 1.8097736357954487e-05,
684
+ "loss": 0.9705,
685
  "step": 455
686
  },
687
  {
688
+ "epoch": 0.8487084870848709,
689
+ "grad_norm": 1.1394535207411802,
690
+ "learning_rate": 1.8034273363278615e-05,
691
+ "loss": 0.9751,
692
  "step": 460
693
  },
694
  {
695
+ "epoch": 0.8579335793357934,
696
+ "grad_norm": 1.1866949984179949,
697
+ "learning_rate": 1.7969884192578977e-05,
698
+ "loss": 0.9749,
699
  "step": 465
700
  },
701
  {
702
+ "epoch": 0.8671586715867159,
703
+ "grad_norm": 1.299660479182102,
704
+ "learning_rate": 1.7904576268518886e-05,
705
+ "loss": 0.9598,
706
  "step": 470
707
  },
708
  {
709
+ "epoch": 0.8763837638376384,
710
+ "grad_norm": 1.2221383874437446,
711
+ "learning_rate": 1.783835711967382e-05,
712
+ "loss": 0.9842,
713
  "step": 475
714
  },
715
  {
716
+ "epoch": 0.8856088560885609,
717
+ "grad_norm": 1.2535423952991984,
718
+ "learning_rate": 1.7771234379663545e-05,
719
+ "loss": 0.9641,
720
  "step": 480
721
  },
722
  {
723
+ "epoch": 0.8948339483394834,
724
+ "grad_norm": 1.4654400132426395,
725
+ "learning_rate": 1.770321578627213e-05,
726
+ "loss": 0.9784,
727
  "step": 485
728
  },
729
  {
730
+ "epoch": 0.9040590405904059,
731
+ "grad_norm": 1.3747052246285973,
732
+ "learning_rate": 1.763430918055595e-05,
733
+ "loss": 0.9694,
734
  "step": 490
735
  },
736
  {
737
+ "epoch": 0.9132841328413284,
738
+ "grad_norm": 1.1551950486505687,
739
+ "learning_rate": 1.756452250593979e-05,
740
+ "loss": 0.9727,
741
  "step": 495
742
  },
743
  {
744
+ "epoch": 0.922509225092251,
745
+ "grad_norm": 1.128236535385729,
746
+ "learning_rate": 1.7493863807301116e-05,
747
+ "loss": 0.9666,
748
  "step": 500
749
  },
750
  {
+ "epoch": 0.922509225092251,
+ "eval_loss": 0.9816026091575623,
+ "eval_runtime": 517.2137,
+ "eval_samples_per_second": 29.678,
+ "eval_steps_per_second": 0.116,
+ "step": 500
+ },
758
+ {
759
+ "epoch": 0.9317343173431735,
760
+ "grad_norm": 1.230218009681161,
761
+ "learning_rate": 1.74223412300427e-05,
762
+ "loss": 0.9769,
763
  "step": 505
764
  },
765
  {
766
+ "epoch": 0.940959409594096,
767
+ "grad_norm": 1.1847589898088133,
768
+ "learning_rate": 1.7349963019153638e-05,
769
+ "loss": 0.9628,
770
  "step": 510
771
  },
772
  {
773
+ "epoch": 0.9501845018450185,
774
+ "grad_norm": 1.2246308831747907,
775
+ "learning_rate": 1.7276737518258865e-05,
776
+ "loss": 0.9602,
777
  "step": 515
778
  },
779
  {
780
+ "epoch": 0.959409594095941,
781
+ "grad_norm": 1.1390750572317663,
782
+ "learning_rate": 1.7202673168657318e-05,
783
+ "loss": 0.9627,
784
  "step": 520
785
  },
786
  {
787
+ "epoch": 0.9686346863468634,
788
+ "grad_norm": 1.1728205351456946,
789
+ "learning_rate": 1.7127778508348858e-05,
790
+ "loss": 0.9714,
791
  "step": 525
792
  },
793
  {
794
+ "epoch": 0.977859778597786,
795
+ "grad_norm": 1.2796699310011739,
796
+ "learning_rate": 1.7052062171050008e-05,
797
+ "loss": 0.967,
798
  "step": 530
799
  },
800
  {
801
+ "epoch": 0.9870848708487084,
802
+ "grad_norm": 1.1205342517216532,
803
+ "learning_rate": 1.6975532885198678e-05,
804
+ "loss": 0.9663,
805
  "step": 535
806
  },
807
  {
808
+ "epoch": 0.996309963099631,
809
+ "grad_norm": 1.185279277131673,
810
+ "learning_rate": 1.6898199472947972e-05,
811
+ "loss": 0.9581,
812
  "step": 540
813
  },
814
  {
815
+ "epoch": 1.0055350553505535,
816
+ "grad_norm": 3.007398366081561,
817
+ "learning_rate": 1.6820070849149174e-05,
818
+ "loss": 0.8519,
819
  "step": 545
820
  },
821
  {
822
+ "epoch": 1.014760147601476,
823
+ "grad_norm": 2.1038299784593337,
824
+ "learning_rate": 1.6741156020324086e-05,
825
+ "loss": 0.7509,
826
  "step": 550
827
  },
828
  {
829
+ "epoch": 1.0239852398523985,
830
+ "grad_norm": 1.5701183943228265,
831
+ "learning_rate": 1.6661464083626734e-05,
832
+ "loss": 0.7453,
833
  "step": 555
834
  },
835
  {
836
+ "epoch": 1.033210332103321,
837
+ "grad_norm": 1.2911074026361753,
838
+ "learning_rate": 1.6581004225794715e-05,
839
+ "loss": 0.7391,
840
  "step": 560
841
  },
842
  {
843
+ "epoch": 1.0424354243542435,
844
+ "grad_norm": 1.5938907876285198,
845
+ "learning_rate": 1.649978572209012e-05,
846
+ "loss": 0.7347,
847
  "step": 565
848
  },
849
  {
850
+ "epoch": 1.051660516605166,
851
+ "grad_norm": 1.3495506131008623,
852
+ "learning_rate": 1.6417817935230318e-05,
853
+ "loss": 0.7396,
854
  "step": 570
855
  },
856
  {
857
+ "epoch": 1.0608856088560885,
858
+ "grad_norm": 1.2781771587882627,
859
+ "learning_rate": 1.6335110314308654e-05,
860
+ "loss": 0.7305,
861
  "step": 575
862
  },
863
  {
864
+ "epoch": 1.070110701107011,
865
+ "grad_norm": 1.5798733908227265,
866
+ "learning_rate": 1.6251672393705155e-05,
867
+ "loss": 0.7365,
868
  "step": 580
869
  },
870
  {
871
+ "epoch": 1.0793357933579335,
872
+ "grad_norm": 1.416304183876239,
873
+ "learning_rate": 1.6167513791987423e-05,
874
+ "loss": 0.7373,
875
  "step": 585
876
  },
877
  {
878
+ "epoch": 1.088560885608856,
879
+ "grad_norm": 1.3677150489575043,
880
+ "learning_rate": 1.6082644210801846e-05,
881
+ "loss": 0.7299,
882
  "step": 590
883
  },
884
  {
885
+ "epoch": 1.0977859778597785,
886
+ "grad_norm": 1.3506677105351055,
887
+ "learning_rate": 1.5997073433755187e-05,
888
+ "loss": 0.7426,
889
  "step": 595
890
  },
891
  {
892
+ "epoch": 1.1070110701107012,
893
+ "grad_norm": 1.461155474048458,
894
+ "learning_rate": 1.5910811325286768e-05,
895
+ "loss": 0.7366,
896
  "step": 600
897
  },
898
  {
+ "epoch": 1.1070110701107012,
+ "eval_loss": 0.9852360486984253,
+ "eval_runtime": 516.2338,
+ "eval_samples_per_second": 29.735,
+ "eval_steps_per_second": 0.116,
+ "step": 600
+ },
906
+ {
907
+ "epoch": 1.1162361623616237,
908
+ "grad_norm": 1.2999195127889172,
909
+ "learning_rate": 1.582386782953129e-05,
910
+ "loss": 0.7351,
911
  "step": 605
912
  },
913
  {
914
+ "epoch": 1.1254612546125462,
915
+ "grad_norm": 1.5599221554130673,
916
+ "learning_rate": 1.5736252969172522e-05,
917
+ "loss": 0.7335,
918
  "step": 610
919
  },
920
  {
921
+ "epoch": 1.1346863468634687,
922
+ "grad_norm": 1.30824219510555,
923
+ "learning_rate": 1.5647976844287884e-05,
924
+ "loss": 0.7321,
925
  "step": 615
926
  },
927
  {
928
+ "epoch": 1.1439114391143912,
929
+ "grad_norm": 1.3590431139669035,
930
+ "learning_rate": 1.5559049631184136e-05,
931
+ "loss": 0.7294,
932
  "step": 620
933
  },
934
  {
935
+ "epoch": 1.1531365313653137,
936
+ "grad_norm": 1.5685872513743657,
937
+ "learning_rate": 1.5469481581224274e-05,
938
+ "loss": 0.7372,
939
  "step": 625
940
  },
941
  {
942
+ "epoch": 1.1623616236162362,
943
+ "grad_norm": 1.4194329169102744,
944
+ "learning_rate": 1.5379283019645757e-05,
945
+ "loss": 0.7423,
946
  "step": 630
947
  },
948
  {
949
+ "epoch": 1.1715867158671587,
950
+ "grad_norm": 1.8516238628155155,
951
+ "learning_rate": 1.5288464344370267e-05,
952
+ "loss": 0.7389,
953
  "step": 635
954
  },
955
  {
956
+ "epoch": 1.1808118081180812,
957
+ "grad_norm": 1.3787465939384576,
958
+ "learning_rate": 1.5197036024805018e-05,
959
+ "loss": 0.7277,
960
  "step": 640
961
  },
962
  {
963
+ "epoch": 1.1900369003690037,
964
+ "grad_norm": 1.2679935699299498,
965
+ "learning_rate": 1.5105008600635888e-05,
966
+ "loss": 0.7251,
967
  "step": 645
968
  },
969
  {
970
+ "epoch": 1.1992619926199262,
971
+ "grad_norm": 1.3661565990701046,
972
+ "learning_rate": 1.5012392680612408e-05,
973
+ "loss": 0.7348,
974
  "step": 650
975
  },
976
  {
977
+ "epoch": 1.2084870848708487,
978
+ "grad_norm": 1.380476117633752,
979
+ "learning_rate": 1.4919198941324813e-05,
980
+ "loss": 0.733,
981
  "step": 655
982
  },
983
  {
984
+ "epoch": 1.2177121771217712,
985
+ "grad_norm": 1.301175007422796,
986
+ "learning_rate": 1.4825438125973263e-05,
987
+ "loss": 0.7331,
988
  "step": 660
989
  },
990
  {
991
+ "epoch": 1.2269372693726937,
992
+ "grad_norm": 1.3531205842843421,
993
+ "learning_rate": 1.4731121043129392e-05,
994
+ "loss": 0.7379,
995
  "step": 665
996
  },
997
  {
998
+ "epoch": 1.2361623616236161,
999
+ "grad_norm": 1.444864127952419,
1000
+ "learning_rate": 1.4636258565490304e-05,
1001
+ "loss": 0.739,
1002
  "step": 670
1003
  },
1004
  {
1005
+ "epoch": 1.2453874538745389,
1006
+ "grad_norm": 1.2863648775710423,
1007
+ "learning_rate": 1.4540861628625207e-05,
1008
+ "loss": 0.7368,
1009
  "step": 675
1010
  },
1011
  {
1012
+ "epoch": 1.2546125461254611,
1013
+ "grad_norm": 1.2200332099647682,
1014
+ "learning_rate": 1.444494122971476e-05,
1015
+ "loss": 0.7343,
1016
  "step": 680
1017
  },
1018
  {
1019
+ "epoch": 1.2638376383763839,
1020
+ "grad_norm": 1.3714375121406106,
1021
+ "learning_rate": 1.4348508426283342e-05,
1022
+ "loss": 0.7391,
1023
  "step": 685
1024
  },
1025
  {
1026
+ "epoch": 1.2730627306273063,
1027
+ "grad_norm": 1.2638691361743832,
1028
+ "learning_rate": 1.4251574334924395e-05,
1029
+ "loss": 0.7397,
1030
  "step": 690
1031
  },
1032
  {
1033
+ "epoch": 1.2822878228782288,
1034
+ "grad_norm": 1.4011111864399106,
1035
+ "learning_rate": 1.4154150130018867e-05,
1036
+ "loss": 0.7374,
1037
  "step": 695
1038
  },
1039
  {
1040
+ "epoch": 1.2915129151291513,
1041
+ "grad_norm": 1.2912923761278596,
1042
+ "learning_rate": 1.4056247042447096e-05,
1043
+ "loss": 0.7228,
1044
+ "step": 700
1045
+ },
1046
+ {
+ "epoch": 1.2915129151291513,
+ "eval_loss": 0.9835454225540161,
+ "eval_runtime": 517.9285,
+ "eval_samples_per_second": 29.637,
+ "eval_steps_per_second": 0.116,
  "step": 700
  },
1054
  {
1055
+ "epoch": 1.3007380073800738,
1056
+ "grad_norm": 1.5854901671726367,
1057
+ "learning_rate": 1.3957876358294115e-05,
1058
+ "loss": 0.7296,
1059
  "step": 705
1060
  },
1061
  {
1062
+ "epoch": 1.3099630996309963,
1063
+ "grad_norm": 1.38846996136312,
1064
+ "learning_rate": 1.385904941754862e-05,
1065
+ "loss": 0.7257,
1066
  "step": 710
1067
  },
1068
  {
1069
+ "epoch": 1.3191881918819188,
1070
+ "grad_norm": 1.5297133474564781,
1071
+ "learning_rate": 1.375977761279571e-05,
1072
+ "loss": 0.7352,
1073
  "step": 715
1074
  },
1075
  {
1076
+ "epoch": 1.3284132841328413,
1077
+ "grad_norm": 1.287259224142701,
1078
+ "learning_rate": 1.366007238790358e-05,
1079
+ "loss": 0.7301,
1080
  "step": 720
1081
  },
1082
  {
1083
+ "epoch": 1.3376383763837638,
1084
+ "grad_norm": 1.2884194224179173,
1085
+ "learning_rate": 1.3559945236704286e-05,
1086
+ "loss": 0.7383,
1087
  "step": 725
1088
  },
1089
  {
1090
+ "epoch": 1.3468634686346863,
1091
+ "grad_norm": 1.3779553004575515,
1092
+ "learning_rate": 1.3459407701668762e-05,
1093
+ "loss": 0.7313,
1094
  "step": 730
1095
  },
1096
  {
1097
+ "epoch": 1.3560885608856088,
1098
+ "grad_norm": 1.5349656095564503,
1099
+ "learning_rate": 1.3358471372576229e-05,
1100
+ "loss": 0.7334,
1101
  "step": 735
1102
  },
1103
  {
1104
+ "epoch": 1.3653136531365313,
1105
+ "grad_norm": 1.3570612666553503,
1106
+ "learning_rate": 1.3257147885178125e-05,
1107
+ "loss": 0.7253,
1108
  "step": 740
1109
  },
1110
  {
1111
+ "epoch": 1.3745387453874538,
1112
+ "grad_norm": 1.3514442377769267,
1113
+ "learning_rate": 1.3155448919856792e-05,
1114
+ "loss": 0.7375,
1115
  "step": 745
1116
  },
1117
  {
1118
+ "epoch": 1.3837638376383765,
1119
+ "grad_norm": 1.338752928401098,
1120
+ "learning_rate": 1.3053386200278963e-05,
1121
+ "loss": 0.7349,
1122
  "step": 750
1123
  },
1124
  {
1125
+ "epoch": 1.3929889298892988,
1126
+ "grad_norm": 1.3943704063449442,
1127
+ "learning_rate": 1.2950971492044272e-05,
1128
+ "loss": 0.7338,
1129
  "step": 755
1130
  },
1131
  {
1132
+ "epoch": 1.4022140221402215,
1133
+ "grad_norm": 1.3567491078204894,
1134
+ "learning_rate": 1.2848216601328958e-05,
1135
+ "loss": 0.7385,
1136
  "step": 760
1137
  },
1138
  {
1139
+ "epoch": 1.4114391143911438,
1140
+ "grad_norm": 1.2556919848553412,
1141
+ "learning_rate": 1.2745133373524855e-05,
1142
+ "loss": 0.7457,
1143
  "step": 765
1144
  },
1145
  {
1146
+ "epoch": 1.4206642066420665,
1147
+ "grad_norm": 1.3027608934231716,
1148
+ "learning_rate": 1.2641733691873884e-05,
1149
+ "loss": 0.7342,
1150
  "step": 770
1151
  },
1152
  {
1153
+ "epoch": 1.429889298892989,
1154
+ "grad_norm": 1.2668132369825373,
1155
+ "learning_rate": 1.2538029476098175e-05,
1156
+ "loss": 0.7317,
1157
  "step": 775
1158
  },
1159
  {
1160
+ "epoch": 1.4391143911439115,
1161
+ "grad_norm": 1.2498842281077402,
1162
+ "learning_rate": 1.2434032681025986e-05,
1163
+ "loss": 0.732,
1164
  "step": 780
1165
  },
1166
  {
1167
+ "epoch": 1.448339483394834,
1168
+ "grad_norm": 1.221148464370588,
1169
+ "learning_rate": 1.2329755295213568e-05,
1170
+ "loss": 0.7168,
1171
  "step": 785
1172
  },
1173
  {
1174
+ "epoch": 1.4575645756457565,
1175
+ "grad_norm": 1.2029873246463332,
1176
+ "learning_rate": 1.2225209339563144e-05,
1177
+ "loss": 0.7299,
1178
  "step": 790
1179
  },
1180
  {
1181
+ "epoch": 1.466789667896679,
1182
+ "grad_norm": 1.2769506053242343,
1183
+ "learning_rate": 1.2120406865937174e-05,
1184
+ "loss": 0.7385,
1185
  "step": 795
1186
  },
1187
  {
1188
+ "epoch": 1.4760147601476015,
1189
+ "grad_norm": 1.5254063393209267,
1190
+ "learning_rate": 1.2015359955769021e-05,
1191
+ "loss": 0.7319,
1192
+ "step": 800
1193
+ },
1194
+ {
+ "epoch": 1.4760147601476015,
+ "eval_loss": 0.9644125699996948,
+ "eval_runtime": 512.8317,
+ "eval_samples_per_second": 29.932,
+ "eval_steps_per_second": 0.117,
  "step": 800
  },
1202
  {
1203
+ "epoch": 1.485239852398524,
1204
+ "grad_norm": 1.4657220418578245,
1205
+ "learning_rate": 1.1910080718670246e-05,
1206
+ "loss": 0.7234,
1207
  "step": 805
1208
  },
1209
  {
1210
+ "epoch": 1.4944649446494465,
1211
+ "grad_norm": 1.3333083489866098,
1212
+ "learning_rate": 1.1804581291034615e-05,
1213
+ "loss": 0.7314,
1214
  "step": 810
1215
  },
1216
  {
1217
+ "epoch": 1.503690036900369,
1218
+ "grad_norm": 1.3111534531304956,
1219
+ "learning_rate": 1.169887383463906e-05,
1220
+ "loss": 0.7212,
1221
  "step": 815
1222
  },
1223
  {
1224
+ "epoch": 1.5129151291512914,
1225
+ "grad_norm": 1.2536260067392955,
1226
+ "learning_rate": 1.1592970535241668e-05,
1227
+ "loss": 0.723,
1228
  "step": 820
1229
  },
1230
  {
1231
+ "epoch": 1.5221402214022142,
1232
+ "grad_norm": 1.239943596383526,
1233
+ "learning_rate": 1.1486883601176944e-05,
1234
+ "loss": 0.7315,
1235
  "step": 825
1236
  },
1237
  {
1238
+ "epoch": 1.5313653136531364,
1239
+ "grad_norm": 1.188861248391431,
1240
+ "learning_rate": 1.1380625261948458e-05,
1241
+ "loss": 0.7301,
1242
  "step": 830
1243
  },
1244
  {
1245
+ "epoch": 1.5405904059040592,
1246
+ "grad_norm": 1.247650108627454,
1247
+ "learning_rate": 1.127420776681905e-05,
1248
+ "loss": 0.7202,
1249
  "step": 835
1250
  },
1251
  {
1252
+ "epoch": 1.5498154981549814,
1253
+ "grad_norm": 1.4048683840262912,
1254
+ "learning_rate": 1.1167643383398746e-05,
1255
+ "loss": 0.7247,
1256
  "step": 840
1257
  },
1258
  {
1259
+ "epoch": 1.5590405904059041,
1260
+ "grad_norm": 1.2897015340446114,
1261
+ "learning_rate": 1.1060944396230583e-05,
1262
+ "loss": 0.7311,
1263
  "step": 845
1264
  },
1265
  {
1266
+ "epoch": 1.5682656826568264,
1267
+ "grad_norm": 1.21939417183643,
1268
+ "learning_rate": 1.0954123105374468e-05,
1269
+ "loss": 0.7249,
1270
  "step": 850
1271
  },
1272
  {
1273
+ "epoch": 1.5774907749077491,
1274
+ "grad_norm": 1.2309319468475195,
1275
+ "learning_rate": 1.0847191824989252e-05,
1276
+ "loss": 0.7298,
1277
  "step": 855
1278
  },
1279
  {
1280
+ "epoch": 1.5867158671586716,
1281
+ "grad_norm": 1.2218109998078897,
1282
+ "learning_rate": 1.0740162881913165e-05,
1283
+ "loss": 0.7223,
1284
  "step": 860
1285
  },
1286
  {
1287
+ "epoch": 1.5959409594095941,
1288
+ "grad_norm": 1.4183791452745522,
1289
+ "learning_rate": 1.0633048614242817e-05,
1290
+ "loss": 0.7359,
1291
  "step": 865
1292
  },
1293
  {
1294
+ "epoch": 1.6051660516605166,
1295
+ "grad_norm": 1.2210289040303786,
1296
+ "learning_rate": 1.0525861369910877e-05,
1297
+ "loss": 0.7302,
1298
  "step": 870
1299
  },
1300
  {
1301
+ "epoch": 1.6143911439114391,
1302
+ "grad_norm": 1.3175608261808258,
1303
+ "learning_rate": 1.0418613505262623e-05,
1304
+ "loss": 0.7226,
1305
  "step": 875
1306
  },
1307
  {
1308
+ "epoch": 1.6236162361623616,
1309
+ "grad_norm": 1.3018239201611663,
1310
+ "learning_rate": 1.0311317383631532e-05,
1311
+ "loss": 0.7227,
1312
  "step": 880
1313
  },
1314
  {
1315
+ "epoch": 1.632841328413284,
1316
+ "grad_norm": 1.1647552351758403,
1317
+ "learning_rate": 1.0203985373914056e-05,
1318
+ "loss": 0.7204,
1319
  "step": 885
1320
  },
1321
  {
1322
+ "epoch": 1.6420664206642066,
1323
+ "grad_norm": 1.210717925144679,
1324
+ "learning_rate": 1.0096629849143757e-05,
1325
+ "loss": 0.7115,
1326
  "step": 890
1327
  },
1328
  {
1329
+ "epoch": 1.651291512915129,
1330
+ "grad_norm": 1.1959081633999162,
1331
+ "learning_rate": 9.989263185064974e-06,
1332
+ "loss": 0.7164,
1333
  "step": 895
1334
  },
1335
  {
1336
+ "epoch": 1.6605166051660518,
1337
+ "grad_norm": 1.1679984043624778,
1338
+ "learning_rate": 9.881897758706155e-06,
1339
+ "loss": 0.7177,
1340
  "step": 900
1341
  },
1342
  {
+ "epoch": 1.6605166051660518,
+ "eval_loss": 0.9529369473457336,
+ "eval_runtime": 516.4151,
+ "eval_samples_per_second": 29.724,
+ "eval_steps_per_second": 0.116,
+ "step": 900
+ },
1350
+ {
1351
+ "epoch": 1.669741697416974,
1352
+ "grad_norm": 1.1784785526719634,
1353
+ "learning_rate": 9.77454594695308e-06,
1354
+ "loss": 0.7274,
1355
  "step": 905
1356
  },
1357
  {
1358
+ "epoch": 1.6789667896678968,
1359
+ "grad_norm": 1.1964871209199903,
1360
+ "learning_rate": 9.667220125122044e-06,
1361
+ "loss": 0.7119,
1362
  "step": 910
1363
  },
1364
  {
1365
+ "epoch": 1.688191881918819,
1366
+ "grad_norm": 1.173031357661576,
1367
+ "learning_rate": 9.559932665533291e-06,
1368
+ "loss": 0.7134,
1369
  "step": 915
1370
  },
1371
  {
1372
+ "epoch": 1.6974169741697418,
1373
+ "grad_norm": 1.2312863536042935,
1374
+ "learning_rate": 9.452695936084728e-06,
1375
+ "loss": 0.7144,
1376
  "step": 920
1377
  },
1378
  {
1379
+ "epoch": 1.706642066420664,
1380
+ "grad_norm": 1.2013984113686338,
1381
+ "learning_rate": 9.345522298826177e-06,
1382
+ "loss": 0.7146,
1383
  "step": 925
1384
  },
1385
  {
1386
+ "epoch": 1.7158671586715868,
1387
+ "grad_norm": 1.1285995450468198,
1388
+ "learning_rate": 9.238424108534333e-06,
1389
+ "loss": 0.7126,
1390
  "step": 930
1391
  },
1392
  {
1393
+ "epoch": 1.725092250922509,
1394
+ "grad_norm": 1.1727971825533714,
1395
+ "learning_rate": 9.131413711288485e-06,
1396
+ "loss": 0.7173,
1397
  "step": 935
1398
  },
1399
  {
1400
+ "epoch": 1.7343173431734318,
1401
+ "grad_norm": 1.198238879588798,
1402
+ "learning_rate": 9.024503443047318e-06,
1403
+ "loss": 0.7186,
1404
  "step": 940
1405
  },
1406
  {
1407
+ "epoch": 1.7435424354243543,
1408
+ "grad_norm": 1.2092538734459182,
1409
+ "learning_rate": 8.917705628226823e-06,
1410
+ "loss": 0.7064,
1411
  "step": 945
1412
  },
1413
  {
1414
+ "epoch": 1.7527675276752768,
1415
+ "grad_norm": 1.1850959753551464,
1416
+ "learning_rate": 8.81103257827957e-06,
1417
+ "loss": 0.7196,
1418
  "step": 950
1419
  },
1420
  {
1421
+ "epoch": 1.7619926199261993,
1422
+ "grad_norm": 1.1849846233150378,
1423
+ "learning_rate": 8.704496590275479e-06,
1424
+ "loss": 0.7181,
1425
  "step": 955
1426
  },
1427
  {
1428
+ "epoch": 1.7712177121771218,
1429
+ "grad_norm": 1.1192440025321218,
1430
+ "learning_rate": 8.598109945484208e-06,
1431
+ "loss": 0.7127,
1432
  "step": 960
1433
  },
1434
  {
1435
+ "epoch": 1.7804428044280443,
1436
+ "grad_norm": 1.185810311236685,
1437
+ "learning_rate": 8.491884907959426e-06,
1438
+ "loss": 0.7092,
1439
  "step": 965
1440
  },
1441
  {
1442
+ "epoch": 1.7896678966789668,
1443
+ "grad_norm": 1.1653670987242044,
1444
+ "learning_rate": 8.385833723125006e-06,
1445
+ "loss": 0.7115,
1446
  "step": 970
1447
  },
1448
  {
1449
+ "epoch": 1.7988929889298892,
1450
+ "grad_norm": 1.2928934171032893,
1451
+ "learning_rate": 8.279968616363417e-06,
1452
+ "loss": 0.7116,
1453
  "step": 975
1454
  },
1455
  {
1456
+ "epoch": 1.8081180811808117,
1457
+ "grad_norm": 1.1749460908752425,
1458
+ "learning_rate": 8.174301791606384e-06,
1459
+ "loss": 0.7159,
1460
  "step": 980
1461
  },
1462
  {
1463
+ "epoch": 1.8173431734317345,
1464
+ "grad_norm": 1.2968530721458553,
1465
+ "learning_rate": 8.06884542992806e-06,
1466
+ "loss": 0.7022,
1467
  "step": 985
1468
  },
1469
  {
1470
+ "epoch": 1.8265682656826567,
1471
+ "grad_norm": 1.214409149915767,
1472
+ "learning_rate": 7.963611688140814e-06,
1473
+ "loss": 0.705,
1474
  "step": 990
1475
  },
1476
  {
1477
+ "epoch": 1.8357933579335795,
1478
+ "grad_norm": 1.1751136227927774,
1479
+ "learning_rate": 7.858612697393792e-06,
1480
+ "loss": 0.7166,
1481
  "step": 995
1482
  },
1483
  {
1484
+ "epoch": 1.8450184501845017,
1485
+ "grad_norm": 1.2707314516132002,
1486
+ "learning_rate": 7.753860561774495e-06,
1487
+ "loss": 0.7095,
1488
+ "step": 1000
1489
+ },
1490
+ {
+ "epoch": 1.8450184501845017,
+ "eval_loss": 0.9393758773803711,
+ "eval_runtime": 524.5955,
+ "eval_samples_per_second": 29.261,
+ "eval_steps_per_second": 0.114,
  "step": 1000
  },
1498
  {
1499
+ "epoch": 1.8542435424354244,
1500
+ "grad_norm": 1.2737022554438457,
1501
+ "learning_rate": 7.649367356913422e-06,
1502
+ "loss": 0.7133,
1503
  "step": 1005
1504
  },
1505
  {
1506
+ "epoch": 1.8634686346863467,
1507
+ "grad_norm": 1.2146494230865963,
1508
+ "learning_rate": 7.545145128592009e-06,
1509
+ "loss": 0.7162,
1510
  "step": 1010
1511
  },
1512
  {
1513
+ "epoch": 1.8726937269372694,
1514
+ "grad_norm": 1.2563305762066708,
1515
+ "learning_rate": 7.441205891354037e-06,
1516
+ "loss": 0.7128,
1517
  "step": 1015
1518
  },
1519
  {
1520
+ "epoch": 1.881918819188192,
1521
+ "grad_norm": 1.2400110075293442,
1522
+ "learning_rate": 7.337561627120591e-06,
1523
+ "loss": 0.7059,
1524
  "step": 1020
1525
  },
1526
  {
1527
+ "epoch": 1.8911439114391144,
1528
+ "grad_norm": 1.2653437150866325,
1529
+ "learning_rate": 7.234224283808832e-06,
1530
+ "loss": 0.7058,
1531
  "step": 1025
1532
  },
1533
  {
1534
+ "epoch": 1.900369003690037,
1535
+ "grad_norm": 1.1646651085367645,
1536
+ "learning_rate": 7.131205773954636e-06,
1537
+ "loss": 0.706,
1538
  "step": 1030
1539
  },
1540
  {
1541
+ "epoch": 1.9095940959409594,
1542
+ "grad_norm": 1.1518551233990397,
1543
+ "learning_rate": 7.028517973339361e-06,
1544
+ "loss": 0.7138,
1545
  "step": 1035
1546
  },
1547
  {
1548
+ "epoch": 1.918819188191882,
1549
+ "grad_norm": 1.223360815231687,
1550
+ "learning_rate": 6.926172719620827e-06,
1551
+ "loss": 0.697,
1552
  "step": 1040
1553
  },
1554
  {
1555
+ "epoch": 1.9280442804428044,
1556
+ "grad_norm": 1.2198079984824493,
1557
+ "learning_rate": 6.824181810968675e-06,
1558
+ "loss": 0.7004,
1559
  "step": 1045
1560
  },
1561
  {
1562
+ "epoch": 1.937269372693727,
1563
+ "grad_norm": 1.176959664107674,
1564
+ "learning_rate": 6.722557004704322e-06,
1565
+ "loss": 0.7082,
1566
  "step": 1050
1567
  },
1568
  {
1569
+ "epoch": 1.9464944649446494,
1570
+ "grad_norm": 1.1844320699248965,
1571
+ "learning_rate": 6.62131001594558e-06,
1572
+ "loss": 0.7043,
1573
  "step": 1055
1574
  },
1575
  {
1576
+ "epoch": 1.9557195571955721,
1577
+ "grad_norm": 1.148753422424237,
1578
+ "learning_rate": 6.520452516256157e-06,
1579
+ "loss": 0.6949,
1580
  "step": 1060
1581
  },
1582
  {
1583
+ "epoch": 1.9649446494464944,
1584
+ "grad_norm": 1.1572577267352544,
1585
+ "learning_rate": 6.419996132300203e-06,
1586
+ "loss": 0.7071,
1587
  "step": 1065
1588
  },
1589
  {
1590
+ "epoch": 1.974169741697417,
1591
+ "grad_norm": 1.2001014830908205,
1592
+ "learning_rate": 6.319952444501984e-06,
1593
+ "loss": 0.7103,
1594
  "step": 1070
1595
  },
1596
  {
1597
+ "epoch": 1.9833948339483394,
1598
+ "grad_norm": 1.4841715888010063,
1599
+ "learning_rate": 6.220332985710936e-06,
1600
+ "loss": 0.694,
1601
  "step": 1075
1602
  },
1603
  {
1604
+ "epoch": 1.992619926199262,
1605
+ "grad_norm": 1.4256755997357629,
1606
+ "learning_rate": 6.121149239872151e-06,
1607
+ "loss": 0.6964,
1608
  "step": 1080
1609
  },
1610
  {
1611
+ "epoch": 2.0018450184501844,
1612
+ "grad_norm": 4.270149025567802,
1613
+ "learning_rate": 6.0224126407025616e-06,
1614
+ "loss": 0.6543,
1615
+ "step": 1085
1616
+ },
1617
+ {
1618
+ "epoch": 2.011070110701107,
1619
+ "grad_norm": 2.6490744221351044,
1620
+ "learning_rate": 5.924134570372863e-06,
1621
+ "loss": 0.4529,
1622
+ "step": 1090
1623
+ },
1624
+ {
1625
+ "epoch": 2.0202952029520294,
1626
+ "grad_norm": 2.2645999605838227,
1627
+ "learning_rate": 5.826326358195391e-06,
1628
+ "loss": 0.4559,
1629
+ "step": 1095
1630
+ },
1631
+ {
1632
+ "epoch": 2.029520295202952,
1633
+ "grad_norm": 1.5705400512864462,
1634
+ "learning_rate": 5.728999279318131e-06,
1635
+ "loss": 0.4465,
1636
+ "step": 1100
1637
+ },
1638
+ {
+ "epoch": 2.029520295202952,
+ "eval_loss": 0.9917108416557312,
+ "eval_runtime": 517.7798,
+ "eval_samples_per_second": 29.646,
+ "eval_steps_per_second": 0.116,
+ "step": 1100
+ },
1646
+ {
1647
+ "epoch": 2.0387453874538743,
1648
+ "grad_norm": 1.6254518927847355,
1649
+ "learning_rate": 5.632164553424904e-06,
1650
+ "loss": 0.4353,
1651
+ "step": 1105
1652
+ },
1653
+ {
1654
+ "epoch": 2.047970479704797,
1655
+ "grad_norm": 14.583137561537578,
1656
+ "learning_rate": 5.5358333434420054e-06,
1657
+ "loss": 0.4424,
1658
+ "step": 1110
1659
+ },
1660
+ {
1661
+ "epoch": 2.0571955719557193,
1662
+ "grad_norm": 1.447005279720627,
1663
+ "learning_rate": 5.440016754251364e-06,
1664
+ "loss": 0.4423,
1665
+ "step": 1115
1666
+ },
1667
+ {
1668
+ "epoch": 2.066420664206642,
1669
+ "grad_norm": 1.4595204240426687,
1670
+ "learning_rate": 5.344725831410369e-06,
1671
+ "loss": 0.4384,
1672
+ "step": 1120
1673
+ },
1674
+ {
1675
+ "epoch": 2.0756457564575648,
1676
+ "grad_norm": 1.3190598016289843,
1677
+ "learning_rate": 5.24997155987859e-06,
1678
+ "loss": 0.4368,
1679
+ "step": 1125
1680
+ },
1681
+ {
1682
+ "epoch": 2.084870848708487,
1683
+ "grad_norm": 1.322338946677976,
1684
+ "learning_rate": 5.155764862751427e-06,
1685
+ "loss": 0.4392,
1686
+ "step": 1130
1687
+ },
1688
+ {
1689
+ "epoch": 2.0940959409594098,
1690
+ "grad_norm": 1.3472757392525208,
1691
+ "learning_rate": 5.062116600000933e-06,
1692
+ "loss": 0.4297,
1693
+ "step": 1135
1694
+ },
1695
+ {
1696
+ "epoch": 2.103321033210332,
1697
+ "grad_norm": 1.2895577097092337,
1698
+ "learning_rate": 4.969037567223881e-06,
1699
+ "loss": 0.4413,
1700
+ "step": 1140
1701
+ },
1702
+ {
1703
+ "epoch": 2.1125461254612548,
1704
+ "grad_norm": 1.3471090116973288,
1705
+ "learning_rate": 4.876538494397274e-06,
1706
+ "loss": 0.4317,
1707
+ "step": 1145
1708
+ },
1709
+ {
1710
+ "epoch": 2.121771217712177,
1711
+ "grad_norm": 1.3092628602239211,
1712
+ "learning_rate": 4.784630044641435e-06,
1713
+ "loss": 0.4509,
1714
+ "step": 1150
1715
+ },
1716
+ {
1717
+ "epoch": 2.1309963099630997,
1718
+ "grad_norm": 1.344809966917295,
1719
+ "learning_rate": 4.6933228129907395e-06,
1720
+ "loss": 0.4375,
1721
+ "step": 1155
1722
+ },
1723
+ {
1724
+ "epoch": 2.140221402214022,
1725
+ "grad_norm": 1.3014430618254322,
1726
+ "learning_rate": 4.602627325172279e-06,
1727
+ "loss": 0.4424,
1728
+ "step": 1160
1729
+ },
1730
+ {
1731
+ "epoch": 2.1494464944649447,
1732
+ "grad_norm": 1.3672933559982345,
1733
+ "learning_rate": 4.512554036392448e-06,
1734
+ "loss": 0.4419,
1735
+ "step": 1165
1736
+ },
1737
+ {
1738
+ "epoch": 2.158671586715867,
1739
+ "grad_norm": 1.3446667993737584,
1740
+ "learning_rate": 4.423113330131708e-06,
1741
+ "loss": 0.4303,
1742
+ "step": 1170
1743
+ },
1744
+ {
1745
+ "epoch": 2.1678966789667897,
1746
+ "grad_norm": 1.3257443131859206,
1747
+ "learning_rate": 4.33431551694758e-06,
1748
+ "loss": 0.4369,
1749
+ "step": 1175
1750
+ },
1751
+ {
1752
+ "epoch": 2.177121771217712,
1753
+ "grad_norm": 1.3655737456565726,
1754
+ "learning_rate": 4.246170833286075e-06,
1755
+ "loss": 0.4293,
1756
+ "step": 1180
1757
+ },
1758
+ {
1759
+ "epoch": 2.1863468634686347,
1760
+ "grad_norm": 1.3298593125645854,
1761
+ "learning_rate": 4.1586894403016576e-06,
1762
+ "loss": 0.439,
1763
+ "step": 1185
1764
+ },
1765
+ {
1766
+ "epoch": 2.195571955719557,
1767
+ "grad_norm": 1.32505780264794,
1768
+ "learning_rate": 4.071881422685877e-06,
1769
+ "loss": 0.4285,
1770
+ "step": 1190
1771
+ },
1772
+ {
1773
+ "epoch": 2.2047970479704797,
1774
+ "grad_norm": 1.3004312804341762,
1775
+ "learning_rate": 3.985756787504837e-06,
1776
+ "loss": 0.4353,
1777
+ "step": 1195
1778
+ },
1779
+ {
1780
+ "epoch": 2.2140221402214024,
1781
+ "grad_norm": 1.3177561620055287,
1782
+ "learning_rate": 3.9003254630455775e-06,
1783
+ "loss": 0.4341,
1784
+ "step": 1200
1785
+ },
1786
+ {
+ "epoch": 2.2140221402214024,
+ "eval_loss": 0.9978848695755005,
+ "eval_runtime": 514.7843,
+ "eval_samples_per_second": 29.818,
+ "eval_steps_per_second": 0.117,
+ "step": 1200
+ },
1794
+ {
1795
+ "epoch": 2.2232472324723247,
1796
+ "grad_norm": 1.3438896554856818,
1797
+ "learning_rate": 3.815597297671578e-06,
1798
+ "loss": 0.4336,
1799
+ "step": 1205
1800
+ },
1801
+ {
1802
+ "epoch": 2.2324723247232474,
1803
+ "grad_norm": 1.2896295540334282,
1804
+ "learning_rate": 3.731582058687462e-06,
1805
+ "loss": 0.435,
1806
+ "step": 1210
1807
+ },
1808
+ {
1809
+ "epoch": 2.2416974169741697,
1810
+ "grad_norm": 1.358035688644123,
1811
+ "learning_rate": 3.6482894312130146e-06,
1812
+ "loss": 0.4324,
1813
+ "step": 1215
1814
+ },
1815
+ {
1816
+ "epoch": 2.2509225092250924,
1817
+ "grad_norm": 1.312197292051631,
1818
+ "learning_rate": 3.565729017066729e-06,
1819
+ "loss": 0.4315,
1820
+ "step": 1220
1821
+ },
1822
+ {
1823
+ "epoch": 2.2601476014760147,
1824
+ "grad_norm": 1.3227121347141655,
1825
+ "learning_rate": 3.483910333658913e-06,
1826
+ "loss": 0.4364,
1827
+ "step": 1225
1828
+ },
1829
+ {
1830
+ "epoch": 2.2693726937269374,
1831
+ "grad_norm": 1.3256090212374516,
1832
+ "learning_rate": 3.402842812894529e-06,
1833
+ "loss": 0.4356,
1834
+ "step": 1230
1835
+ },
1836
+ {
1837
+ "epoch": 2.2785977859778597,
1838
+ "grad_norm": 1.317549750635349,
1839
+ "learning_rate": 3.3225358000859287e-06,
1840
+ "loss": 0.4349,
1841
+ "step": 1235
1842
+ },
1843
+ {
1844
+ "epoch": 2.2878228782287824,
1845
+ "grad_norm": 1.2612830347481554,
1846
+ "learning_rate": 3.2429985528755127e-06,
1847
+ "loss": 0.4306,
1848
+ "step": 1240
1849
+ },
1850
+ {
1851
+ "epoch": 2.2970479704797047,
1852
+ "grad_norm": 1.3450073317730427,
1853
+ "learning_rate": 3.1642402401685557e-06,
1854
+ "loss": 0.4361,
1855
+ "step": 1245
1856
+ },
1857
+ {
1858
+ "epoch": 2.3062730627306274,
1859
+ "grad_norm": 1.3431835139445107,
1860
+ "learning_rate": 3.0862699410762043e-06,
1861
+ "loss": 0.4393,
1862
+ "step": 1250
1863
+ },
1864
+ {
1865
+ "epoch": 2.3154981549815496,
1866
+ "grad_norm": 1.3379126436430948,
1867
+ "learning_rate": 3.0090966438688774e-06,
1868
+ "loss": 0.4306,
1869
+ "step": 1255
1870
+ },
1871
+ {
1872
+ "epoch": 2.3247232472324724,
1873
+ "grad_norm": 1.2809064467748859,
1874
+ "learning_rate": 2.9327292449401067e-06,
1875
+ "loss": 0.4416,
1876
+ "step": 1260
1877
+ },
1878
+ {
1879
+ "epoch": 2.3339483394833946,
1880
+ "grad_norm": 1.3548015164880183,
1881
+ "learning_rate": 2.8571765477809645e-06,
1882
+ "loss": 0.4338,
1883
+ "step": 1265
1884
+ },
1885
+ {
1886
+ "epoch": 2.3431734317343174,
1887
+ "grad_norm": 1.320665427008479,
1888
+ "learning_rate": 2.7824472619652386e-06,
1889
+ "loss": 0.4361,
1890
+ "step": 1270
1891
+ },
1892
+ {
1893
+ "epoch": 2.35239852398524,
1894
+ "grad_norm": 1.3096646770487193,
1895
+ "learning_rate": 2.7085500021453838e-06,
1896
+ "loss": 0.4294,
1897
+ "step": 1275
1898
+ },
1899
+ {
1900
+ "epoch": 2.3616236162361623,
1901
+ "grad_norm": 1.2800372167523524,
1902
+ "learning_rate": 2.635493287059464e-06,
1903
+ "loss": 0.4299,
1904
+ "step": 1280
1905
+ },
1906
+ {
1907
+ "epoch": 2.3708487084870846,
1908
+ "grad_norm": 1.303993086907089,
1909
+ "learning_rate": 2.563285538549104e-06,
1910
+ "loss": 0.4361,
1911
+ "step": 1285
1912
+ },
1913
+ {
1914
+ "epoch": 2.3800738007380073,
1915
+ "grad_norm": 1.2720280407092956,
1916
+ "learning_rate": 2.491935080588658e-06,
1917
+ "loss": 0.4384,
1918
+ "step": 1290
1919
+ },
1920
+ {
1921
+ "epoch": 2.38929889298893,
1922
+ "grad_norm": 1.2941980810201439,
1923
+ "learning_rate": 2.421450138325625e-06,
1924
+ "loss": 0.4306,
1925
+ "step": 1295
1926
+ },
1927
+ {
1928
+ "epoch": 2.3985239852398523,
1929
+ "grad_norm": 1.2949495993502738,
1930
+ "learning_rate": 2.351838837132464e-06,
1931
+ "loss": 0.432,
1932
+ "step": 1300
1933
+ },
1934
+ {
+ "epoch": 2.3985239852398523,
+ "eval_loss": 0.9954376816749573,
+ "eval_runtime": 519.9495,
+ "eval_samples_per_second": 29.522,
+ "eval_steps_per_second": 0.115,
+ "step": 1300
+ },
1942
+ {
1943
+ "epoch": 2.407749077490775,
1944
+ "grad_norm": 1.3018815365771563,
1945
+ "learning_rate": 2.283109201669936e-06,
1946
+ "loss": 0.4357,
1947
+ "step": 1305
1948
+ },
1949
+ {
1950
+ "epoch": 2.4169741697416973,
1951
+ "grad_norm": 1.2956106687686837,
1952
+ "learning_rate": 2.2152691549620155e-06,
1953
+ "loss": 0.4283,
1954
+ "step": 1310
1955
+ },
1956
+ {
1957
+ "epoch": 2.42619926199262,
1958
+ "grad_norm": 1.287230882437174,
1959
+ "learning_rate": 2.148326517482543e-06,
1960
+ "loss": 0.4303,
1961
+ "step": 1315
1962
+ },
1963
+ {
1964
+ "epoch": 2.4354243542435423,
1965
+ "grad_norm": 1.2592322120333668,
1966
+ "learning_rate": 2.0822890062537106e-06,
1967
+ "loss": 0.4366,
1968
+ "step": 1320
1969
+ },
1970
+ {
1971
+ "epoch": 2.444649446494465,
1972
+ "grad_norm": 1.3039469988205457,
1973
+ "learning_rate": 2.01716423395644e-06,
1974
+ "loss": 0.4317,
1975
+ "step": 1325
1976
+ },
1977
+ {
1978
+ "epoch": 2.4538745387453873,
1979
+ "grad_norm": 1.282772824972497,
1980
+ "learning_rate": 1.9529597080528207e-06,
1981
+ "loss": 0.4272,
1982
+ "step": 1330
1983
+ },
1984
+ {
1985
+ "epoch": 2.46309963099631,
1986
+ "grad_norm": 1.3227463435260074,
1987
+ "learning_rate": 1.8896828299206494e-06,
1988
+ "loss": 0.4256,
1989
+ "step": 1335
1990
+ },
1991
+ {
1992
+ "epoch": 2.4723247232472323,
1993
+ "grad_norm": 1.3607936617452498,
1994
+ "learning_rate": 1.8273408940002202e-06,
1995
+ "loss": 0.4389,
1996
+ "step": 1340
1997
+ },
1998
+ {
1999
+ "epoch": 2.481549815498155,
2000
+ "grad_norm": 1.2740801988744865,
2001
+ "learning_rate": 1.7659410869534466e-06,
2002
+ "loss": 0.4247,
2003
+ "step": 1345
2004
+ },
2005
+ {
2006
+ "epoch": 2.4907749077490777,
2007
+ "grad_norm": 1.2544315701192987,
2008
+ "learning_rate": 1.7054904868353717e-06,
2009
+ "loss": 0.4256,
2010
+ "step": 1350
2011
+ },
2012
+ {
2013
+ "epoch": 2.5,
2014
+ "grad_norm": 1.31550558585801,
2015
+ "learning_rate": 1.6459960622782466e-06,
2016
+ "loss": 0.428,
2017
+ "step": 1355
2018
+ },
2019
+ {
2020
+ "epoch": 2.5092250922509223,
2021
+ "grad_norm": 1.3030144767834306,
2022
+ "learning_rate": 1.587464671688187e-06,
2023
+ "loss": 0.4217,
2024
+ "step": 1360
2025
+ },
2026
+ {
2027
+ "epoch": 2.518450184501845,
2028
+ "grad_norm": 1.261812680015863,
2029
+ "learning_rate": 1.5299030624545563e-06,
2030
+ "loss": 0.4381,
2031
+ "step": 1365
2032
+ },
2033
+ {
2034
+ "epoch": 2.5276752767527677,
2035
+ "grad_norm": 1.3015065571944802,
2036
+ "learning_rate": 1.4733178701721262e-06,
2037
+ "loss": 0.4337,
2038
+ "step": 1370
2039
+ },
2040
+ {
2041
+ "epoch": 2.53690036900369,
2042
+ "grad_norm": 1.2805139778312684,
2043
+ "learning_rate": 1.4177156178761508e-06,
2044
+ "loss": 0.4313,
2045
+ "step": 1375
2046
+ },
2047
+ {
2048
+ "epoch": 2.5461254612546127,
2049
+ "grad_norm": 1.3271791125805354,
2050
+ "learning_rate": 1.363102715290402e-06,
2051
+ "loss": 0.4314,
2052
+ "step": 1380
2053
+ },
2054
+ {
2055
+ "epoch": 2.555350553505535,
2056
+ "grad_norm": 1.3155240192251205,
2057
+ "learning_rate": 1.3094854580882599e-06,
2058
+ "loss": 0.4298,
2059
+ "step": 1385
2060
+ },
2061
+ {
2062
+ "epoch": 2.5645756457564577,
2063
+ "grad_norm": 1.2884517504542843,
2064
+ "learning_rate": 1.2568700271669676e-06,
2065
+ "loss": 0.4315,
2066
+ "step": 1390
2067
+ },
2068
+ {
2069
+ "epoch": 2.57380073800738,
2070
+ "grad_norm": 1.2601572769871257,
2071
+ "learning_rate": 1.2052624879351105e-06,
2072
+ "loss": 0.4341,
2073
+ "step": 1395
2074
+ },
2075
+ {
2076
+ "epoch": 2.5830258302583027,
2077
+ "grad_norm": 1.283042988722646,
2078
+ "learning_rate": 1.1546687896133924e-06,
2079
+ "loss": 0.4301,
2080
+ "step": 1400
2081
+ },
2082
+ {
+ "epoch": 2.5830258302583027,
+ "eval_loss": 0.9943162798881531,
+ "eval_runtime": 513.9906,
+ "eval_samples_per_second": 29.864,
+ "eval_steps_per_second": 0.117,
+ "step": 1400
+ },
2090
+ {
2091
+ "epoch": 2.592250922509225,
2092
+ "grad_norm": 1.269448040169663,
2093
+ "learning_rate": 1.1050947645488419e-06,
2094
+ "loss": 0.424,
2095
+ "step": 1405
2096
+ },
2097
+ {
2098
+ "epoch": 2.6014760147601477,
2099
+ "grad_norm": 1.291108826010762,
2100
+ "learning_rate": 1.0565461275424504e-06,
2101
+ "loss": 0.4288,
2102
+ "step": 1410
2103
+ },
2104
+ {
2105
+ "epoch": 2.61070110701107,
2106
+ "grad_norm": 1.246075371329031,
2107
+ "learning_rate": 1.0090284751903989e-06,
2108
+ "loss": 0.4308,
2109
+ "step": 1415
2110
+ },
2111
+ {
2112
+ "epoch": 2.6199261992619927,
2113
+ "grad_norm": 1.268331381912208,
2114
+ "learning_rate": 9.625472852388739e-07,
2115
+ "loss": 0.4274,
2116
+ "step": 1420
2117
+ },
2118
+ {
2119
+ "epoch": 2.6291512915129154,
2120
+ "grad_norm": 1.2558980878489436,
2121
+ "learning_rate": 9.171079159526186e-07,
2122
+ "loss": 0.4263,
2123
+ "step": 1425
2124
+ },
2125
+ {
2126
+ "epoch": 2.6383763837638377,
2127
+ "grad_norm": 1.2507458001549574,
2128
+ "learning_rate": 8.727156054972374e-07,
2129
+ "loss": 0.4364,
2130
+ "step": 1430
2131
+ },
2132
+ {
2133
+ "epoch": 2.64760147601476,
2134
+ "grad_norm": 1.2344093421817917,
2135
+ "learning_rate": 8.29375471335343e-07,
2136
+ "loss": 0.43,
2137
+ "step": 1435
2138
+ },
2139
+ {
2140
+ "epoch": 2.6568265682656826,
2141
+ "grad_norm": 1.2520176453134155,
2142
+ "learning_rate": 7.870925096366366e-07,
2143
+ "loss": 0.4298,
2144
+ "step": 1440
2145
+ },
2146
+ {
2147
+ "epoch": 2.6660516605166054,
2148
+ "grad_norm": 1.2874930933327957,
2149
+ "learning_rate": 7.458715947019468e-07,
2150
+ "loss": 0.4262,
2151
+ "step": 1445
2152
+ },
2153
+ {
2154
+ "epoch": 2.6752767527675276,
2155
+ "grad_norm": 1.2682188739552445,
2156
+ "learning_rate": 7.057174784013432e-07,
2157
+ "loss": 0.4339,
2158
+ "step": 1450
2159
+ },
2160
+ {
2161
+ "epoch": 2.6845018450184504,
2162
+ "grad_norm": 1.2828645340804818,
2163
+ "learning_rate": 6.666347896263326e-07,
2164
+ "loss": 0.4274,
2165
+ "step": 1455
2166
+ },
2167
+ {
2168
+ "epoch": 2.6937269372693726,
2169
+ "grad_norm": 1.2595258026091076,
2170
+ "learning_rate": 6.286280337562656e-07,
2171
+ "loss": 0.4303,
2172
+ "step": 1460
2173
+ },
2174
+ {
2175
+ "epoch": 2.7029520295202953,
2176
+ "grad_norm": 1.24521822647123,
2177
+ "learning_rate": 5.917015921389569e-07,
2178
+ "loss": 0.4288,
2179
+ "step": 1465
2180
+ },
2181
+ {
2182
+ "epoch": 2.7121771217712176,
2183
+ "grad_norm": 1.232445478302712,
2184
+ "learning_rate": 5.558597215856065e-07,
2185
+ "loss": 0.4285,
2186
+ "step": 1470
2187
+ },
2188
+ {
2189
+ "epoch": 2.7214022140221403,
2190
+ "grad_norm": 1.216057817991593,
2191
+ "learning_rate": 5.211065538800952e-07,
2192
+ "loss": 0.4208,
2193
+ "step": 1475
2194
+ },
2195
+ {
2196
+ "epoch": 2.7306273062730626,
2197
+ "grad_norm": 1.288524367589534,
2198
+ "learning_rate": 4.874460953026705e-07,
2199
+ "loss": 0.4255,
2200
+ "step": 1480
2201
+ },
2202
+ {
2203
+ "epoch": 2.7398523985239853,
2204
+ "grad_norm": 1.2332155213343263,
2205
+ "learning_rate": 4.548822261681107e-07,
2206
+ "loss": 0.423,
2207
+ "step": 1485
2208
+ },
2209
+ {
2210
+ "epoch": 2.7490774907749076,
2211
+ "grad_norm": 1.2278878382563285,
2212
+ "learning_rate": 4.2341870037841516e-07,
2213
+ "loss": 0.4291,
2214
+ "step": 1490
2215
+ },
2216
+ {
2217
+ "epoch": 2.7583025830258303,
2218
+ "grad_norm": 1.262898121860552,
2219
+ "learning_rate": 3.930591449900578e-07,
2220
+ "loss": 0.4247,
2221
+ "step": 1495
2222
+ },
2223
+ {
2224
+ "epoch": 2.767527675276753,
2225
+ "grad_norm": 1.2437619506416164,
2226
+ "learning_rate": 3.638070597958665e-07,
2227
+ "loss": 0.4361,
2228
+ "step": 1500
2229
+ },
2230
+ {
+ "epoch": 2.767527675276753,
+ "eval_loss": 0.9930853247642517,
+ "eval_runtime": 516.1928,
+ "eval_samples_per_second": 29.737,
+ "eval_steps_per_second": 0.116,
+ "step": 1500
+ },
2238
+ {
2239
+ "epoch": 2.7767527675276753,
2240
+ "grad_norm": 1.2468366513522777,
2241
+ "learning_rate": 3.356658169215743e-07,
2242
+ "loss": 0.4282,
2243
+ "step": 1505
2244
+ },
2245
+ {
2246
+ "epoch": 2.7859778597785976,
2247
+ "grad_norm": 1.2336029324910027,
2248
+ "learning_rate": 3.0863866043708393e-07,
2249
+ "loss": 0.4267,
2250
+ "step": 1510
2251
+ },
2252
+ {
2253
+ "epoch": 2.7952029520295203,
2254
+ "grad_norm": 1.3330748292636831,
2255
+ "learning_rate": 2.8272870598250677e-07,
2256
+ "loss": 0.4281,
2257
+ "step": 1515
2258
+ },
2259
+ {
2260
+ "epoch": 2.804428044280443,
2261
+ "grad_norm": 1.2486193575900169,
2262
+ "learning_rate": 2.5793894040898384e-07,
2263
+ "loss": 0.4224,
2264
+ "step": 1520
2265
+ },
2266
+ {
2267
+ "epoch": 2.8136531365313653,
2268
+ "grad_norm": 1.235394179484528,
2269
+ "learning_rate": 2.3427222143438065e-07,
2270
+ "loss": 0.4184,
2271
+ "step": 1525
2272
+ },
2273
+ {
2274
+ "epoch": 2.8228782287822876,
2275
+ "grad_norm": 1.2913244981868073,
2276
+ "learning_rate": 2.117312773138458e-07,
2277
+ "loss": 0.4238,
2278
+ "step": 1530
2279
+ },
2280
+ {
2281
+ "epoch": 2.8321033210332103,
2282
+ "grad_norm": 1.2580451640594703,
2283
+ "learning_rate": 1.903187065253076e-07,
2284
+ "loss": 0.4274,
2285
+ "step": 1535
2286
+ },
2287
+ {
2288
+ "epoch": 2.841328413284133,
2289
+ "grad_norm": 1.262849856657073,
2290
+ "learning_rate": 1.7003697746992398e-07,
2291
+ "loss": 0.4242,
2292
+ "step": 1540
2293
+ },
2294
+ {
2295
+ "epoch": 2.8505535055350553,
2296
+ "grad_norm": 1.2336423601103856,
2297
+ "learning_rate": 1.5088842818752892e-07,
2298
+ "loss": 0.4338,
2299
+ "step": 1545
2300
+ },
2301
+ {
2302
+ "epoch": 2.859778597785978,
2303
+ "grad_norm": 1.279029201429549,
2304
+ "learning_rate": 1.3287526608711132e-07,
2305
+ "loss": 0.4247,
2306
+ "step": 1550
2307
+ },
2308
+ {
2309
+ "epoch": 2.8690036900369003,
2310
+ "grad_norm": 1.2569044993333771,
2311
+ "learning_rate": 1.1599956769234533e-07,
2312
+ "loss": 0.4167,
2313
+ "step": 1555
2314
+ },
2315
+ {
2316
+ "epoch": 2.878228782287823,
2317
+ "grad_norm": 1.229520630461672,
2318
+ "learning_rate": 1.0026327840221728e-07,
2319
+ "loss": 0.4182,
2320
+ "step": 1560
2321
+ },
2322
+ {
2323
+ "epoch": 2.8874538745387452,
2324
+ "grad_norm": 1.255986608003343,
2325
+ "learning_rate": 8.566821226675514e-08,
2326
+ "loss": 0.4294,
2327
+ "step": 1565
2328
+ },
2329
+ {
2330
+ "epoch": 2.896678966789668,
2331
+ "grad_norm": 1.2895814979486142,
2332
+ "learning_rate": 7.22160517779169e-08,
2333
+ "loss": 0.429,
2334
+ "step": 1570
2335
+ },
2336
+ {
2337
+ "epoch": 2.9059040590405907,
2338
+ "grad_norm": 1.2790109206046127,
2339
+ "learning_rate": 5.99083476756357e-08,
2340
+ "loss": 0.4261,
2341
+ "step": 1575
2342
+ },
2343
+ {
2344
+ "epoch": 2.915129151291513,
2345
+ "grad_norm": 1.2194809596900478,
2346
+ "learning_rate": 4.87465187690439e-08,
2347
+ "loss": 0.4211,
2348
+ "step": 1580
2349
+ },
2350
+ {
2351
+ "epoch": 2.9243542435424352,
2352
+ "grad_norm": 1.2665552740156838,
2353
+ "learning_rate": 3.873185177292737e-08,
2354
+ "loss": 0.4251,
2355
+ "step": 1585
2356
+ },
2357
+ {
2358
+ "epoch": 2.933579335793358,
2359
+ "grad_norm": 1.2812371627035533,
2360
+ "learning_rate": 2.9865501159387355e-08,
2361
+ "loss": 0.4282,
2362
+ "step": 1590
2363
+ },
2364
+ {
2365
+ "epoch": 2.9428044280442807,
2366
+ "grad_norm": 1.2399165066075877,
2367
+ "learning_rate": 2.214848902475808e-08,
2368
+ "loss": 0.4341,
2369
+ "step": 1595
2370
+ },
2371
+ {
2372
+ "epoch": 2.952029520295203,
2373
+ "grad_norm": 1.2154194504631015,
2374
+ "learning_rate": 1.558170497178213e-08,
2375
+ "loss": 0.4256,
2376
+ "step": 1600
2377
+ },
2378
+ {
+ "epoch": 2.952029520295203,
+ "eval_loss": 0.9934021830558777,
+ "eval_runtime": 525.4852,
+ "eval_samples_per_second": 29.211,
+ "eval_steps_per_second": 0.114,
+ "step": 1600
+ },
2386
+ {
2387
+ "epoch": 2.961254612546125,
2388
+ "grad_norm": 1.2717521820081574,
2389
+ "learning_rate": 1.0165906007056914e-08,
2390
+ "loss": 0.4323,
2391
+ "step": 1605
2392
+ },
2393
+ {
2394
+ "epoch": 2.970479704797048,
2395
+ "grad_norm": 1.2491830905746684,
2396
+ "learning_rate": 5.901716453770023e-09,
2397
+ "loss": 0.4271,
2398
+ "step": 1610
2399
+ },
2400
+ {
2401
+ "epoch": 2.9797047970479706,
2402
+ "grad_norm": 1.2521953436091506,
2403
+ "learning_rate": 2.7896278797256983e-09,
2404
+ "loss": 0.4256,
2405
+ "step": 1615
2406
+ },
2407
+ {
2408
+ "epoch": 2.988929889298893,
2409
+ "grad_norm": 1.2335508198968657,
2410
+ "learning_rate": 8.299990406823721e-10,
2411
+ "loss": 0.4342,
2412
+ "step": 1620
2413
+ },
2414
+ {
2415
+ "epoch": 2.9981549815498156,
2416
+ "grad_norm": 1.2480273735451688,
2417
+ "learning_rate": 2.3055838990204693e-11,
2418
+ "loss": 0.4266,
2419
+ "step": 1625
2420
  },
  {
+ "epoch": 3.0,
+ "step": 1626,
+ "total_flos": 1361805280542720.0,
+ "train_loss": 0.713569560815634,
+ "train_runtime": 59769.2599,
+ "train_samples_per_second": 6.961,
+ "train_steps_per_second": 0.027
  }
  ],
  "logging_steps": 5,
+ "max_steps": 1626,
  "num_input_tokens_seen": 0,
+ "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
  "TrainerControl": {

  "attributes": {}
  }
  },
+ "total_flos": 1361805280542720.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null