MHGanainy committed on
Commit
b2096f4
1 Parent(s): 6a66f79

MHGanainy/8-clusters-balanced-lex-best-v2-4

Browse files
Files changed (4) hide show
  1. all_results.json +9 -9
  2. eval_results.json +5 -5
  3. train_results.json +4 -4
  4. trainer_state.json +111 -111
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.9224168062210083,
4
- "eval_runtime": 63.692,
5
- "eval_samples_per_second": 15.591,
6
- "eval_steps_per_second": 1.963,
7
- "perplexity": 6.837463340656789,
8
  "total_flos": 1.0003539689472e+17,
9
- "train_loss": 2.061040549059463,
10
- "train_runtime": 1615.6801,
11
- "train_samples_per_second": 6.798,
12
- "train_steps_per_second": 3.399
13
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 1.922713041305542,
4
+ "eval_runtime": 63.7378,
5
+ "eval_samples_per_second": 15.579,
6
+ "eval_steps_per_second": 1.961,
7
+ "perplexity": 6.839489137228699,
8
  "total_flos": 1.0003539689472e+17,
9
+ "train_loss": 2.061430201075499,
10
+ "train_runtime": 1617.6118,
11
+ "train_samples_per_second": 6.79,
12
+ "train_steps_per_second": 3.395
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 1.9224168062210083,
4
- "eval_runtime": 63.692,
5
- "eval_samples_per_second": 15.591,
6
- "eval_steps_per_second": 1.963,
7
- "perplexity": 6.837463340656789
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "eval_loss": 1.922713041305542,
4
+ "eval_runtime": 63.7378,
5
+ "eval_samples_per_second": 15.579,
6
+ "eval_steps_per_second": 1.961,
7
+ "perplexity": 6.839489137228699
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.0003539689472e+17,
4
- "train_loss": 2.061040549059463,
5
- "train_runtime": 1615.6801,
6
- "train_samples_per_second": 6.798,
7
- "train_steps_per_second": 3.399
8
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 1.0003539689472e+17,
4
+ "train_loss": 2.061430201075499,
5
+ "train_runtime": 1617.6118,
6
+ "train_samples_per_second": 6.79,
7
+ "train_steps_per_second": 3.395
8
  }
trainer_state.json CHANGED
@@ -10,398 +10,398 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.01820830298616169,
13
- "grad_norm": 0.22019147872924805,
14
  "learning_rate": 3.642987249544627e-06,
15
  "loss": 2.4398,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.03641660597232338,
20
- "grad_norm": 0.1777569204568863,
21
  "learning_rate": 7.285974499089254e-06,
22
  "loss": 2.4058,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.05462490895848507,
27
- "grad_norm": 0.2692304253578186,
28
  "learning_rate": 1.0928961748633882e-05,
29
  "loss": 2.3914,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.07283321194464676,
34
- "grad_norm": 0.36335742473602295,
35
  "learning_rate": 1.4571948998178507e-05,
36
- "loss": 2.3211,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.09104151493080845,
41
- "grad_norm": 0.5098339319229126,
42
  "learning_rate": 1.8214936247723133e-05,
43
- "loss": 2.2797,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.10924981791697014,
48
- "grad_norm": 0.5050227642059326,
49
  "learning_rate": 1.999474720010985e-05,
50
- "loss": 2.2523,
51
  "step": 600
52
  },
53
  {
54
  "epoch": 0.12745812090313183,
55
- "grad_norm": 0.6238083839416504,
56
  "learning_rate": 1.9953983978532914e-05,
57
- "loss": 2.2214,
58
  "step": 700
59
  },
60
  {
61
  "epoch": 0.14566642388929352,
62
- "grad_norm": 0.6451362371444702,
63
  "learning_rate": 1.987302601308333e-05,
64
- "loss": 2.1591,
65
  "step": 800
66
  },
67
  {
68
  "epoch": 0.1638747268754552,
69
- "grad_norm": 0.6711136102676392,
70
  "learning_rate": 1.9752200216552278e-05,
71
- "loss": 2.1624,
72
  "step": 900
73
  },
74
  {
75
  "epoch": 0.1820830298616169,
76
- "grad_norm": 0.7889479994773865,
77
  "learning_rate": 1.9591994490261997e-05,
78
- "loss": 2.0842,
79
  "step": 1000
80
  },
81
  {
82
  "epoch": 0.20029133284777859,
83
- "grad_norm": 0.8510493040084839,
84
  "learning_rate": 1.9393055753893e-05,
85
- "loss": 2.1171,
86
  "step": 1100
87
  },
88
  {
89
  "epoch": 0.21849963583394028,
90
- "grad_norm": 0.741717517375946,
91
  "learning_rate": 1.915618733318621e-05,
92
- "loss": 2.1071,
93
  "step": 1200
94
  },
95
  {
96
  "epoch": 0.23670793882010197,
97
- "grad_norm": 0.7924832701683044,
98
  "learning_rate": 1.8882345716068708e-05,
99
- "loss": 2.0552,
100
  "step": 1300
101
  },
102
  {
103
  "epoch": 0.25491624180626365,
104
- "grad_norm": 0.8211630582809448,
105
  "learning_rate": 1.8572636690301997e-05,
106
- "loss": 2.0649,
107
  "step": 1400
108
  },
109
  {
110
  "epoch": 0.27312454479242537,
111
- "grad_norm": 0.7808334231376648,
112
  "learning_rate": 1.8228310878249212e-05,
113
- "loss": 2.0604,
114
  "step": 1500
115
  },
116
  {
117
  "epoch": 0.29133284777858703,
118
- "grad_norm": 1.0906625986099243,
119
  "learning_rate": 1.7850758686792054e-05,
120
- "loss": 2.08,
121
  "step": 1600
122
  },
123
  {
124
  "epoch": 0.30954115076474875,
125
- "grad_norm": 0.7606624960899353,
126
  "learning_rate": 1.7441504692790104e-05,
127
- "loss": 2.0447,
128
  "step": 1700
129
  },
130
  {
131
  "epoch": 0.3277494537509104,
132
- "grad_norm": 0.8482190370559692,
133
  "learning_rate": 1.700220148675417e-05,
134
- "loss": 2.0584,
135
  "step": 1800
136
  },
137
  {
138
  "epoch": 0.34595775673707213,
139
- "grad_norm": 0.8170859813690186,
140
  "learning_rate": 1.6534622999593437e-05,
141
- "loss": 2.0788,
142
  "step": 1900
143
  },
144
  {
145
  "epoch": 0.3641660597232338,
146
- "grad_norm": 0.9151561260223389,
147
  "learning_rate": 1.6040657339383255e-05,
148
- "loss": 2.0458,
149
  "step": 2000
150
  },
151
  {
152
  "epoch": 0.3823743627093955,
153
- "grad_norm": 0.7852152585983276,
154
  "learning_rate": 1.5522299167079173e-05,
155
- "loss": 2.0271,
156
  "step": 2100
157
  },
158
  {
159
  "epoch": 0.40058266569555717,
160
- "grad_norm": 1.1423839330673218,
161
  "learning_rate": 1.4981641641964437e-05,
162
- "loss": 2.0153,
163
  "step": 2200
164
  },
165
  {
166
  "epoch": 0.4187909686817189,
167
- "grad_norm": 0.9260895252227783,
168
  "learning_rate": 1.44208679693558e-05,
169
- "loss": 2.038,
170
  "step": 2300
171
  },
172
  {
173
  "epoch": 0.43699927166788055,
174
- "grad_norm": 0.8138246536254883,
175
  "learning_rate": 1.384224258469838e-05,
176
- "loss": 2.0325,
177
  "step": 2400
178
  },
179
  {
180
  "epoch": 0.45520757465404227,
181
- "grad_norm": 0.8293213844299316,
182
  "learning_rate": 1.3248102009648686e-05,
183
- "loss": 1.9852,
184
  "step": 2500
185
  },
186
  {
187
  "epoch": 0.47341587764020393,
188
- "grad_norm": 1.0487293004989624,
189
  "learning_rate": 1.2640845417069571e-05,
190
- "loss": 2.0304,
191
  "step": 2600
192
  },
193
  {
194
  "epoch": 0.49162418062636565,
195
- "grad_norm": 1.4424030780792236,
196
  "learning_rate": 1.2022924943036024e-05,
197
- "loss": 2.0349,
198
  "step": 2700
199
  },
200
  {
201
  "epoch": 0.5098324836125273,
202
- "grad_norm": 0.930513322353363,
203
  "learning_rate": 1.139683578497262e-05,
204
- "loss": 2.0298,
205
  "step": 2800
206
  },
207
  {
208
  "epoch": 0.528040786598689,
209
- "grad_norm": 1.3939330577850342,
210
  "learning_rate": 1.0765106125906782e-05,
211
  "loss": 2.0071,
212
  "step": 2900
213
  },
214
  {
215
  "epoch": 0.5462490895848507,
216
- "grad_norm": 1.0249812602996826,
217
  "learning_rate": 1.0130286925524367e-05,
218
- "loss": 1.9692,
219
  "step": 3000
220
  },
221
  {
222
  "epoch": 0.5644573925710124,
223
- "grad_norm": 0.9683905243873596,
224
  "learning_rate": 9.494941619251817e-06,
225
- "loss": 2.0243,
226
  "step": 3100
227
  },
228
  {
229
  "epoch": 0.5826656955571741,
230
- "grad_norm": 1.1064997911453247,
231
  "learning_rate": 8.861635766960579e-06,
232
- "loss": 1.9983,
233
  "step": 3200
234
  },
235
  {
236
  "epoch": 0.6008739985433358,
237
- "grad_norm": 0.9611035585403442,
238
  "learning_rate": 8.232926693092881e-06,
239
- "loss": 1.9898,
240
  "step": 3300
241
  },
242
  {
243
  "epoch": 0.6190823015294975,
244
- "grad_norm": 1.2558187246322632,
245
  "learning_rate": 7.611353160042658e-06,
246
- "loss": 1.9698,
247
  "step": 3400
248
  },
249
  {
250
  "epoch": 0.6372906045156591,
251
- "grad_norm": 1.1066709756851196,
252
  "learning_rate": 6.99942511649105e-06,
253
- "loss": 2.0468,
254
  "step": 3500
255
  },
256
  {
257
  "epoch": 0.6554989075018208,
258
- "grad_norm": 1.0138076543807983,
259
  "learning_rate": 6.399613562093272e-06,
260
- "loss": 2.0535,
261
  "step": 3600
262
  },
263
  {
264
  "epoch": 0.6737072104879825,
265
- "grad_norm": 1.125917673110962,
266
  "learning_rate": 5.814340569443867e-06,
267
- "loss": 2.009,
268
  "step": 3700
269
  },
270
  {
271
  "epoch": 0.6919155134741443,
272
- "grad_norm": 0.7838294506072998,
273
  "learning_rate": 5.245969503612125e-06,
274
- "loss": 1.9229,
275
  "step": 3800
276
  },
277
  {
278
  "epoch": 0.7101238164603059,
279
- "grad_norm": 1.051643967628479,
280
  "learning_rate": 4.696795478741786e-06,
281
- "loss": 1.9857,
282
  "step": 3900
283
  },
284
  {
285
  "epoch": 0.7283321194464676,
286
- "grad_norm": 0.9563839435577393,
287
  "learning_rate": 4.169036090251809e-06,
288
- "loss": 2.0503,
289
  "step": 4000
290
  },
291
  {
292
  "epoch": 0.7465404224326293,
293
- "grad_norm": 0.8592619299888611,
294
  "learning_rate": 3.6648224600620653e-06,
295
- "loss": 2.0066,
296
  "step": 4100
297
  },
298
  {
299
  "epoch": 0.764748725418791,
300
- "grad_norm": 0.9203991293907166,
301
  "learning_rate": 3.1861906310038825e-06,
302
- "loss": 1.9719,
303
  "step": 4200
304
  },
305
  {
306
  "epoch": 0.7829570284049526,
307
- "grad_norm": 0.9755929112434387,
308
  "learning_rate": 2.735073345165228e-06,
309
- "loss": 1.9785,
310
  "step": 4300
311
  },
312
  {
313
  "epoch": 0.8011653313911143,
314
- "grad_norm": 1.030912160873413,
315
  "learning_rate": 2.313292239370102e-06,
316
- "loss": 2.0122,
317
  "step": 4400
318
  },
319
  {
320
  "epoch": 0.8193736343772761,
321
- "grad_norm": 1.2122974395751953,
322
  "learning_rate": 1.9225504893071823e-06,
323
- "loss": 1.9747,
324
  "step": 4500
325
  },
326
  {
327
  "epoch": 0.8375819373634378,
328
- "grad_norm": 1.0663396120071411,
329
  "learning_rate": 1.5644259320111733e-06,
330
- "loss": 1.9379,
331
  "step": 4600
332
  },
333
  {
334
  "epoch": 0.8557902403495994,
335
- "grad_norm": 0.9615539908409119,
336
  "learning_rate": 1.2403646944686198e-06,
337
  "loss": 1.9893,
338
  "step": 4700
339
  },
340
  {
341
  "epoch": 0.8739985433357611,
342
- "grad_norm": 1.184589147567749,
343
  "learning_rate": 9.516753540762868e-07,
344
- "loss": 1.9812,
345
  "step": 4800
346
  },
347
  {
348
  "epoch": 0.8922068463219228,
349
- "grad_norm": 1.2580232620239258,
350
  "learning_rate": 6.995236545324624e-07,
351
- "loss": 1.948,
352
  "step": 4900
353
  },
354
  {
355
  "epoch": 0.9104151493080845,
356
- "grad_norm": 0.958463728427887,
357
  "learning_rate": 4.849277984987221e-07,
358
- "loss": 1.9752,
359
  "step": 5000
360
  },
361
  {
362
  "epoch": 0.9286234522942461,
363
- "grad_norm": 1.0817508697509766,
364
  "learning_rate": 3.0875433604064976e-07,
365
- "loss": 2.0005,
366
  "step": 5100
367
  },
368
  {
369
  "epoch": 0.9468317552804079,
370
- "grad_norm": 1.1332553625106812,
371
  "learning_rate": 1.7171466545021665e-07,
372
- "loss": 1.9255,
373
  "step": 5200
374
  },
375
  {
376
  "epoch": 0.9650400582665696,
377
- "grad_norm": 0.9218988418579102,
378
  "learning_rate": 7.436216057970735e-08,
379
- "loss": 2.0052,
380
  "step": 5300
381
  },
382
  {
383
  "epoch": 0.9832483612527313,
384
- "grad_norm": 1.2976170778274536,
385
  "learning_rate": 1.708993628716016e-08,
386
- "loss": 2.0214,
387
  "step": 5400
388
  },
389
  {
390
  "epoch": 1.0,
391
- "eval_loss": 1.9224168062210083,
392
- "eval_runtime": 63.6773,
393
- "eval_samples_per_second": 15.594,
394
- "eval_steps_per_second": 1.963,
395
  "step": 5492
396
  },
397
  {
398
  "epoch": 1.0,
399
  "step": 5492,
400
  "total_flos": 1.0003539689472e+17,
401
- "train_loss": 2.061040549059463,
402
- "train_runtime": 1615.6801,
403
- "train_samples_per_second": 6.798,
404
- "train_steps_per_second": 3.399
405
  }
406
  ],
407
  "logging_steps": 100,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.01820830298616169,
13
+ "grad_norm": 0.22140225768089294,
14
  "learning_rate": 3.642987249544627e-06,
15
  "loss": 2.4398,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.03641660597232338,
20
+ "grad_norm": 0.17718623578548431,
21
  "learning_rate": 7.285974499089254e-06,
22
  "loss": 2.4058,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.05462490895848507,
27
+ "grad_norm": 0.26951104402542114,
28
  "learning_rate": 1.0928961748633882e-05,
29
  "loss": 2.3914,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.07283321194464676,
34
+ "grad_norm": 0.36239442229270935,
35
  "learning_rate": 1.4571948998178507e-05,
36
+ "loss": 2.3212,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.09104151493080845,
41
+ "grad_norm": 0.505577564239502,
42
  "learning_rate": 1.8214936247723133e-05,
43
+ "loss": 2.2799,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.10924981791697014,
48
+ "grad_norm": 0.5019211173057556,
49
  "learning_rate": 1.999474720010985e-05,
50
+ "loss": 2.2525,
51
  "step": 600
52
  },
53
  {
54
  "epoch": 0.12745812090313183,
55
+ "grad_norm": 0.6163275837898254,
56
  "learning_rate": 1.9953983978532914e-05,
57
+ "loss": 2.2219,
58
  "step": 700
59
  },
60
  {
61
  "epoch": 0.14566642388929352,
62
+ "grad_norm": 0.6447716355323792,
63
  "learning_rate": 1.987302601308333e-05,
64
+ "loss": 2.1597,
65
  "step": 800
66
  },
67
  {
68
  "epoch": 0.1638747268754552,
69
+ "grad_norm": 0.6667017936706543,
70
  "learning_rate": 1.9752200216552278e-05,
71
+ "loss": 2.1629,
72
  "step": 900
73
  },
74
  {
75
  "epoch": 0.1820830298616169,
76
+ "grad_norm": 0.7740277647972107,
77
  "learning_rate": 1.9591994490261997e-05,
78
+ "loss": 2.0851,
79
  "step": 1000
80
  },
81
  {
82
  "epoch": 0.20029133284777859,
83
+ "grad_norm": 0.8541315197944641,
84
  "learning_rate": 1.9393055753893e-05,
85
+ "loss": 2.1176,
86
  "step": 1100
87
  },
88
  {
89
  "epoch": 0.21849963583394028,
90
+ "grad_norm": 0.7334938645362854,
91
  "learning_rate": 1.915618733318621e-05,
92
+ "loss": 2.1079,
93
  "step": 1200
94
  },
95
  {
96
  "epoch": 0.23670793882010197,
97
+ "grad_norm": 0.7850305438041687,
98
  "learning_rate": 1.8882345716068708e-05,
99
+ "loss": 2.0558,
100
  "step": 1300
101
  },
102
  {
103
  "epoch": 0.25491624180626365,
104
+ "grad_norm": 0.8258129358291626,
105
  "learning_rate": 1.8572636690301997e-05,
106
+ "loss": 2.0657,
107
  "step": 1400
108
  },
109
  {
110
  "epoch": 0.27312454479242537,
111
+ "grad_norm": 0.7766979932785034,
112
  "learning_rate": 1.8228310878249212e-05,
113
+ "loss": 2.061,
114
  "step": 1500
115
  },
116
  {
117
  "epoch": 0.29133284777858703,
118
+ "grad_norm": 1.0766857862472534,
119
  "learning_rate": 1.7850758686792054e-05,
120
+ "loss": 2.0807,
121
  "step": 1600
122
  },
123
  {
124
  "epoch": 0.30954115076474875,
125
+ "grad_norm": 0.7605693936347961,
126
  "learning_rate": 1.7441504692790104e-05,
127
+ "loss": 2.0452,
128
  "step": 1700
129
  },
130
  {
131
  "epoch": 0.3277494537509104,
132
+ "grad_norm": 0.8467194437980652,
133
  "learning_rate": 1.700220148675417e-05,
134
+ "loss": 2.0589,
135
  "step": 1800
136
  },
137
  {
138
  "epoch": 0.34595775673707213,
139
+ "grad_norm": 0.8118539452552795,
140
  "learning_rate": 1.6534622999593437e-05,
141
+ "loss": 2.0793,
142
  "step": 1900
143
  },
144
  {
145
  "epoch": 0.3641660597232338,
146
+ "grad_norm": 0.9152652621269226,
147
  "learning_rate": 1.6040657339383255e-05,
148
+ "loss": 2.0462,
149
  "step": 2000
150
  },
151
  {
152
  "epoch": 0.3823743627093955,
153
+ "grad_norm": 0.7831888794898987,
154
  "learning_rate": 1.5522299167079173e-05,
155
+ "loss": 2.0275,
156
  "step": 2100
157
  },
158
  {
159
  "epoch": 0.40058266569555717,
160
+ "grad_norm": 1.125390887260437,
161
  "learning_rate": 1.4981641641964437e-05,
162
+ "loss": 2.0157,
163
  "step": 2200
164
  },
165
  {
166
  "epoch": 0.4187909686817189,
167
+ "grad_norm": 0.9301549196243286,
168
  "learning_rate": 1.44208679693558e-05,
169
+ "loss": 2.0384,
170
  "step": 2300
171
  },
172
  {
173
  "epoch": 0.43699927166788055,
174
+ "grad_norm": 0.8163829445838928,
175
  "learning_rate": 1.384224258469838e-05,
176
+ "loss": 2.0331,
177
  "step": 2400
178
  },
179
  {
180
  "epoch": 0.45520757465404227,
181
+ "grad_norm": 0.8261762261390686,
182
  "learning_rate": 1.3248102009648686e-05,
183
+ "loss": 1.9856,
184
  "step": 2500
185
  },
186
  {
187
  "epoch": 0.47341587764020393,
188
+ "grad_norm": 1.0488537549972534,
189
  "learning_rate": 1.2640845417069571e-05,
190
+ "loss": 2.031,
191
  "step": 2600
192
  },
193
  {
194
  "epoch": 0.49162418062636565,
195
+ "grad_norm": 1.400754451751709,
196
  "learning_rate": 1.2022924943036024e-05,
197
+ "loss": 2.0352,
198
  "step": 2700
199
  },
200
  {
201
  "epoch": 0.5098324836125273,
202
+ "grad_norm": 0.9314346313476562,
203
  "learning_rate": 1.139683578497262e-05,
204
+ "loss": 2.0303,
205
  "step": 2800
206
  },
207
  {
208
  "epoch": 0.528040786598689,
209
+ "grad_norm": 1.420285701751709,
210
  "learning_rate": 1.0765106125906782e-05,
211
  "loss": 2.0071,
212
  "step": 2900
213
  },
214
  {
215
  "epoch": 0.5462490895848507,
216
+ "grad_norm": 1.0269055366516113,
217
  "learning_rate": 1.0130286925524367e-05,
218
+ "loss": 1.9697,
219
  "step": 3000
220
  },
221
  {
222
  "epoch": 0.5644573925710124,
223
+ "grad_norm": 0.9681551456451416,
224
  "learning_rate": 9.494941619251817e-06,
225
+ "loss": 2.0246,
226
  "step": 3100
227
  },
228
  {
229
  "epoch": 0.5826656955571741,
230
+ "grad_norm": 1.1162570714950562,
231
  "learning_rate": 8.861635766960579e-06,
232
+ "loss": 1.9988,
233
  "step": 3200
234
  },
235
  {
236
  "epoch": 0.6008739985433358,
237
+ "grad_norm": 0.9726856350898743,
238
  "learning_rate": 8.232926693092881e-06,
239
+ "loss": 1.9904,
240
  "step": 3300
241
  },
242
  {
243
  "epoch": 0.6190823015294975,
244
+ "grad_norm": 1.2489055395126343,
245
  "learning_rate": 7.611353160042658e-06,
246
+ "loss": 1.9702,
247
  "step": 3400
248
  },
249
  {
250
  "epoch": 0.6372906045156591,
251
+ "grad_norm": 1.1108543872833252,
252
  "learning_rate": 6.99942511649105e-06,
253
+ "loss": 2.0471,
254
  "step": 3500
255
  },
256
  {
257
  "epoch": 0.6554989075018208,
258
+ "grad_norm": 1.0132871866226196,
259
  "learning_rate": 6.399613562093272e-06,
260
+ "loss": 2.0539,
261
  "step": 3600
262
  },
263
  {
264
  "epoch": 0.6737072104879825,
265
+ "grad_norm": 1.1130536794662476,
266
  "learning_rate": 5.814340569443867e-06,
267
+ "loss": 2.0095,
268
  "step": 3700
269
  },
270
  {
271
  "epoch": 0.6919155134741443,
272
+ "grad_norm": 0.7807123064994812,
273
  "learning_rate": 5.245969503612125e-06,
274
+ "loss": 1.9232,
275
  "step": 3800
276
  },
277
  {
278
  "epoch": 0.7101238164603059,
279
+ "grad_norm": 1.0500433444976807,
280
  "learning_rate": 4.696795478741786e-06,
281
+ "loss": 1.986,
282
  "step": 3900
283
  },
284
  {
285
  "epoch": 0.7283321194464676,
286
+ "grad_norm": 0.9565054774284363,
287
  "learning_rate": 4.169036090251809e-06,
288
+ "loss": 2.0505,
289
  "step": 4000
290
  },
291
  {
292
  "epoch": 0.7465404224326293,
293
+ "grad_norm": 0.8539314270019531,
294
  "learning_rate": 3.6648224600620653e-06,
295
+ "loss": 2.0069,
296
  "step": 4100
297
  },
298
  {
299
  "epoch": 0.764748725418791,
300
+ "grad_norm": 0.9163336157798767,
301
  "learning_rate": 3.1861906310038825e-06,
302
+ "loss": 1.9723,
303
  "step": 4200
304
  },
305
  {
306
  "epoch": 0.7829570284049526,
307
+ "grad_norm": 0.9746565222740173,
308
  "learning_rate": 2.735073345165228e-06,
309
+ "loss": 1.9787,
310
  "step": 4300
311
  },
312
  {
313
  "epoch": 0.8011653313911143,
314
+ "grad_norm": 1.0261231660842896,
315
  "learning_rate": 2.313292239370102e-06,
316
+ "loss": 2.0124,
317
  "step": 4400
318
  },
319
  {
320
  "epoch": 0.8193736343772761,
321
+ "grad_norm": 1.207588791847229,
322
  "learning_rate": 1.9225504893071823e-06,
323
+ "loss": 1.9749,
324
  "step": 4500
325
  },
326
  {
327
  "epoch": 0.8375819373634378,
328
+ "grad_norm": 1.0550912618637085,
329
  "learning_rate": 1.5644259320111733e-06,
330
+ "loss": 1.9382,
331
  "step": 4600
332
  },
333
  {
334
  "epoch": 0.8557902403495994,
335
+ "grad_norm": 0.9657886624336243,
336
  "learning_rate": 1.2403646944686198e-06,
337
  "loss": 1.9893,
338
  "step": 4700
339
  },
340
  {
341
  "epoch": 0.8739985433357611,
342
+ "grad_norm": 1.1723967790603638,
343
  "learning_rate": 9.516753540762868e-07,
344
+ "loss": 1.9816,
345
  "step": 4800
346
  },
347
  {
348
  "epoch": 0.8922068463219228,
349
+ "grad_norm": 1.271134376525879,
350
  "learning_rate": 6.995236545324624e-07,
351
+ "loss": 1.9485,
352
  "step": 4900
353
  },
354
  {
355
  "epoch": 0.9104151493080845,
356
+ "grad_norm": 0.9425839185714722,
357
  "learning_rate": 4.849277984987221e-07,
358
+ "loss": 1.9754,
359
  "step": 5000
360
  },
361
  {
362
  "epoch": 0.9286234522942461,
363
+ "grad_norm": 1.077772617340088,
364
  "learning_rate": 3.0875433604064976e-07,
365
+ "loss": 2.0007,
366
  "step": 5100
367
  },
368
  {
369
  "epoch": 0.9468317552804079,
370
+ "grad_norm": 1.13583242893219,
371
  "learning_rate": 1.7171466545021665e-07,
372
+ "loss": 1.926,
373
  "step": 5200
374
  },
375
  {
376
  "epoch": 0.9650400582665696,
377
+ "grad_norm": 0.9153927564620972,
378
  "learning_rate": 7.436216057970735e-08,
379
+ "loss": 2.0055,
380
  "step": 5300
381
  },
382
  {
383
  "epoch": 0.9832483612527313,
384
+ "grad_norm": 1.2890238761901855,
385
  "learning_rate": 1.708993628716016e-08,
386
+ "loss": 2.0217,
387
  "step": 5400
388
  },
389
  {
390
  "epoch": 1.0,
391
+ "eval_loss": 1.922713041305542,
392
+ "eval_runtime": 63.7429,
393
+ "eval_samples_per_second": 15.578,
394
+ "eval_steps_per_second": 1.961,
395
  "step": 5492
396
  },
397
  {
398
  "epoch": 1.0,
399
  "step": 5492,
400
  "total_flos": 1.0003539689472e+17,
401
+ "train_loss": 2.061430201075499,
402
+ "train_runtime": 1617.6118,
403
+ "train_samples_per_second": 6.79,
404
+ "train_steps_per_second": 3.395
405
  }
406
  ],
407
  "logging_steps": 100,