KaushiGihan commited on
Commit
4d269d1
·
verified ·
1 Parent(s): c3de9df

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +8 -0
  2. train_results.json +8 -0
  3. trainer_state.json +862 -0
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9259259259259259,
3
+ "total_flos": 1.9112276736e+17,
4
+ "train_loss": 1.9073539078235626,
5
+ "train_runtime": 260.6876,
6
+ "train_samples_per_second": 1.534,
7
+ "train_steps_per_second": 0.384
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9259259259259259,
3
+ "total_flos": 1.9112276736e+17,
4
+ "train_loss": 1.9073539078235626,
5
+ "train_runtime": 260.6876,
6
+ "train_samples_per_second": 1.534,
7
+ "train_steps_per_second": 0.384
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,862 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9259259259259259,
5
+ "eval_steps": 20,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009259259259259259,
13
+ "grad_norm": NaN,
14
+ "learning_rate": 1e-05,
15
+ "loss": 1.9801,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.018518518518518517,
20
+ "grad_norm": NaN,
21
+ "learning_rate": 1e-05,
22
+ "loss": 1.7876,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.027777777777777776,
27
+ "grad_norm": Infinity,
28
+ "learning_rate": 1e-05,
29
+ "loss": 2.3855,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.037037037037037035,
34
+ "grad_norm": 129.57530212402344,
35
+ "learning_rate": 9.9e-06,
36
+ "loss": 1.8948,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.046296296296296294,
41
+ "grad_norm": Infinity,
42
+ "learning_rate": 9.9e-06,
43
+ "loss": 2.1217,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.05555555555555555,
48
+ "grad_norm": 96.91162872314453,
49
+ "learning_rate": 9.800000000000001e-06,
50
+ "loss": 2.0581,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.06481481481481481,
55
+ "grad_norm": 40.41334533691406,
56
+ "learning_rate": 9.7e-06,
57
+ "loss": 1.7127,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.07407407407407407,
62
+ "grad_norm": 89.87907409667969,
63
+ "learning_rate": 9.600000000000001e-06,
64
+ "loss": 1.9292,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.08333333333333333,
69
+ "grad_norm": 52.121402740478516,
70
+ "learning_rate": 9.5e-06,
71
+ "loss": 1.9921,
72
+ "step": 9
73
+ },
74
+ {
75
+ "epoch": 0.09259259259259259,
76
+ "grad_norm": 321.693115234375,
77
+ "learning_rate": 9.4e-06,
78
+ "loss": 2.1328,
79
+ "step": 10
80
+ },
81
+ {
82
+ "epoch": 0.10185185185185185,
83
+ "grad_norm": 125.0765609741211,
84
+ "learning_rate": 9.3e-06,
85
+ "loss": 1.986,
86
+ "step": 11
87
+ },
88
+ {
89
+ "epoch": 0.1111111111111111,
90
+ "grad_norm": 110.41716766357422,
91
+ "learning_rate": 9.200000000000002e-06,
92
+ "loss": 1.8335,
93
+ "step": 12
94
+ },
95
+ {
96
+ "epoch": 0.12037037037037036,
97
+ "grad_norm": 77.33905792236328,
98
+ "learning_rate": 9.100000000000001e-06,
99
+ "loss": 1.9346,
100
+ "step": 13
101
+ },
102
+ {
103
+ "epoch": 0.12962962962962962,
104
+ "grad_norm": 89.59822845458984,
105
+ "learning_rate": 9e-06,
106
+ "loss": 2.0646,
107
+ "step": 14
108
+ },
109
+ {
110
+ "epoch": 0.1388888888888889,
111
+ "grad_norm": 23.786359786987305,
112
+ "learning_rate": 8.900000000000001e-06,
113
+ "loss": 1.9532,
114
+ "step": 15
115
+ },
116
+ {
117
+ "epoch": 0.14814814814814814,
118
+ "grad_norm": 49.40592956542969,
119
+ "learning_rate": 8.8e-06,
120
+ "loss": 1.9619,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.1574074074074074,
125
+ "grad_norm": 146.65223693847656,
126
+ "learning_rate": 8.700000000000001e-06,
127
+ "loss": 1.9551,
128
+ "step": 17
129
+ },
130
+ {
131
+ "epoch": 0.16666666666666666,
132
+ "grad_norm": Infinity,
133
+ "learning_rate": 8.700000000000001e-06,
134
+ "loss": 1.6949,
135
+ "step": 18
136
+ },
137
+ {
138
+ "epoch": 0.17592592592592593,
139
+ "grad_norm": 236.78709411621094,
140
+ "learning_rate": 8.6e-06,
141
+ "loss": 1.8608,
142
+ "step": 19
143
+ },
144
+ {
145
+ "epoch": 0.18518518518518517,
146
+ "grad_norm": 184.51332092285156,
147
+ "learning_rate": 8.5e-06,
148
+ "loss": 1.9155,
149
+ "step": 20
150
+ },
151
+ {
152
+ "epoch": 0.18518518518518517,
153
+ "eval_loss": 1.8037505149841309,
154
+ "eval_map": 0.0047,
155
+ "eval_map_50": 0.0101,
156
+ "eval_map_75": 0.005,
157
+ "eval_map_Adult": 0.0081,
158
+ "eval_map_Kid": 0.0013,
159
+ "eval_map_large": 0.0071,
160
+ "eval_map_medium": 0.0035,
161
+ "eval_map_small": -1.0,
162
+ "eval_mar_1": 0.0332,
163
+ "eval_mar_10": 0.1231,
164
+ "eval_mar_100": 0.3206,
165
+ "eval_mar_100_Adult": 0.4727,
166
+ "eval_mar_100_Kid": 0.1685,
167
+ "eval_mar_large": 0.3067,
168
+ "eval_mar_medium": 0.4545,
169
+ "eval_mar_small": -1.0,
170
+ "eval_runtime": 15.3961,
171
+ "eval_samples_per_second": 4.936,
172
+ "eval_steps_per_second": 1.234,
173
+ "step": 20
174
+ },
175
+ {
176
+ "epoch": 0.19444444444444445,
177
+ "grad_norm": 163.67559814453125,
178
+ "learning_rate": 8.400000000000001e-06,
179
+ "loss": 2.1412,
180
+ "step": 21
181
+ },
182
+ {
183
+ "epoch": 0.2037037037037037,
184
+ "grad_norm": 24.548583984375,
185
+ "learning_rate": 8.3e-06,
186
+ "loss": 1.7217,
187
+ "step": 22
188
+ },
189
+ {
190
+ "epoch": 0.21296296296296297,
191
+ "grad_norm": 124.01173400878906,
192
+ "learning_rate": 8.2e-06,
193
+ "loss": 2.0238,
194
+ "step": 23
195
+ },
196
+ {
197
+ "epoch": 0.2222222222222222,
198
+ "grad_norm": 29.773996353149414,
199
+ "learning_rate": 8.1e-06,
200
+ "loss": 1.995,
201
+ "step": 24
202
+ },
203
+ {
204
+ "epoch": 0.23148148148148148,
205
+ "grad_norm": 294.4190368652344,
206
+ "learning_rate": 8.000000000000001e-06,
207
+ "loss": 1.7983,
208
+ "step": 25
209
+ },
210
+ {
211
+ "epoch": 0.24074074074074073,
212
+ "grad_norm": 478.20721435546875,
213
+ "learning_rate": 7.9e-06,
214
+ "loss": 2.1716,
215
+ "step": 26
216
+ },
217
+ {
218
+ "epoch": 0.25,
219
+ "grad_norm": 57.03529357910156,
220
+ "learning_rate": 7.800000000000002e-06,
221
+ "loss": 1.9222,
222
+ "step": 27
223
+ },
224
+ {
225
+ "epoch": 0.25925925925925924,
226
+ "grad_norm": 83.11935424804688,
227
+ "learning_rate": 7.7e-06,
228
+ "loss": 2.0409,
229
+ "step": 28
230
+ },
231
+ {
232
+ "epoch": 0.26851851851851855,
233
+ "grad_norm": 177.90359497070312,
234
+ "learning_rate": 7.600000000000001e-06,
235
+ "loss": 1.8105,
236
+ "step": 29
237
+ },
238
+ {
239
+ "epoch": 0.2777777777777778,
240
+ "grad_norm": 93.07716369628906,
241
+ "learning_rate": 7.500000000000001e-06,
242
+ "loss": 1.935,
243
+ "step": 30
244
+ },
245
+ {
246
+ "epoch": 0.28703703703703703,
247
+ "grad_norm": 146.4728546142578,
248
+ "learning_rate": 7.4e-06,
249
+ "loss": 1.6811,
250
+ "step": 31
251
+ },
252
+ {
253
+ "epoch": 0.2962962962962963,
254
+ "grad_norm": 223.94830322265625,
255
+ "learning_rate": 7.3e-06,
256
+ "loss": 1.922,
257
+ "step": 32
258
+ },
259
+ {
260
+ "epoch": 0.3055555555555556,
261
+ "grad_norm": 153.93211364746094,
262
+ "learning_rate": 7.2000000000000005e-06,
263
+ "loss": 1.9116,
264
+ "step": 33
265
+ },
266
+ {
267
+ "epoch": 0.3148148148148148,
268
+ "grad_norm": 193.51223754882812,
269
+ "learning_rate": 7.100000000000001e-06,
270
+ "loss": 2.1864,
271
+ "step": 34
272
+ },
273
+ {
274
+ "epoch": 0.32407407407407407,
275
+ "grad_norm": 590.421875,
276
+ "learning_rate": 7e-06,
277
+ "loss": 1.8323,
278
+ "step": 35
279
+ },
280
+ {
281
+ "epoch": 0.3333333333333333,
282
+ "grad_norm": 194.01913452148438,
283
+ "learning_rate": 6.9e-06,
284
+ "loss": 1.8585,
285
+ "step": 36
286
+ },
287
+ {
288
+ "epoch": 0.3425925925925926,
289
+ "grad_norm": 45.26372146606445,
290
+ "learning_rate": 6.800000000000001e-06,
291
+ "loss": 2.0296,
292
+ "step": 37
293
+ },
294
+ {
295
+ "epoch": 0.35185185185185186,
296
+ "grad_norm": 206.16380310058594,
297
+ "learning_rate": 6.700000000000001e-06,
298
+ "loss": 1.8509,
299
+ "step": 38
300
+ },
301
+ {
302
+ "epoch": 0.3611111111111111,
303
+ "grad_norm": 162.83180236816406,
304
+ "learning_rate": 6.600000000000001e-06,
305
+ "loss": 1.7652,
306
+ "step": 39
307
+ },
308
+ {
309
+ "epoch": 0.37037037037037035,
310
+ "grad_norm": 55.78132629394531,
311
+ "learning_rate": 6.5000000000000004e-06,
312
+ "loss": 1.8395,
313
+ "step": 40
314
+ },
315
+ {
316
+ "epoch": 0.37037037037037035,
317
+ "eval_loss": 1.7978708744049072,
318
+ "eval_map": 0.0045,
319
+ "eval_map_50": 0.0107,
320
+ "eval_map_75": 0.0035,
321
+ "eval_map_Adult": 0.0076,
322
+ "eval_map_Kid": 0.0014,
323
+ "eval_map_large": 0.0072,
324
+ "eval_map_medium": 0.0042,
325
+ "eval_map_small": -1.0,
326
+ "eval_mar_1": 0.0223,
327
+ "eval_mar_10": 0.1147,
328
+ "eval_mar_100": 0.3081,
329
+ "eval_mar_100_Adult": 0.4436,
330
+ "eval_mar_100_Kid": 0.1726,
331
+ "eval_mar_large": 0.2926,
332
+ "eval_mar_medium": 0.4909,
333
+ "eval_mar_small": -1.0,
334
+ "eval_runtime": 14.768,
335
+ "eval_samples_per_second": 5.146,
336
+ "eval_steps_per_second": 1.287,
337
+ "step": 40
338
+ },
339
+ {
340
+ "epoch": 0.37962962962962965,
341
+ "grad_norm": 60.19484329223633,
342
+ "learning_rate": 6.4000000000000006e-06,
343
+ "loss": 1.6552,
344
+ "step": 41
345
+ },
346
+ {
347
+ "epoch": 0.3888888888888889,
348
+ "grad_norm": 179.31629943847656,
349
+ "learning_rate": 6.300000000000001e-06,
350
+ "loss": 1.7017,
351
+ "step": 42
352
+ },
353
+ {
354
+ "epoch": 0.39814814814814814,
355
+ "grad_norm": 296.96734619140625,
356
+ "learning_rate": 6.200000000000001e-06,
357
+ "loss": 1.9784,
358
+ "step": 43
359
+ },
360
+ {
361
+ "epoch": 0.4074074074074074,
362
+ "grad_norm": 32.73106384277344,
363
+ "learning_rate": 6.1e-06,
364
+ "loss": 2.0133,
365
+ "step": 44
366
+ },
367
+ {
368
+ "epoch": 0.4166666666666667,
369
+ "grad_norm": 76.0937271118164,
370
+ "learning_rate": 6e-06,
371
+ "loss": 2.1171,
372
+ "step": 45
373
+ },
374
+ {
375
+ "epoch": 0.42592592592592593,
376
+ "grad_norm": 41.131385803222656,
377
+ "learning_rate": 5.9e-06,
378
+ "loss": 2.0797,
379
+ "step": 46
380
+ },
381
+ {
382
+ "epoch": 0.4351851851851852,
383
+ "grad_norm": 722.7262573242188,
384
+ "learning_rate": 5.8e-06,
385
+ "loss": 2.6376,
386
+ "step": 47
387
+ },
388
+ {
389
+ "epoch": 0.4444444444444444,
390
+ "grad_norm": 661.7726440429688,
391
+ "learning_rate": 5.7e-06,
392
+ "loss": 2.1092,
393
+ "step": 48
394
+ },
395
+ {
396
+ "epoch": 0.4537037037037037,
397
+ "grad_norm": 86.95410919189453,
398
+ "learning_rate": 5.600000000000001e-06,
399
+ "loss": 1.8785,
400
+ "step": 49
401
+ },
402
+ {
403
+ "epoch": 0.46296296296296297,
404
+ "grad_norm": 168.14027404785156,
405
+ "learning_rate": 5.500000000000001e-06,
406
+ "loss": 1.9761,
407
+ "step": 50
408
+ },
409
+ {
410
+ "epoch": 0.4722222222222222,
411
+ "grad_norm": Infinity,
412
+ "learning_rate": 5.500000000000001e-06,
413
+ "loss": 1.742,
414
+ "step": 51
415
+ },
416
+ {
417
+ "epoch": 0.48148148148148145,
418
+ "grad_norm": 25.475540161132812,
419
+ "learning_rate": 5.400000000000001e-06,
420
+ "loss": 1.922,
421
+ "step": 52
422
+ },
423
+ {
424
+ "epoch": 0.49074074074074076,
425
+ "grad_norm": 54.6159553527832,
426
+ "learning_rate": 5.300000000000001e-06,
427
+ "loss": 1.649,
428
+ "step": 53
429
+ },
430
+ {
431
+ "epoch": 0.5,
432
+ "grad_norm": 67.34464263916016,
433
+ "learning_rate": 5.2e-06,
434
+ "loss": 2.0628,
435
+ "step": 54
436
+ },
437
+ {
438
+ "epoch": 0.5092592592592593,
439
+ "grad_norm": 99.4066162109375,
440
+ "learning_rate": 5.1e-06,
441
+ "loss": 1.9407,
442
+ "step": 55
443
+ },
444
+ {
445
+ "epoch": 0.5185185185185185,
446
+ "grad_norm": 68.25653076171875,
447
+ "learning_rate": 5e-06,
448
+ "loss": 1.9328,
449
+ "step": 56
450
+ },
451
+ {
452
+ "epoch": 0.5277777777777778,
453
+ "grad_norm": 398.8371276855469,
454
+ "learning_rate": 4.9000000000000005e-06,
455
+ "loss": 1.8278,
456
+ "step": 57
457
+ },
458
+ {
459
+ "epoch": 0.5370370370370371,
460
+ "grad_norm": 49.0096435546875,
461
+ "learning_rate": 4.800000000000001e-06,
462
+ "loss": 1.9959,
463
+ "step": 58
464
+ },
465
+ {
466
+ "epoch": 0.5462962962962963,
467
+ "grad_norm": 310.508544921875,
468
+ "learning_rate": 4.7e-06,
469
+ "loss": 1.6916,
470
+ "step": 59
471
+ },
472
+ {
473
+ "epoch": 0.5555555555555556,
474
+ "grad_norm": 260.59576416015625,
475
+ "learning_rate": 4.600000000000001e-06,
476
+ "loss": 1.8477,
477
+ "step": 60
478
+ },
479
+ {
480
+ "epoch": 0.5555555555555556,
481
+ "eval_loss": 1.765649437904358,
482
+ "eval_map": 0.0046,
483
+ "eval_map_50": 0.0107,
484
+ "eval_map_75": 0.0041,
485
+ "eval_map_Adult": 0.0072,
486
+ "eval_map_Kid": 0.002,
487
+ "eval_map_large": 0.0077,
488
+ "eval_map_medium": 0.0044,
489
+ "eval_map_small": -1.0,
490
+ "eval_mar_1": 0.0329,
491
+ "eval_mar_10": 0.1148,
492
+ "eval_mar_100": 0.3276,
493
+ "eval_mar_100_Adult": 0.4309,
494
+ "eval_mar_100_Kid": 0.2242,
495
+ "eval_mar_large": 0.3137,
496
+ "eval_mar_medium": 0.5091,
497
+ "eval_mar_small": -1.0,
498
+ "eval_runtime": 14.8896,
499
+ "eval_samples_per_second": 5.104,
500
+ "eval_steps_per_second": 1.276,
501
+ "step": 60
502
+ },
503
+ {
504
+ "epoch": 0.5648148148148148,
505
+ "grad_norm": 65.56439971923828,
506
+ "learning_rate": 4.5e-06,
507
+ "loss": 1.9681,
508
+ "step": 61
509
+ },
510
+ {
511
+ "epoch": 0.5740740740740741,
512
+ "grad_norm": 99.23474884033203,
513
+ "learning_rate": 4.4e-06,
514
+ "loss": 1.7314,
515
+ "step": 62
516
+ },
517
+ {
518
+ "epoch": 0.5833333333333334,
519
+ "grad_norm": 118.4346694946289,
520
+ "learning_rate": 4.3e-06,
521
+ "loss": 1.6602,
522
+ "step": 63
523
+ },
524
+ {
525
+ "epoch": 0.5925925925925926,
526
+ "grad_norm": 43.443115234375,
527
+ "learning_rate": 4.2000000000000004e-06,
528
+ "loss": 2.0101,
529
+ "step": 64
530
+ },
531
+ {
532
+ "epoch": 0.6018518518518519,
533
+ "grad_norm": 112.58702850341797,
534
+ "learning_rate": 4.1e-06,
535
+ "loss": 1.656,
536
+ "step": 65
537
+ },
538
+ {
539
+ "epoch": 0.6111111111111112,
540
+ "grad_norm": 60.914520263671875,
541
+ "learning_rate": 4.000000000000001e-06,
542
+ "loss": 1.7922,
543
+ "step": 66
544
+ },
545
+ {
546
+ "epoch": 0.6203703703703703,
547
+ "grad_norm": 92.44080352783203,
548
+ "learning_rate": 3.900000000000001e-06,
549
+ "loss": 1.5629,
550
+ "step": 67
551
+ },
552
+ {
553
+ "epoch": 0.6296296296296297,
554
+ "grad_norm": 1000.2711181640625,
555
+ "learning_rate": 3.8000000000000005e-06,
556
+ "loss": 2.3308,
557
+ "step": 68
558
+ },
559
+ {
560
+ "epoch": 0.6388888888888888,
561
+ "grad_norm": 132.41334533691406,
562
+ "learning_rate": 3.7e-06,
563
+ "loss": 1.7623,
564
+ "step": 69
565
+ },
566
+ {
567
+ "epoch": 0.6481481481481481,
568
+ "grad_norm": 631.0505981445312,
569
+ "learning_rate": 3.6000000000000003e-06,
570
+ "loss": 2.043,
571
+ "step": 70
572
+ },
573
+ {
574
+ "epoch": 0.6574074074074074,
575
+ "grad_norm": 230.10476684570312,
576
+ "learning_rate": 3.5e-06,
577
+ "loss": 1.9894,
578
+ "step": 71
579
+ },
580
+ {
581
+ "epoch": 0.6666666666666666,
582
+ "grad_norm": 68.68940734863281,
583
+ "learning_rate": 3.4000000000000005e-06,
584
+ "loss": 1.8069,
585
+ "step": 72
586
+ },
587
+ {
588
+ "epoch": 0.6759259259259259,
589
+ "grad_norm": 310.83233642578125,
590
+ "learning_rate": 3.3000000000000006e-06,
591
+ "loss": 1.8285,
592
+ "step": 73
593
+ },
594
+ {
595
+ "epoch": 0.6851851851851852,
596
+ "grad_norm": 97.42311096191406,
597
+ "learning_rate": 3.2000000000000003e-06,
598
+ "loss": 1.8649,
599
+ "step": 74
600
+ },
601
+ {
602
+ "epoch": 0.6944444444444444,
603
+ "grad_norm": 119.45216369628906,
604
+ "learning_rate": 3.1000000000000004e-06,
605
+ "loss": 1.6971,
606
+ "step": 75
607
+ },
608
+ {
609
+ "epoch": 0.7037037037037037,
610
+ "grad_norm": 63.03942108154297,
611
+ "learning_rate": 3e-06,
612
+ "loss": 1.8963,
613
+ "step": 76
614
+ },
615
+ {
616
+ "epoch": 0.7129629629629629,
617
+ "grad_norm": 69.41495513916016,
618
+ "learning_rate": 2.9e-06,
619
+ "loss": 1.551,
620
+ "step": 77
621
+ },
622
+ {
623
+ "epoch": 0.7222222222222222,
624
+ "grad_norm": 90.6502685546875,
625
+ "learning_rate": 2.8000000000000003e-06,
626
+ "loss": 1.6615,
627
+ "step": 78
628
+ },
629
+ {
630
+ "epoch": 0.7314814814814815,
631
+ "grad_norm": 111.7878646850586,
632
+ "learning_rate": 2.7000000000000004e-06,
633
+ "loss": 2.3283,
634
+ "step": 79
635
+ },
636
+ {
637
+ "epoch": 0.7407407407407407,
638
+ "grad_norm": 89.25751495361328,
639
+ "learning_rate": 2.6e-06,
640
+ "loss": 1.66,
641
+ "step": 80
642
+ },
643
+ {
644
+ "epoch": 0.7407407407407407,
645
+ "eval_loss": 1.7439719438552856,
646
+ "eval_map": 0.0044,
647
+ "eval_map_50": 0.0098,
648
+ "eval_map_75": 0.0039,
649
+ "eval_map_Adult": 0.0062,
650
+ "eval_map_Kid": 0.0026,
651
+ "eval_map_large": 0.0076,
652
+ "eval_map_medium": 0.0049,
653
+ "eval_map_small": -1.0,
654
+ "eval_mar_1": 0.0256,
655
+ "eval_mar_10": 0.1029,
656
+ "eval_mar_100": 0.3266,
657
+ "eval_mar_100_Adult": 0.4,
658
+ "eval_mar_100_Kid": 0.2532,
659
+ "eval_mar_large": 0.3137,
660
+ "eval_mar_medium": 0.5182,
661
+ "eval_mar_small": -1.0,
662
+ "eval_runtime": 15.2125,
663
+ "eval_samples_per_second": 4.996,
664
+ "eval_steps_per_second": 1.249,
665
+ "step": 80
666
+ },
667
+ {
668
+ "epoch": 0.75,
669
+ "grad_norm": 87.07488250732422,
670
+ "learning_rate": 2.5e-06,
671
+ "loss": 1.7118,
672
+ "step": 81
673
+ },
674
+ {
675
+ "epoch": 0.7592592592592593,
676
+ "grad_norm": 59.08213424682617,
677
+ "learning_rate": 2.4000000000000003e-06,
678
+ "loss": 1.6906,
679
+ "step": 82
680
+ },
681
+ {
682
+ "epoch": 0.7685185185185185,
683
+ "grad_norm": 45.39196014404297,
684
+ "learning_rate": 2.3000000000000004e-06,
685
+ "loss": 1.7661,
686
+ "step": 83
687
+ },
688
+ {
689
+ "epoch": 0.7777777777777778,
690
+ "grad_norm": 42.13712692260742,
691
+ "learning_rate": 2.2e-06,
692
+ "loss": 2.0699,
693
+ "step": 84
694
+ },
695
+ {
696
+ "epoch": 0.7870370370370371,
697
+ "grad_norm": 355.4501647949219,
698
+ "learning_rate": 2.1000000000000002e-06,
699
+ "loss": 1.6448,
700
+ "step": 85
701
+ },
702
+ {
703
+ "epoch": 0.7962962962962963,
704
+ "grad_norm": 42.52521514892578,
705
+ "learning_rate": 2.0000000000000003e-06,
706
+ "loss": 1.8531,
707
+ "step": 86
708
+ },
709
+ {
710
+ "epoch": 0.8055555555555556,
711
+ "grad_norm": 90.40050506591797,
712
+ "learning_rate": 1.9000000000000002e-06,
713
+ "loss": 1.9209,
714
+ "step": 87
715
+ },
716
+ {
717
+ "epoch": 0.8148148148148148,
718
+ "grad_norm": Infinity,
719
+ "learning_rate": 1.9000000000000002e-06,
720
+ "loss": 2.0899,
721
+ "step": 88
722
+ },
723
+ {
724
+ "epoch": 0.8240740740740741,
725
+ "grad_norm": 58.782588958740234,
726
+ "learning_rate": 1.8000000000000001e-06,
727
+ "loss": 1.7296,
728
+ "step": 89
729
+ },
730
+ {
731
+ "epoch": 0.8333333333333334,
732
+ "grad_norm": 45.47020721435547,
733
+ "learning_rate": 1.7000000000000002e-06,
734
+ "loss": 1.6152,
735
+ "step": 90
736
+ },
737
+ {
738
+ "epoch": 0.8425925925925926,
739
+ "grad_norm": 25.518877029418945,
740
+ "learning_rate": 1.6000000000000001e-06,
741
+ "loss": 1.6979,
742
+ "step": 91
743
+ },
744
+ {
745
+ "epoch": 0.8518518518518519,
746
+ "grad_norm": 87.5730972290039,
747
+ "learning_rate": 1.5e-06,
748
+ "loss": 2.2574,
749
+ "step": 92
750
+ },
751
+ {
752
+ "epoch": 0.8611111111111112,
753
+ "grad_norm": 39.618709564208984,
754
+ "learning_rate": 1.4000000000000001e-06,
755
+ "loss": 2.0495,
756
+ "step": 93
757
+ },
758
+ {
759
+ "epoch": 0.8703703703703703,
760
+ "grad_norm": 206.44537353515625,
761
+ "learning_rate": 1.3e-06,
762
+ "loss": 2.2438,
763
+ "step": 94
764
+ },
765
+ {
766
+ "epoch": 0.8796296296296297,
767
+ "grad_norm": 163.28404235839844,
768
+ "learning_rate": 1.2000000000000002e-06,
769
+ "loss": 1.7708,
770
+ "step": 95
771
+ },
772
+ {
773
+ "epoch": 0.8888888888888888,
774
+ "grad_norm": 103.78318786621094,
775
+ "learning_rate": 1.1e-06,
776
+ "loss": 1.9995,
777
+ "step": 96
778
+ },
779
+ {
780
+ "epoch": 0.8981481481481481,
781
+ "grad_norm": 150.3301239013672,
782
+ "learning_rate": 1.0000000000000002e-06,
783
+ "loss": 1.8358,
784
+ "step": 97
785
+ },
786
+ {
787
+ "epoch": 0.9074074074074074,
788
+ "grad_norm": 1041.284423828125,
789
+ "learning_rate": 9.000000000000001e-07,
790
+ "loss": 1.6731,
791
+ "step": 98
792
+ },
793
+ {
794
+ "epoch": 0.9166666666666666,
795
+ "grad_norm": 290.3284912109375,
796
+ "learning_rate": 8.000000000000001e-07,
797
+ "loss": 1.9554,
798
+ "step": 99
799
+ },
800
+ {
801
+ "epoch": 0.9259259259259259,
802
+ "grad_norm": 25.19146728515625,
803
+ "learning_rate": 7.000000000000001e-07,
804
+ "loss": 2.0088,
805
+ "step": 100
806
+ },
807
+ {
808
+ "epoch": 0.9259259259259259,
809
+ "eval_loss": 1.7495548725128174,
810
+ "eval_map": 0.0043,
811
+ "eval_map_50": 0.0096,
812
+ "eval_map_75": 0.0039,
813
+ "eval_map_Adult": 0.0056,
814
+ "eval_map_Kid": 0.0029,
815
+ "eval_map_large": 0.0073,
816
+ "eval_map_medium": 0.0051,
817
+ "eval_map_small": -1.0,
818
+ "eval_mar_1": 0.0175,
819
+ "eval_mar_10": 0.0983,
820
+ "eval_mar_100": 0.3165,
821
+ "eval_mar_100_Adult": 0.3636,
822
+ "eval_mar_100_Kid": 0.2694,
823
+ "eval_mar_large": 0.3035,
824
+ "eval_mar_medium": 0.5364,
825
+ "eval_mar_small": -1.0,
826
+ "eval_runtime": 15.2134,
827
+ "eval_samples_per_second": 4.996,
828
+ "eval_steps_per_second": 1.249,
829
+ "step": 100
830
+ },
831
+ {
832
+ "epoch": 0.9259259259259259,
833
+ "step": 100,
834
+ "total_flos": 1.9112276736e+17,
835
+ "train_loss": 1.9073539078235626,
836
+ "train_runtime": 260.6876,
837
+ "train_samples_per_second": 1.534,
838
+ "train_steps_per_second": 0.384
839
+ }
840
+ ],
841
+ "logging_steps": 1,
842
+ "max_steps": 100,
843
+ "num_input_tokens_seen": 0,
844
+ "num_train_epochs": 1,
845
+ "save_steps": 10,
846
+ "stateful_callbacks": {
847
+ "TrainerControl": {
848
+ "args": {
849
+ "should_epoch_stop": false,
850
+ "should_evaluate": false,
851
+ "should_log": false,
852
+ "should_save": true,
853
+ "should_training_stop": true
854
+ },
855
+ "attributes": {}
856
+ }
857
+ },
858
+ "total_flos": 1.9112276736e+17,
859
+ "train_batch_size": 4,
860
+ "trial_name": null,
861
+ "trial_params": null
862
+ }