lkw2024 commited on
Commit
7ad8ca8
·
verified ·
1 Parent(s): 0e39c17

Upload 11 files

Browse files
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba61dc75052f88a032f40def3f6d02dbd22afa95c69eec635c1f840a88ea8677
3
+ size 885119738
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7e2a646ca72077b2f4e495817913734817a4d0b59c43adf947fc829113f0db1
3
+ size 13990
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65fe7bfd1f7a265858dbc13c464e7304a5cf563b67c7f8c8477bb66cb89c2811
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "never_split": null,
51
+ "pad_token": "[PAD]",
52
+ "sep_token": "[SEP]",
53
+ "strip_accents": null,
54
+ "timeout": 60,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
trainer_state.json ADDED
@@ -0,0 +1,714 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8976327575175944,
3
+ "best_model_checkpoint": "./checkpoints/klue_bert-base\\checkpoint-4689",
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 4689,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03198976327575176,
13
+ "grad_norm": 8.30168342590332,
14
+ "learning_rate": 1.9786734911494988e-05,
15
+ "loss": 0.7464,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.06397952655150352,
20
+ "grad_norm": 9.065665245056152,
21
+ "learning_rate": 1.9573469822989978e-05,
22
+ "loss": 0.539,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.09596928982725528,
27
+ "grad_norm": 14.672710418701172,
28
+ "learning_rate": 1.9360204734484968e-05,
29
+ "loss": 0.4953,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.12795905310300704,
34
+ "grad_norm": 11.273133277893066,
35
+ "learning_rate": 1.9146939645979955e-05,
36
+ "loss": 0.4002,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.1599488163787588,
41
+ "grad_norm": 10.856703758239746,
42
+ "learning_rate": 1.8933674557474945e-05,
43
+ "loss": 0.4415,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.19193857965451055,
48
+ "grad_norm": 6.359602451324463,
49
+ "learning_rate": 1.872040946896993e-05,
50
+ "loss": 0.4559,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.22392834293026231,
55
+ "grad_norm": 9.03753662109375,
56
+ "learning_rate": 1.8507144380464918e-05,
57
+ "loss": 0.4145,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.2559181062060141,
62
+ "grad_norm": 10.679155349731445,
63
+ "learning_rate": 1.8293879291959908e-05,
64
+ "loss": 0.3769,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.28790786948176583,
69
+ "grad_norm": 11.417706489562988,
70
+ "learning_rate": 1.8080614203454897e-05,
71
+ "loss": 0.3864,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.3198976327575176,
76
+ "grad_norm": 7.758769512176514,
77
+ "learning_rate": 1.7867349114949884e-05,
78
+ "loss": 0.3608,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.35188739603326935,
83
+ "grad_norm": 10.617319107055664,
84
+ "learning_rate": 1.7654084026444874e-05,
85
+ "loss": 0.4189,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.3838771593090211,
90
+ "grad_norm": 7.695286273956299,
91
+ "learning_rate": 1.744081893793986e-05,
92
+ "loss": 0.3739,
93
+ "step": 600
94
+ },
95
+ {
96
+ "epoch": 0.41586692258477287,
97
+ "grad_norm": 4.655588626861572,
98
+ "learning_rate": 1.7227553849434847e-05,
99
+ "loss": 0.3978,
100
+ "step": 650
101
+ },
102
+ {
103
+ "epoch": 0.44785668586052463,
104
+ "grad_norm": 7.375668525695801,
105
+ "learning_rate": 1.7014288760929837e-05,
106
+ "loss": 0.3624,
107
+ "step": 700
108
+ },
109
+ {
110
+ "epoch": 0.4798464491362764,
111
+ "grad_norm": 8.715352058410645,
112
+ "learning_rate": 1.6801023672424827e-05,
113
+ "loss": 0.3983,
114
+ "step": 750
115
+ },
116
+ {
117
+ "epoch": 0.5118362124120281,
118
+ "grad_norm": 4.942613124847412,
119
+ "learning_rate": 1.6587758583919813e-05,
120
+ "loss": 0.3264,
121
+ "step": 800
122
+ },
123
+ {
124
+ "epoch": 0.5438259756877799,
125
+ "grad_norm": 9.237791061401367,
126
+ "learning_rate": 1.6374493495414803e-05,
127
+ "loss": 0.3811,
128
+ "step": 850
129
+ },
130
+ {
131
+ "epoch": 0.5758157389635317,
132
+ "grad_norm": 10.856432914733887,
133
+ "learning_rate": 1.616122840690979e-05,
134
+ "loss": 0.3965,
135
+ "step": 900
136
+ },
137
+ {
138
+ "epoch": 0.6078055022392834,
139
+ "grad_norm": 7.043545246124268,
140
+ "learning_rate": 1.5947963318404776e-05,
141
+ "loss": 0.3702,
142
+ "step": 950
143
+ },
144
+ {
145
+ "epoch": 0.6397952655150352,
146
+ "grad_norm": 7.454297065734863,
147
+ "learning_rate": 1.5734698229899766e-05,
148
+ "loss": 0.3946,
149
+ "step": 1000
150
+ },
151
+ {
152
+ "epoch": 0.6717850287907869,
153
+ "grad_norm": 17.558452606201172,
154
+ "learning_rate": 1.5521433141394756e-05,
155
+ "loss": 0.3569,
156
+ "step": 1050
157
+ },
158
+ {
159
+ "epoch": 0.7037747920665387,
160
+ "grad_norm": 10.30020809173584,
161
+ "learning_rate": 1.5308168052889743e-05,
162
+ "loss": 0.3621,
163
+ "step": 1100
164
+ },
165
+ {
166
+ "epoch": 0.7357645553422905,
167
+ "grad_norm": 8.323447227478027,
168
+ "learning_rate": 1.5094902964384733e-05,
169
+ "loss": 0.3507,
170
+ "step": 1150
171
+ },
172
+ {
173
+ "epoch": 0.7677543186180422,
174
+ "grad_norm": 5.383088111877441,
175
+ "learning_rate": 1.488163787587972e-05,
176
+ "loss": 0.3927,
177
+ "step": 1200
178
+ },
179
+ {
180
+ "epoch": 0.799744081893794,
181
+ "grad_norm": 6.308414936065674,
182
+ "learning_rate": 1.4668372787374708e-05,
183
+ "loss": 0.3469,
184
+ "step": 1250
185
+ },
186
+ {
187
+ "epoch": 0.8317338451695457,
188
+ "grad_norm": 8.680594444274902,
189
+ "learning_rate": 1.4455107698869698e-05,
190
+ "loss": 0.3442,
191
+ "step": 1300
192
+ },
193
+ {
194
+ "epoch": 0.8637236084452975,
195
+ "grad_norm": 3.560561418533325,
196
+ "learning_rate": 1.4241842610364684e-05,
197
+ "loss": 0.3829,
198
+ "step": 1350
199
+ },
200
+ {
201
+ "epoch": 0.8957133717210493,
202
+ "grad_norm": 8.998127937316895,
203
+ "learning_rate": 1.4028577521859672e-05,
204
+ "loss": 0.3245,
205
+ "step": 1400
206
+ },
207
+ {
208
+ "epoch": 0.927703134996801,
209
+ "grad_norm": 2.6510281562805176,
210
+ "learning_rate": 1.3815312433354662e-05,
211
+ "loss": 0.3591,
212
+ "step": 1450
213
+ },
214
+ {
215
+ "epoch": 0.9596928982725528,
216
+ "grad_norm": 10.488450050354004,
217
+ "learning_rate": 1.3602047344849649e-05,
218
+ "loss": 0.3355,
219
+ "step": 1500
220
+ },
221
+ {
222
+ "epoch": 0.9916826615483045,
223
+ "grad_norm": 6.973298072814941,
224
+ "learning_rate": 1.3388782256344637e-05,
225
+ "loss": 0.3017,
226
+ "step": 1550
227
+ },
228
+ {
229
+ "epoch": 1.0,
230
+ "eval_accuracy": 0.8835572616762636,
231
+ "eval_f1": 0.8734376826237961,
232
+ "eval_loss": 0.3096177279949188,
233
+ "eval_runtime": 70.7512,
234
+ "eval_samples_per_second": 44.183,
235
+ "eval_steps_per_second": 2.77,
236
+ "step": 1563
237
+ },
238
+ {
239
+ "epoch": 1.0236724248240563,
240
+ "grad_norm": 7.356038570404053,
241
+ "learning_rate": 1.3175517167839627e-05,
242
+ "loss": 0.2326,
243
+ "step": 1600
244
+ },
245
+ {
246
+ "epoch": 1.055662188099808,
247
+ "grad_norm": 12.944537162780762,
248
+ "learning_rate": 1.2962252079334613e-05,
249
+ "loss": 0.2559,
250
+ "step": 1650
251
+ },
252
+ {
253
+ "epoch": 1.0876519513755598,
254
+ "grad_norm": 29.164777755737305,
255
+ "learning_rate": 1.2748986990829602e-05,
256
+ "loss": 0.2468,
257
+ "step": 1700
258
+ },
259
+ {
260
+ "epoch": 1.1196417146513116,
261
+ "grad_norm": 4.75251579284668,
262
+ "learning_rate": 1.2535721902324592e-05,
263
+ "loss": 0.2388,
264
+ "step": 1750
265
+ },
266
+ {
267
+ "epoch": 1.1516314779270633,
268
+ "grad_norm": 4.9411773681640625,
269
+ "learning_rate": 1.2322456813819578e-05,
270
+ "loss": 0.2788,
271
+ "step": 1800
272
+ },
273
+ {
274
+ "epoch": 1.183621241202815,
275
+ "grad_norm": 12.609339714050293,
276
+ "learning_rate": 1.2109191725314566e-05,
277
+ "loss": 0.2343,
278
+ "step": 1850
279
+ },
280
+ {
281
+ "epoch": 1.2156110044785668,
282
+ "grad_norm": 6.632409572601318,
283
+ "learning_rate": 1.1895926636809556e-05,
284
+ "loss": 0.2845,
285
+ "step": 1900
286
+ },
287
+ {
288
+ "epoch": 1.2476007677543186,
289
+ "grad_norm": 13.227570533752441,
290
+ "learning_rate": 1.1682661548304543e-05,
291
+ "loss": 0.2484,
292
+ "step": 1950
293
+ },
294
+ {
295
+ "epoch": 1.2795905310300704,
296
+ "grad_norm": 9.043356895446777,
297
+ "learning_rate": 1.1469396459799531e-05,
298
+ "loss": 0.2781,
299
+ "step": 2000
300
+ },
301
+ {
302
+ "epoch": 1.3115802943058221,
303
+ "grad_norm": 1.4863983392715454,
304
+ "learning_rate": 1.1256131371294521e-05,
305
+ "loss": 0.2486,
306
+ "step": 2050
307
+ },
308
+ {
309
+ "epoch": 1.3435700575815739,
310
+ "grad_norm": 11.874574661254883,
311
+ "learning_rate": 1.1042866282789508e-05,
312
+ "loss": 0.2319,
313
+ "step": 2100
314
+ },
315
+ {
316
+ "epoch": 1.3755598208573256,
317
+ "grad_norm": 10.544360160827637,
318
+ "learning_rate": 1.0829601194284496e-05,
319
+ "loss": 0.2629,
320
+ "step": 2150
321
+ },
322
+ {
323
+ "epoch": 1.4075495841330774,
324
+ "grad_norm": 1.597615122795105,
325
+ "learning_rate": 1.0616336105779486e-05,
326
+ "loss": 0.2424,
327
+ "step": 2200
328
+ },
329
+ {
330
+ "epoch": 1.4395393474088292,
331
+ "grad_norm": 6.933614730834961,
332
+ "learning_rate": 1.0403071017274472e-05,
333
+ "loss": 0.2596,
334
+ "step": 2250
335
+ },
336
+ {
337
+ "epoch": 1.471529110684581,
338
+ "grad_norm": 2.243976593017578,
339
+ "learning_rate": 1.018980592876946e-05,
340
+ "loss": 0.2746,
341
+ "step": 2300
342
+ },
343
+ {
344
+ "epoch": 1.5035188739603327,
345
+ "grad_norm": 3.0834619998931885,
346
+ "learning_rate": 9.976540840264449e-06,
347
+ "loss": 0.2143,
348
+ "step": 2350
349
+ },
350
+ {
351
+ "epoch": 1.5355086372360844,
352
+ "grad_norm": 21.907583236694336,
353
+ "learning_rate": 9.763275751759437e-06,
354
+ "loss": 0.2311,
355
+ "step": 2400
356
+ },
357
+ {
358
+ "epoch": 1.5674984005118362,
359
+ "grad_norm": 1.5895259380340576,
360
+ "learning_rate": 9.550010663254427e-06,
361
+ "loss": 0.2487,
362
+ "step": 2450
363
+ },
364
+ {
365
+ "epoch": 1.599488163787588,
366
+ "grad_norm": 10.364677429199219,
367
+ "learning_rate": 9.336745574749414e-06,
368
+ "loss": 0.2623,
369
+ "step": 2500
370
+ },
371
+ {
372
+ "epoch": 1.6314779270633397,
373
+ "grad_norm": 10.95889663696289,
374
+ "learning_rate": 9.123480486244403e-06,
375
+ "loss": 0.2233,
376
+ "step": 2550
377
+ },
378
+ {
379
+ "epoch": 1.6634676903390915,
380
+ "grad_norm": 21.338903427124023,
381
+ "learning_rate": 8.910215397739392e-06,
382
+ "loss": 0.25,
383
+ "step": 2600
384
+ },
385
+ {
386
+ "epoch": 1.6954574536148432,
387
+ "grad_norm": 5.0549139976501465,
388
+ "learning_rate": 8.696950309234378e-06,
389
+ "loss": 0.2466,
390
+ "step": 2650
391
+ },
392
+ {
393
+ "epoch": 1.727447216890595,
394
+ "grad_norm": 5.613836765289307,
395
+ "learning_rate": 8.483685220729368e-06,
396
+ "loss": 0.2603,
397
+ "step": 2700
398
+ },
399
+ {
400
+ "epoch": 1.7594369801663468,
401
+ "grad_norm": 11.103569030761719,
402
+ "learning_rate": 8.270420132224356e-06,
403
+ "loss": 0.259,
404
+ "step": 2750
405
+ },
406
+ {
407
+ "epoch": 1.7914267434420985,
408
+ "grad_norm": 3.3913040161132812,
409
+ "learning_rate": 8.057155043719343e-06,
410
+ "loss": 0.2458,
411
+ "step": 2800
412
+ },
413
+ {
414
+ "epoch": 1.8234165067178503,
415
+ "grad_norm": 0.5389057993888855,
416
+ "learning_rate": 7.843889955214333e-06,
417
+ "loss": 0.2366,
418
+ "step": 2850
419
+ },
420
+ {
421
+ "epoch": 1.855406269993602,
422
+ "grad_norm": 8.419164657592773,
423
+ "learning_rate": 7.630624866709321e-06,
424
+ "loss": 0.2716,
425
+ "step": 2900
426
+ },
427
+ {
428
+ "epoch": 1.8873960332693538,
429
+ "grad_norm": 5.326470375061035,
430
+ "learning_rate": 7.4173597782043085e-06,
431
+ "loss": 0.2302,
432
+ "step": 2950
433
+ },
434
+ {
435
+ "epoch": 1.9193857965451055,
436
+ "grad_norm": 6.134453296661377,
437
+ "learning_rate": 7.204094689699297e-06,
438
+ "loss": 0.2236,
439
+ "step": 3000
440
+ },
441
+ {
442
+ "epoch": 1.9513755598208573,
443
+ "grad_norm": 13.053471565246582,
444
+ "learning_rate": 6.990829601194286e-06,
445
+ "loss": 0.2549,
446
+ "step": 3050
447
+ },
448
+ {
449
+ "epoch": 1.983365323096609,
450
+ "grad_norm": 16.08262825012207,
451
+ "learning_rate": 6.777564512689273e-06,
452
+ "loss": 0.2065,
453
+ "step": 3100
454
+ },
455
+ {
456
+ "epoch": 2.0,
457
+ "eval_accuracy": 0.891234804862444,
458
+ "eval_f1": 0.8816158583539794,
459
+ "eval_loss": 0.32736125588417053,
460
+ "eval_runtime": 69.7253,
461
+ "eval_samples_per_second": 44.833,
462
+ "eval_steps_per_second": 2.811,
463
+ "step": 3126
464
+ },
465
+ {
466
+ "epoch": 2.015355086372361,
467
+ "grad_norm": 12.55622386932373,
468
+ "learning_rate": 6.5642994241842614e-06,
469
+ "loss": 0.1925,
470
+ "step": 3150
471
+ },
472
+ {
473
+ "epoch": 2.0473448496481126,
474
+ "grad_norm": 9.97314167022705,
475
+ "learning_rate": 6.3510343356792505e-06,
476
+ "loss": 0.1697,
477
+ "step": 3200
478
+ },
479
+ {
480
+ "epoch": 2.0793346129238643,
481
+ "grad_norm": 13.542362213134766,
482
+ "learning_rate": 6.137769247174238e-06,
483
+ "loss": 0.1759,
484
+ "step": 3250
485
+ },
486
+ {
487
+ "epoch": 2.111324376199616,
488
+ "grad_norm": 2.1672303676605225,
489
+ "learning_rate": 5.924504158669226e-06,
490
+ "loss": 0.163,
491
+ "step": 3300
492
+ },
493
+ {
494
+ "epoch": 2.143314139475368,
495
+ "grad_norm": 10.153268814086914,
496
+ "learning_rate": 5.711239070164215e-06,
497
+ "loss": 0.152,
498
+ "step": 3350
499
+ },
500
+ {
501
+ "epoch": 2.1753039027511196,
502
+ "grad_norm": 30.200542449951172,
503
+ "learning_rate": 5.497973981659203e-06,
504
+ "loss": 0.2203,
505
+ "step": 3400
506
+ },
507
+ {
508
+ "epoch": 2.2072936660268714,
509
+ "grad_norm": 7.482994079589844,
510
+ "learning_rate": 5.284708893154191e-06,
511
+ "loss": 0.1699,
512
+ "step": 3450
513
+ },
514
+ {
515
+ "epoch": 2.239283429302623,
516
+ "grad_norm": 18.278284072875977,
517
+ "learning_rate": 5.07144380464918e-06,
518
+ "loss": 0.1624,
519
+ "step": 3500
520
+ },
521
+ {
522
+ "epoch": 2.271273192578375,
523
+ "grad_norm": 15.992013931274414,
524
+ "learning_rate": 4.858178716144167e-06,
525
+ "loss": 0.1354,
526
+ "step": 3550
527
+ },
528
+ {
529
+ "epoch": 2.3032629558541267,
530
+ "grad_norm": 8.946208953857422,
531
+ "learning_rate": 4.644913627639156e-06,
532
+ "loss": 0.176,
533
+ "step": 3600
534
+ },
535
+ {
536
+ "epoch": 2.3352527191298784,
537
+ "grad_norm": 0.24096710979938507,
538
+ "learning_rate": 4.431648539134144e-06,
539
+ "loss": 0.1859,
540
+ "step": 3650
541
+ },
542
+ {
543
+ "epoch": 2.36724248240563,
544
+ "grad_norm": 1.182032585144043,
545
+ "learning_rate": 4.218383450629132e-06,
546
+ "loss": 0.1643,
547
+ "step": 3700
548
+ },
549
+ {
550
+ "epoch": 2.399232245681382,
551
+ "grad_norm": 14.64736270904541,
552
+ "learning_rate": 4.005118362124121e-06,
553
+ "loss": 0.1802,
554
+ "step": 3750
555
+ },
556
+ {
557
+ "epoch": 2.4312220089571337,
558
+ "grad_norm": 28.632488250732422,
559
+ "learning_rate": 3.791853273619109e-06,
560
+ "loss": 0.1754,
561
+ "step": 3800
562
+ },
563
+ {
564
+ "epoch": 2.4632117722328855,
565
+ "grad_norm": 12.297914505004883,
566
+ "learning_rate": 3.5785881851140968e-06,
567
+ "loss": 0.1654,
568
+ "step": 3850
569
+ },
570
+ {
571
+ "epoch": 2.495201535508637,
572
+ "grad_norm": 8.145957946777344,
573
+ "learning_rate": 3.3653230966090854e-06,
574
+ "loss": 0.1754,
575
+ "step": 3900
576
+ },
577
+ {
578
+ "epoch": 2.527191298784389,
579
+ "grad_norm": 1.9005142450332642,
580
+ "learning_rate": 3.1520580081040737e-06,
581
+ "loss": 0.1292,
582
+ "step": 3950
583
+ },
584
+ {
585
+ "epoch": 2.5591810620601407,
586
+ "grad_norm": 23.53730010986328,
587
+ "learning_rate": 2.9387929195990615e-06,
588
+ "loss": 0.1386,
589
+ "step": 4000
590
+ },
591
+ {
592
+ "epoch": 2.5911708253358925,
593
+ "grad_norm": 2.064774751663208,
594
+ "learning_rate": 2.72552783109405e-06,
595
+ "loss": 0.1667,
596
+ "step": 4050
597
+ },
598
+ {
599
+ "epoch": 2.6231605886116443,
600
+ "grad_norm": 34.96377182006836,
601
+ "learning_rate": 2.5122627425890384e-06,
602
+ "loss": 0.2021,
603
+ "step": 4100
604
+ },
605
+ {
606
+ "epoch": 2.655150351887396,
607
+ "grad_norm": 20.485219955444336,
608
+ "learning_rate": 2.2989976540840266e-06,
609
+ "loss": 0.1614,
610
+ "step": 4150
611
+ },
612
+ {
613
+ "epoch": 2.6871401151631478,
614
+ "grad_norm": 13.900922775268555,
615
+ "learning_rate": 2.085732565579015e-06,
616
+ "loss": 0.1185,
617
+ "step": 4200
618
+ },
619
+ {
620
+ "epoch": 2.7191298784388995,
621
+ "grad_norm": 0.96761155128479,
622
+ "learning_rate": 1.872467477074003e-06,
623
+ "loss": 0.1539,
624
+ "step": 4250
625
+ },
626
+ {
627
+ "epoch": 2.7511196417146513,
628
+ "grad_norm": 1.9782915115356445,
629
+ "learning_rate": 1.6592023885689915e-06,
630
+ "loss": 0.1883,
631
+ "step": 4300
632
+ },
633
+ {
634
+ "epoch": 2.783109404990403,
635
+ "grad_norm": 14.365890502929688,
636
+ "learning_rate": 1.4459373000639796e-06,
637
+ "loss": 0.2019,
638
+ "step": 4350
639
+ },
640
+ {
641
+ "epoch": 2.815099168266155,
642
+ "grad_norm": 17.920162200927734,
643
+ "learning_rate": 1.2326722115589678e-06,
644
+ "loss": 0.1614,
645
+ "step": 4400
646
+ },
647
+ {
648
+ "epoch": 2.8470889315419066,
649
+ "grad_norm": 7.565415859222412,
650
+ "learning_rate": 1.019407123053956e-06,
651
+ "loss": 0.1581,
652
+ "step": 4450
653
+ },
654
+ {
655
+ "epoch": 2.8790786948176583,
656
+ "grad_norm": 4.440770149230957,
657
+ "learning_rate": 8.061420345489445e-07,
658
+ "loss": 0.1759,
659
+ "step": 4500
660
+ },
661
+ {
662
+ "epoch": 2.91106845809341,
663
+ "grad_norm": 14.82466983795166,
664
+ "learning_rate": 5.928769460439326e-07,
665
+ "loss": 0.1288,
666
+ "step": 4550
667
+ },
668
+ {
669
+ "epoch": 2.943058221369162,
670
+ "grad_norm": 15.672795295715332,
671
+ "learning_rate": 3.796118575389209e-07,
672
+ "loss": 0.205,
673
+ "step": 4600
674
+ },
675
+ {
676
+ "epoch": 2.9750479846449136,
677
+ "grad_norm": 13.70653247833252,
678
+ "learning_rate": 1.6634676903390917e-07,
679
+ "loss": 0.1756,
680
+ "step": 4650
681
+ },
682
+ {
683
+ "epoch": 3.0,
684
+ "eval_accuracy": 0.8976327575175944,
685
+ "eval_f1": 0.8871674993824495,
686
+ "eval_loss": 0.3958832323551178,
687
+ "eval_runtime": 71.1516,
688
+ "eval_samples_per_second": 43.934,
689
+ "eval_steps_per_second": 2.755,
690
+ "step": 4689
691
+ }
692
+ ],
693
+ "logging_steps": 50,
694
+ "max_steps": 4689,
695
+ "num_input_tokens_seen": 0,
696
+ "num_train_epochs": 3,
697
+ "save_steps": 500,
698
+ "stateful_callbacks": {
699
+ "TrainerControl": {
700
+ "args": {
701
+ "should_epoch_stop": false,
702
+ "should_evaluate": false,
703
+ "should_log": false,
704
+ "should_save": true,
705
+ "should_training_stop": true
706
+ },
707
+ "attributes": {}
708
+ }
709
+ },
710
+ "total_flos": 824050668588336.0,
711
+ "train_batch_size": 16,
712
+ "trial_name": null,
713
+ "trial_params": null
714
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d56306f6462f0be32250e5f56819e3714e0d37c2aaf00f00d6f23a0f925745e
3
+ size 5176
vocab.txt ADDED
The diff for this file is too large to render. See raw diff