yuyang committed on
Commit
96db823
1 Parent(s): 54b0187

upload checkpoint files

Browse files
config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "yuyang/bart-large-cnn-finetuned-newsroom",
3
+ "_num_labels": 3,
4
+ "activation_dropout": 0.0,
5
+ "activation_function": "gelu",
6
+ "add_final_layer_norm": false,
7
+ "architectures": [
8
+ "NFBart"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "bos_token_id": 0,
12
+ "classif_dropout": 0.0,
13
+ "classifier_dropout": 0.0,
14
+ "d_model": 1024,
15
+ "decoder_attention_heads": 16,
16
+ "decoder_ffn_dim": 4096,
17
+ "decoder_layerdrop": 0.0,
18
+ "decoder_layers": 12,
19
+ "decoder_start_token_id": 2,
20
+ "dropout": 0.1,
21
+ "early_stopping": true,
22
+ "encoder_attention_heads": 16,
23
+ "encoder_ffn_dim": 4096,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 12,
26
+ "eos_token_id": 2,
27
+ "force_bos_token_to_be_generated": true,
28
+ "forced_bos_token_id": 0,
29
+ "forced_eos_token_id": 2,
30
+ "gradient_checkpointing": false,
31
+ "id2label": {
32
+ "0": "LABEL_0",
33
+ "1": "LABEL_1",
34
+ "2": "LABEL_2"
35
+ },
36
+ "init_std": 0.02,
37
+ "is_encoder_decoder": true,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1,
41
+ "LABEL_2": 2
42
+ },
43
+ "length_penalty": 2.0,
44
+ "max_length": 142,
45
+ "max_position_embeddings": 1024,
46
+ "min_length": 56,
47
+ "model_type": "bart",
48
+ "no_repeat_ngram_size": 3,
49
+ "normalize_before": false,
50
+ "num_beams": 4,
51
+ "num_hidden_layers": 12,
52
+ "output_past": true,
53
+ "pad_token_id": 1,
54
+ "prefix": " ",
55
+ "scale_embedding": false,
56
+ "task_specific_params": {
57
+ "summarization": {
58
+ "early_stopping": true,
59
+ "length_penalty": 2.0,
60
+ "max_length": 142,
61
+ "min_length": 56,
62
+ "no_repeat_ngram_size": 3,
63
+ "num_beams": 4
64
+ }
65
+ },
66
+ "torch_dtype": "float32",
67
+ "transformers_version": "4.27.1",
68
+ "use_cache": true,
69
+ "vocab_size": 50265
70
+ }
generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "early_stopping": true,
6
+ "eos_token_id": 2,
7
+ "forced_bos_token_id": 0,
8
+ "forced_eos_token_id": 2,
9
+ "length_penalty": 2.0,
10
+ "max_length": 142,
11
+ "min_length": 56,
12
+ "no_repeat_ngram_size": 3,
13
+ "num_beams": 4,
14
+ "pad_token_id": 1,
15
+ "transformers_version": "4.27.1"
16
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dc78b5a8775d3396e1aa6e5fb384135b8eef8afbc44f42f1d82aea3f7f0c543
3
+ size 1660501409
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "errors": "replace",
7
+ "mask_token": "<mask>",
8
+ "model_max_length": 1000000000000000019884624838656,
9
+ "pad_token": "<pad>",
10
+ "sep_token": "</s>",
11
+ "special_tokens_map_file": "/home/shenx/yang6367/.cache/huggingface/hub/models--yuyang--bart-large-cnn-finetuned-newsroom/snapshots/5684014ca8aaf3e5e5604206981dca7a86f6f265/special_tokens_map.json",
12
+ "tokenizer_class": "BartTokenizer",
13
+ "trim_offsets": true,
14
+ "unk_token": "<unk>"
15
+ }
trainer_state.json ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 5.6603827476501465,
3
+ "best_model_checkpoint": "../checkpoints/nf-bart-newsroom-rqnsf/checkpoint-60000",
4
+ "epoch": 0.9648008490247472,
5
+ "global_step": 60000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.03,
12
+ "gate_score": 0.371,
13
+ "learning_rate": 4.9725e-05,
14
+ "loss": 592.8312,
15
+ "nf_loss": 591.7519,
16
+ "ppl": 3.3519,
17
+ "step": 2000
18
+ },
19
+ {
20
+ "epoch": 0.06,
21
+ "gate_score": 0.2007,
22
+ "learning_rate": 4.834853544667631e-05,
23
+ "loss": 333.2925,
24
+ "nf_loss": 332.3035,
25
+ "ppl": 2.8873,
26
+ "step": 4000
27
+ },
28
+ {
29
+ "epoch": 0.06,
30
+ "eval_loss": 252.84817504882812,
31
+ "eval_nf_loss": 251.2677459716797,
32
+ "eval_perplexity": 6.363341331481934,
33
+ "eval_runtime": 4381.8372,
34
+ "eval_samples_per_second": 24.837,
35
+ "eval_steps_per_second": 1.552,
36
+ "step": 4000
37
+ },
38
+ {
39
+ "epoch": 0.1,
40
+ "gate_score": 0.1135,
41
+ "learning_rate": 4.6687102294439184e-05,
42
+ "loss": 211.0105,
43
+ "nf_loss": 209.9559,
44
+ "ppl": 3.1158,
45
+ "step": 6000
46
+ },
47
+ {
48
+ "epoch": 0.13,
49
+ "gate_score": 0.0666,
50
+ "learning_rate": 4.5025669142202066e-05,
51
+ "loss": 134.1864,
52
+ "nf_loss": 133.0995,
53
+ "ppl": 3.2307,
54
+ "step": 8000
55
+ },
56
+ {
57
+ "epoch": 0.13,
58
+ "eval_loss": 102.85702514648438,
59
+ "eval_nf_loss": 101.2651596069336,
60
+ "eval_perplexity": 6.448662757873535,
61
+ "eval_runtime": 4005.1516,
62
+ "eval_samples_per_second": 27.173,
63
+ "eval_steps_per_second": 1.698,
64
+ "step": 8000
65
+ },
66
+ {
67
+ "epoch": 0.16,
68
+ "gate_score": 0.0352,
69
+ "learning_rate": 4.3365897423117185e-05,
70
+ "loss": 95.272,
71
+ "nf_loss": 94.1643,
72
+ "ppl": 3.2978,
73
+ "step": 10000
74
+ },
75
+ {
76
+ "epoch": 0.19,
77
+ "gate_score": 0.023,
78
+ "learning_rate": 4.170446427088006e-05,
79
+ "loss": 68.625,
80
+ "nf_loss": 67.5129,
81
+ "ppl": 3.324,
82
+ "step": 12000
83
+ },
84
+ {
85
+ "epoch": 0.19,
86
+ "eval_loss": 53.623291015625,
87
+ "eval_nf_loss": 52.02442169189453,
88
+ "eval_perplexity": 6.536867618560791,
89
+ "eval_runtime": 3924.6092,
90
+ "eval_samples_per_second": 27.731,
91
+ "eval_steps_per_second": 1.733,
92
+ "step": 12000
93
+ },
94
+ {
95
+ "epoch": 0.23,
96
+ "gate_score": 0.023,
97
+ "learning_rate": 4.004386183521906e-05,
98
+ "loss": 49.4308,
99
+ "nf_loss": 48.3272,
100
+ "ppl": 3.2964,
101
+ "step": 14000
102
+ },
103
+ {
104
+ "epoch": 0.26,
105
+ "gate_score": 0.0219,
106
+ "learning_rate": 3.8383259399558055e-05,
107
+ "loss": 36.663,
108
+ "nf_loss": 35.5419,
109
+ "ppl": 3.3952,
110
+ "step": 16000
111
+ },
112
+ {
113
+ "epoch": 0.26,
114
+ "eval_loss": 29.72612762451172,
115
+ "eval_nf_loss": 28.141469955444336,
116
+ "eval_perplexity": 6.444383144378662,
117
+ "eval_runtime": 3904.4886,
118
+ "eval_samples_per_second": 27.874,
119
+ "eval_steps_per_second": 1.742,
120
+ "step": 16000
121
+ },
122
+ {
123
+ "epoch": 0.29,
124
+ "gate_score": 0.0183,
125
+ "learning_rate": 3.672265696389706e-05,
126
+ "loss": 30.1161,
127
+ "nf_loss": 29.0084,
128
+ "ppl": 3.3204,
129
+ "step": 18000
130
+ },
131
+ {
132
+ "epoch": 0.32,
133
+ "gate_score": 0.0188,
134
+ "learning_rate": 3.5061223811659935e-05,
135
+ "loss": 25.2734,
136
+ "nf_loss": 24.1661,
137
+ "ppl": 3.3019,
138
+ "step": 20000
139
+ },
140
+ {
141
+ "epoch": 0.32,
142
+ "eval_loss": 21.10334014892578,
143
+ "eval_nf_loss": 19.523048400878906,
144
+ "eval_perplexity": 6.385104656219482,
145
+ "eval_runtime": 3951.5734,
146
+ "eval_samples_per_second": 27.541,
147
+ "eval_steps_per_second": 1.721,
148
+ "step": 20000
149
+ },
150
+ {
151
+ "epoch": 0.35,
152
+ "gate_score": 0.0189,
153
+ "learning_rate": 3.340062137599894e-05,
154
+ "loss": 21.8002,
155
+ "nf_loss": 20.6893,
156
+ "ppl": 3.3211,
157
+ "step": 22000
158
+ },
159
+ {
160
+ "epoch": 0.39,
161
+ "gate_score": 0.0186,
162
+ "learning_rate": 3.1740018940337936e-05,
163
+ "loss": 19.5679,
164
+ "nf_loss": 18.4664,
165
+ "ppl": 3.2967,
166
+ "step": 24000
167
+ },
168
+ {
169
+ "epoch": 0.39,
170
+ "eval_loss": 16.757369995117188,
171
+ "eval_nf_loss": 15.157673835754395,
172
+ "eval_perplexity": 6.50905179977417,
173
+ "eval_runtime": 4060.1325,
174
+ "eval_samples_per_second": 26.805,
175
+ "eval_steps_per_second": 1.675,
176
+ "step": 24000
177
+ },
178
+ {
179
+ "epoch": 0.42,
180
+ "gate_score": 0.0169,
181
+ "learning_rate": 3.0079416504676937e-05,
182
+ "loss": 17.5505,
183
+ "nf_loss": 16.4471,
184
+ "ppl": 3.2866,
185
+ "step": 26000
186
+ },
187
+ {
188
+ "epoch": 0.45,
189
+ "gate_score": 0.0169,
190
+ "learning_rate": 2.8417983352439813e-05,
191
+ "loss": 16.0243,
192
+ "nf_loss": 14.9265,
193
+ "ppl": 3.2771,
194
+ "step": 28000
195
+ },
196
+ {
197
+ "epoch": 0.45,
198
+ "eval_loss": 13.916472434997559,
199
+ "eval_nf_loss": 12.336326599121094,
200
+ "eval_perplexity": 6.333822250366211,
201
+ "eval_runtime": 4073.3885,
202
+ "eval_samples_per_second": 26.718,
203
+ "eval_steps_per_second": 1.67,
204
+ "step": 28000
205
+ },
206
+ {
207
+ "epoch": 0.48,
208
+ "gate_score": 0.0164,
209
+ "learning_rate": 2.6757380916778813e-05,
210
+ "loss": 14.7704,
211
+ "nf_loss": 13.6695,
212
+ "ppl": 3.2909,
213
+ "step": 30000
214
+ },
215
+ {
216
+ "epoch": 0.51,
217
+ "gate_score": 0.0165,
218
+ "learning_rate": 2.509677848111781e-05,
219
+ "loss": 13.8487,
220
+ "nf_loss": 12.7566,
221
+ "ppl": 3.2604,
222
+ "step": 32000
223
+ },
224
+ {
225
+ "epoch": 0.51,
226
+ "eval_loss": 12.310139656066895,
227
+ "eval_nf_loss": 10.705780029296875,
228
+ "eval_perplexity": 6.516045570373535,
229
+ "eval_runtime": 3950.6921,
230
+ "eval_samples_per_second": 27.548,
231
+ "eval_steps_per_second": 1.722,
232
+ "step": 32000
233
+ },
234
+ {
235
+ "epoch": 0.55,
236
+ "gate_score": 0.0163,
237
+ "learning_rate": 2.3435345328880693e-05,
238
+ "loss": 13.021,
239
+ "nf_loss": 11.939,
240
+ "ppl": 3.2217,
241
+ "step": 34000
242
+ },
243
+ {
244
+ "epoch": 0.58,
245
+ "gate_score": 0.0164,
246
+ "learning_rate": 2.1774742893219694e-05,
247
+ "loss": 12.3656,
248
+ "nf_loss": 11.2731,
249
+ "ppl": 3.2491,
250
+ "step": 36000
251
+ },
252
+ {
253
+ "epoch": 0.58,
254
+ "eval_loss": 11.076099395751953,
255
+ "eval_nf_loss": 9.518845558166504,
256
+ "eval_perplexity": 6.16482400894165,
257
+ "eval_runtime": 3933.1054,
258
+ "eval_samples_per_second": 27.671,
259
+ "eval_steps_per_second": 1.729,
260
+ "step": 36000
261
+ },
262
+ {
263
+ "epoch": 0.61,
264
+ "gate_score": 0.0169,
265
+ "learning_rate": 2.0113309740982573e-05,
266
+ "loss": 11.834,
267
+ "nf_loss": 10.7422,
268
+ "ppl": 3.248,
269
+ "step": 38000
270
+ },
271
+ {
272
+ "epoch": 0.64,
273
+ "gate_score": 0.017,
274
+ "learning_rate": 1.845270730532157e-05,
275
+ "loss": 11.301,
276
+ "nf_loss": 10.2211,
277
+ "ppl": 3.2176,
278
+ "step": 40000
279
+ },
280
+ {
281
+ "epoch": 0.64,
282
+ "eval_loss": 9.942972183227539,
283
+ "eval_nf_loss": 8.422438621520996,
284
+ "eval_perplexity": 5.888382911682129,
285
+ "eval_runtime": 3905.9353,
286
+ "eval_samples_per_second": 27.863,
287
+ "eval_steps_per_second": 1.741,
288
+ "step": 40000
289
+ },
290
+ {
291
+ "epoch": 0.68,
292
+ "gate_score": 0.0178,
293
+ "learning_rate": 1.679293558623669e-05,
294
+ "loss": 10.826,
295
+ "nf_loss": 9.7634,
296
+ "ppl": 3.1432,
297
+ "step": 42000
298
+ },
299
+ {
300
+ "epoch": 0.71,
301
+ "gate_score": 0.0167,
302
+ "learning_rate": 1.513150243399957e-05,
303
+ "loss": 10.5722,
304
+ "nf_loss": 9.4924,
305
+ "ppl": 3.2112,
306
+ "step": 44000
307
+ },
308
+ {
309
+ "epoch": 0.71,
310
+ "eval_loss": 9.34610366821289,
311
+ "eval_nf_loss": 7.819733619689941,
312
+ "eval_perplexity": 5.929094314575195,
313
+ "eval_runtime": 3809.9667,
314
+ "eval_samples_per_second": 28.565,
315
+ "eval_steps_per_second": 1.785,
316
+ "step": 44000
317
+ },
318
+ {
319
+ "epoch": 0.74,
320
+ "gate_score": 0.0164,
321
+ "learning_rate": 1.3470899998338569e-05,
322
+ "loss": 10.1982,
323
+ "nf_loss": 9.1308,
324
+ "ppl": 3.1538,
325
+ "step": 46000
326
+ },
327
+ {
328
+ "epoch": 0.77,
329
+ "gate_score": 0.0156,
330
+ "learning_rate": 1.1810297562677566e-05,
331
+ "loss": 9.9637,
332
+ "nf_loss": 8.9113,
333
+ "ppl": 3.1019,
334
+ "step": 48000
335
+ },
336
+ {
337
+ "epoch": 0.77,
338
+ "eval_loss": 9.272148132324219,
339
+ "eval_nf_loss": 7.770688056945801,
340
+ "eval_perplexity": 5.756178855895996,
341
+ "eval_runtime": 3799.2449,
342
+ "eval_samples_per_second": 28.646,
343
+ "eval_steps_per_second": 1.79,
344
+ "step": 48000
345
+ },
346
+ {
347
+ "epoch": 0.8,
348
+ "gate_score": 0.0159,
349
+ "learning_rate": 1.0148864410440447e-05,
350
+ "loss": 9.7627,
351
+ "nf_loss": 8.701,
352
+ "ppl": 3.1339,
353
+ "step": 50000
354
+ },
355
+ {
356
+ "epoch": 0.84,
357
+ "gate_score": 0.015,
358
+ "learning_rate": 8.488261974779444e-06,
359
+ "loss": 9.5228,
360
+ "nf_loss": 8.4605,
361
+ "ppl": 3.1399,
362
+ "step": 52000
363
+ },
364
+ {
365
+ "epoch": 0.84,
366
+ "eval_loss": 8.568826675415039,
367
+ "eval_nf_loss": 7.074533462524414,
368
+ "eval_perplexity": 5.70849609375,
369
+ "eval_runtime": 3805.5723,
370
+ "eval_samples_per_second": 28.598,
371
+ "eval_steps_per_second": 1.787,
372
+ "step": 52000
373
+ },
374
+ {
375
+ "epoch": 0.87,
376
+ "gate_score": 0.0143,
377
+ "learning_rate": 6.827659539118445e-06,
378
+ "loss": 9.3407,
379
+ "nf_loss": 8.2955,
380
+ "ppl": 3.0762,
381
+ "step": 54000
382
+ },
383
+ {
384
+ "epoch": 0.9,
385
+ "gate_score": 0.0148,
386
+ "learning_rate": 5.167057103457443e-06,
387
+ "loss": 9.1988,
388
+ "nf_loss": 8.1602,
389
+ "ppl": 3.058,
390
+ "step": 56000
391
+ },
392
+ {
393
+ "epoch": 0.9,
394
+ "eval_loss": 8.280665397644043,
395
+ "eval_nf_loss": 6.778278827667236,
396
+ "eval_perplexity": 5.754031658172607,
397
+ "eval_runtime": 3795.1994,
398
+ "eval_samples_per_second": 28.676,
399
+ "eval_steps_per_second": 1.792,
400
+ "step": 56000
401
+ },
402
+ {
403
+ "epoch": 0.93,
404
+ "gate_score": 0.0146,
405
+ "learning_rate": 3.5056239512203227e-06,
406
+ "loss": 9.0572,
407
+ "nf_loss": 8.0245,
408
+ "ppl": 3.0413,
409
+ "step": 58000
410
+ },
411
+ {
412
+ "epoch": 0.96,
413
+ "gate_score": 0.0138,
414
+ "learning_rate": 1.8450215155593216e-06,
415
+ "loss": 9.0079,
416
+ "nf_loss": 7.9878,
417
+ "ppl": 3.002,
418
+ "step": 60000
419
+ },
420
+ {
421
+ "epoch": 0.96,
422
+ "eval_loss": 8.222879409790039,
423
+ "eval_nf_loss": 6.735788822174072,
424
+ "eval_perplexity": 5.6603827476501465,
425
+ "eval_runtime": 3672.6524,
426
+ "eval_samples_per_second": 29.633,
427
+ "eval_steps_per_second": 1.852,
428
+ "step": 60000
429
+ }
430
+ ],
431
+ "max_steps": 62189,
432
+ "num_train_epochs": 1,
433
+ "total_flos": 1.06597659967488e+18,
434
+ "trial_name": null,
435
+ "trial_params": null
436
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff