jordypg commited on
Commit
bda90fa
·
verified ·
1 Parent(s): 5228468

Upload 12 files

Browse files
checkpoint-4000/config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/pegasus-x-large",
3
+ "activation_dropout": 0.1,
4
+ "activation_function": "relu",
5
+ "add_bias_logits": false,
6
+ "add_final_layer_norm": true,
7
+ "architectures": [
8
+ "PegasusXForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "block_size": 512,
12
+ "bos_token_id": 0,
13
+ "classif_dropout": 0.0,
14
+ "classifier_dropout": 0.0,
15
+ "d_model": 1024,
16
+ "decoder_attention_heads": 16,
17
+ "decoder_ffn_dim": 4096,
18
+ "decoder_layerdrop": 0.0,
19
+ "decoder_layers": 16,
20
+ "decoder_start_token_id": 0,
21
+ "dropout": 0.1,
22
+ "encoder_attention_heads": 16,
23
+ "encoder_ffn_dim": 4096,
24
+ "encoder_layerdrop": 0.0,
25
+ "encoder_layers": 16,
26
+ "eos_token_id": 1,
27
+ "extra_pos_embeddings": 1,
28
+ "force_bos_token_to_be_generated": false,
29
+ "forced_eos_token_id": 1,
30
+ "gradient_checkpointing": false,
31
+ "id2label": {
32
+ "0": "LABEL_0",
33
+ "1": "LABEL_1",
34
+ "2": "LABEL_2"
35
+ },
36
+ "init_std": 0.02,
37
+ "is_encoder_decoder": true,
38
+ "label2id": {
39
+ "LABEL_0": 0,
40
+ "LABEL_1": 1,
41
+ "LABEL_2": 2
42
+ },
43
+ "length_penalty": 0.8,
44
+ "max_length": 16384,
45
+ "max_position_embeddings": 16384,
46
+ "model_type": "pegasus_x",
47
+ "normalize_before": true,
48
+ "normalize_embedding": false,
49
+ "num_beams": 8,
50
+ "num_global_tokens": 128,
51
+ "num_hidden_layers": 16,
52
+ "pad_token_id": 0,
53
+ "scale_embedding": true,
54
+ "stagger_local_blocks": true,
55
+ "static_position_embeddings": true,
56
+ "torch_dtype": "float32",
57
+ "transformers_version": "4.27.4",
58
+ "use_cache": true,
59
+ "vocab_size": 96103
60
+ }
checkpoint-4000/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 0,
5
+ "eos_token_id": 1,
6
+ "forced_eos_token_id": 1,
7
+ "length_penalty": 0.8,
8
+ "max_length": 16384,
9
+ "num_beams": 8,
10
+ "pad_token_id": 0,
11
+ "transformers_version": "4.27.4"
12
+ }
checkpoint-4000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dbef991e1dec31549bed0fdc66dba715c068d901d980c64250d2f60caa00c9d
3
+ size 4549646261
checkpoint-4000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b34ca2ca0d3ec5c0e317aa3f77f5299a6ffdbf0a38f1f7348a8346b766c9a377
3
+ size 2274845861
checkpoint-4000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72f0576613c85f811dd17320e32a76f84e81457422ba7f884392ba925249fe28
3
+ size 14575
checkpoint-4000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7f4b5865bdfabcedab071f6ca08201dd914f54a5b590846b5688db29c7216c5
3
+ size 627
checkpoint-4000/special_tokens_map.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<mask_1>",
4
+ "<unk_2>",
5
+ "<unk_3>",
6
+ "<unk_4>",
7
+ "<unk_5>",
8
+ "<unk_6>",
9
+ "<unk_7>",
10
+ "<unk_8>",
11
+ "<unk_9>",
12
+ "<unk_10>",
13
+ "<unk_11>",
14
+ "<unk_12>",
15
+ "<unk_13>",
16
+ "<unk_14>",
17
+ "<unk_15>",
18
+ "<unk_16>",
19
+ "<unk_17>",
20
+ "<unk_18>",
21
+ "<unk_19>",
22
+ "<unk_20>",
23
+ "<unk_21>",
24
+ "<unk_22>",
25
+ "<unk_23>",
26
+ "<unk_24>",
27
+ "<unk_25>",
28
+ "<unk_26>",
29
+ "<unk_27>",
30
+ "<unk_28>",
31
+ "<unk_29>",
32
+ "<unk_30>",
33
+ "<unk_31>",
34
+ "<unk_32>",
35
+ "<unk_33>",
36
+ "<unk_34>",
37
+ "<unk_35>",
38
+ "<unk_36>",
39
+ "<unk_37>",
40
+ "<unk_38>",
41
+ "<unk_39>",
42
+ "<unk_40>",
43
+ "<unk_41>",
44
+ "<unk_42>",
45
+ "<unk_43>",
46
+ "<unk_44>",
47
+ "<unk_45>",
48
+ "<unk_46>",
49
+ "<unk_47>",
50
+ "<unk_48>",
51
+ "<unk_49>",
52
+ "<unk_50>",
53
+ "<unk_51>",
54
+ "<unk_52>",
55
+ "<unk_53>",
56
+ "<unk_54>",
57
+ "<unk_55>",
58
+ "<unk_56>",
59
+ "<unk_57>",
60
+ "<unk_58>",
61
+ "<unk_59>",
62
+ "<unk_60>",
63
+ "<unk_61>",
64
+ "<unk_62>",
65
+ "<unk_63>",
66
+ "<unk_64>",
67
+ "<unk_65>",
68
+ "<unk_66>",
69
+ "<unk_67>",
70
+ "<unk_68>",
71
+ "<unk_69>",
72
+ "<unk_70>",
73
+ "<unk_71>",
74
+ "<unk_72>",
75
+ "<unk_73>",
76
+ "<unk_74>",
77
+ "<unk_75>",
78
+ "<unk_76>",
79
+ "<unk_77>",
80
+ "<unk_78>",
81
+ "<unk_79>",
82
+ "<unk_80>",
83
+ "<unk_81>",
84
+ "<unk_82>",
85
+ "<unk_83>",
86
+ "<unk_84>",
87
+ "<unk_85>",
88
+ "<unk_86>",
89
+ "<unk_87>",
90
+ "<unk_88>",
91
+ "<unk_89>",
92
+ "<unk_90>",
93
+ "<unk_91>",
94
+ "<unk_92>",
95
+ "<unk_93>",
96
+ "<unk_94>",
97
+ "<unk_95>",
98
+ "<unk_96>",
99
+ "<unk_97>",
100
+ "<unk_98>",
101
+ "<unk_99>",
102
+ "<unk_100>",
103
+ "<unk_101>",
104
+ "<unk_102>"
105
+ ],
106
+ "eos_token": "</s>",
107
+ "mask_token": "<mask_2>",
108
+ "pad_token": "<pad>",
109
+ "unk_token": "<unk>"
110
+ }
checkpoint-4000/spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0015189ef36359283fec8b93cf6d9ce51bca37eb1101defc68a53b394913b96c
3
+ size 1912529
checkpoint-4000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-4000/tokenizer_config.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<mask_1>",
4
+ "<unk_2>",
5
+ "<unk_3>",
6
+ "<unk_4>",
7
+ "<unk_5>",
8
+ "<unk_6>",
9
+ "<unk_7>",
10
+ "<unk_8>",
11
+ "<unk_9>",
12
+ "<unk_10>",
13
+ "<unk_11>",
14
+ "<unk_12>",
15
+ "<unk_13>",
16
+ "<unk_14>",
17
+ "<unk_15>",
18
+ "<unk_16>",
19
+ "<unk_17>",
20
+ "<unk_18>",
21
+ "<unk_19>",
22
+ "<unk_20>",
23
+ "<unk_21>",
24
+ "<unk_22>",
25
+ "<unk_23>",
26
+ "<unk_24>",
27
+ "<unk_25>",
28
+ "<unk_26>",
29
+ "<unk_27>",
30
+ "<unk_28>",
31
+ "<unk_29>",
32
+ "<unk_30>",
33
+ "<unk_31>",
34
+ "<unk_32>",
35
+ "<unk_33>",
36
+ "<unk_34>",
37
+ "<unk_35>",
38
+ "<unk_36>",
39
+ "<unk_37>",
40
+ "<unk_38>",
41
+ "<unk_39>",
42
+ "<unk_40>",
43
+ "<unk_41>",
44
+ "<unk_42>",
45
+ "<unk_43>",
46
+ "<unk_44>",
47
+ "<unk_45>",
48
+ "<unk_46>",
49
+ "<unk_47>",
50
+ "<unk_48>",
51
+ "<unk_49>",
52
+ "<unk_50>",
53
+ "<unk_51>",
54
+ "<unk_52>",
55
+ "<unk_53>",
56
+ "<unk_54>",
57
+ "<unk_55>",
58
+ "<unk_56>",
59
+ "<unk_57>",
60
+ "<unk_58>",
61
+ "<unk_59>",
62
+ "<unk_60>",
63
+ "<unk_61>",
64
+ "<unk_62>",
65
+ "<unk_63>",
66
+ "<unk_64>",
67
+ "<unk_65>",
68
+ "<unk_66>",
69
+ "<unk_67>",
70
+ "<unk_68>",
71
+ "<unk_69>",
72
+ "<unk_70>",
73
+ "<unk_71>",
74
+ "<unk_72>",
75
+ "<unk_73>",
76
+ "<unk_74>",
77
+ "<unk_75>",
78
+ "<unk_76>",
79
+ "<unk_77>",
80
+ "<unk_78>",
81
+ "<unk_79>",
82
+ "<unk_80>",
83
+ "<unk_81>",
84
+ "<unk_82>",
85
+ "<unk_83>",
86
+ "<unk_84>",
87
+ "<unk_85>",
88
+ "<unk_86>",
89
+ "<unk_87>",
90
+ "<unk_88>",
91
+ "<unk_89>",
92
+ "<unk_90>",
93
+ "<unk_91>",
94
+ "<unk_92>",
95
+ "<unk_93>",
96
+ "<unk_94>",
97
+ "<unk_95>",
98
+ "<unk_96>",
99
+ "<unk_97>",
100
+ "<unk_98>",
101
+ "<unk_99>",
102
+ "<unk_100>",
103
+ "<unk_101>",
104
+ "<unk_102>"
105
+ ],
106
+ "eos_token": "</s>",
107
+ "full_tokenizer_file": null,
108
+ "mask_token": "<mask_2>",
109
+ "mask_token_sent": "<mask_1>",
110
+ "model_max_length": 1024,
111
+ "offset": 103,
112
+ "pad_token": "<pad>",
113
+ "sp_model_kwargs": {},
114
+ "special_tokens_map_file": null,
115
+ "tokenizer_class": "PegasusTokenizer",
116
+ "unk_token": "<unk>"
117
+ }
checkpoint-4000/trainer_state.json ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 7.952286282306163,
5
+ "global_step": 4000,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.2,
12
+ "learning_rate": 1e-05,
13
+ "loss": 9.1779,
14
+ "step": 100
15
+ },
16
+ {
17
+ "epoch": 0.2,
18
+ "eval_loss": 3.894397258758545,
19
+ "eval_runtime": 33.2081,
20
+ "eval_samples_per_second": 11.774,
21
+ "eval_steps_per_second": 1.686,
22
+ "step": 100
23
+ },
24
+ {
25
+ "epoch": 0.4,
26
+ "learning_rate": 2e-05,
27
+ "loss": 1.9391,
28
+ "step": 200
29
+ },
30
+ {
31
+ "epoch": 0.4,
32
+ "eval_loss": 1.7052955627441406,
33
+ "eval_runtime": 33.2014,
34
+ "eval_samples_per_second": 11.777,
35
+ "eval_steps_per_second": 1.687,
36
+ "step": 200
37
+ },
38
+ {
39
+ "epoch": 0.6,
40
+ "learning_rate": 3e-05,
41
+ "loss": 1.4896,
42
+ "step": 300
43
+ },
44
+ {
45
+ "epoch": 0.6,
46
+ "eval_loss": 1.597261667251587,
47
+ "eval_runtime": 33.2105,
48
+ "eval_samples_per_second": 11.773,
49
+ "eval_steps_per_second": 1.686,
50
+ "step": 300
51
+ },
52
+ {
53
+ "epoch": 0.8,
54
+ "learning_rate": 4e-05,
55
+ "loss": 1.4279,
56
+ "step": 400
57
+ },
58
+ {
59
+ "epoch": 0.8,
60
+ "eval_loss": 1.559727668762207,
61
+ "eval_runtime": 33.1972,
62
+ "eval_samples_per_second": 11.778,
63
+ "eval_steps_per_second": 1.687,
64
+ "step": 400
65
+ },
66
+ {
67
+ "epoch": 0.99,
68
+ "learning_rate": 5e-05,
69
+ "loss": 1.3676,
70
+ "step": 500
71
+ },
72
+ {
73
+ "epoch": 0.99,
74
+ "eval_loss": 1.5377521514892578,
75
+ "eval_runtime": 33.1973,
76
+ "eval_samples_per_second": 11.778,
77
+ "eval_steps_per_second": 1.687,
78
+ "step": 500
79
+ },
80
+ {
81
+ "epoch": 1.19,
82
+ "learning_rate": 4.965729952021933e-05,
83
+ "loss": 1.3666,
84
+ "step": 600
85
+ },
86
+ {
87
+ "epoch": 1.19,
88
+ "eval_loss": 1.5202970504760742,
89
+ "eval_runtime": 33.2,
90
+ "eval_samples_per_second": 11.777,
91
+ "eval_steps_per_second": 1.687,
92
+ "step": 600
93
+ },
94
+ {
95
+ "epoch": 1.39,
96
+ "learning_rate": 4.9314599040438655e-05,
97
+ "loss": 1.3432,
98
+ "step": 700
99
+ },
100
+ {
101
+ "epoch": 1.39,
102
+ "eval_loss": 1.5091770887374878,
103
+ "eval_runtime": 33.2069,
104
+ "eval_samples_per_second": 11.775,
105
+ "eval_steps_per_second": 1.686,
106
+ "step": 700
107
+ },
108
+ {
109
+ "epoch": 1.59,
110
+ "learning_rate": 4.8971898560657985e-05,
111
+ "loss": 1.3226,
112
+ "step": 800
113
+ },
114
+ {
115
+ "epoch": 1.59,
116
+ "eval_loss": 1.5007234811782837,
117
+ "eval_runtime": 33.2119,
118
+ "eval_samples_per_second": 11.773,
119
+ "eval_steps_per_second": 1.686,
120
+ "step": 800
121
+ },
122
+ {
123
+ "epoch": 1.79,
124
+ "learning_rate": 4.8629198080877314e-05,
125
+ "loss": 1.3145,
126
+ "step": 900
127
+ },
128
+ {
129
+ "epoch": 1.79,
130
+ "eval_loss": 1.4917149543762207,
131
+ "eval_runtime": 33.2096,
132
+ "eval_samples_per_second": 11.774,
133
+ "eval_steps_per_second": 1.686,
134
+ "step": 900
135
+ },
136
+ {
137
+ "epoch": 1.99,
138
+ "learning_rate": 4.8286497601096644e-05,
139
+ "loss": 1.2897,
140
+ "step": 1000
141
+ },
142
+ {
143
+ "epoch": 1.99,
144
+ "eval_loss": 1.487337350845337,
145
+ "eval_runtime": 33.2063,
146
+ "eval_samples_per_second": 11.775,
147
+ "eval_steps_per_second": 1.686,
148
+ "step": 1000
149
+ },
150
+ {
151
+ "epoch": 2.19,
152
+ "learning_rate": 4.794379712131597e-05,
153
+ "loss": 1.2956,
154
+ "step": 1100
155
+ },
156
+ {
157
+ "epoch": 2.19,
158
+ "eval_loss": 1.481351613998413,
159
+ "eval_runtime": 33.2118,
160
+ "eval_samples_per_second": 11.773,
161
+ "eval_steps_per_second": 1.686,
162
+ "step": 1100
163
+ },
164
+ {
165
+ "epoch": 2.39,
166
+ "learning_rate": 4.76010966415353e-05,
167
+ "loss": 1.2791,
168
+ "step": 1200
169
+ },
170
+ {
171
+ "epoch": 2.39,
172
+ "eval_loss": 1.4768741130828857,
173
+ "eval_runtime": 33.2069,
174
+ "eval_samples_per_second": 11.775,
175
+ "eval_steps_per_second": 1.686,
176
+ "step": 1200
177
+ },
178
+ {
179
+ "epoch": 2.58,
180
+ "learning_rate": 4.725839616175463e-05,
181
+ "loss": 1.2679,
182
+ "step": 1300
183
+ },
184
+ {
185
+ "epoch": 2.58,
186
+ "eval_loss": 1.4751813411712646,
187
+ "eval_runtime": 33.1963,
188
+ "eval_samples_per_second": 11.778,
189
+ "eval_steps_per_second": 1.687,
190
+ "step": 1300
191
+ },
192
+ {
193
+ "epoch": 2.78,
194
+ "learning_rate": 4.6915695681973956e-05,
195
+ "loss": 1.2583,
196
+ "step": 1400
197
+ },
198
+ {
199
+ "epoch": 2.78,
200
+ "eval_loss": 1.4700709581375122,
201
+ "eval_runtime": 33.2157,
202
+ "eval_samples_per_second": 11.772,
203
+ "eval_steps_per_second": 1.686,
204
+ "step": 1400
205
+ },
206
+ {
207
+ "epoch": 2.98,
208
+ "learning_rate": 4.6572995202193286e-05,
209
+ "loss": 1.2813,
210
+ "step": 1500
211
+ },
212
+ {
213
+ "epoch": 2.98,
214
+ "eval_loss": 1.4654401540756226,
215
+ "eval_runtime": 33.205,
216
+ "eval_samples_per_second": 11.775,
217
+ "eval_steps_per_second": 1.686,
218
+ "step": 1500
219
+ },
220
+ {
221
+ "epoch": 3.18,
222
+ "learning_rate": 4.6230294722412615e-05,
223
+ "loss": 1.25,
224
+ "step": 1600
225
+ },
226
+ {
227
+ "epoch": 3.18,
228
+ "eval_loss": 1.4636775255203247,
229
+ "eval_runtime": 33.207,
230
+ "eval_samples_per_second": 11.775,
231
+ "eval_steps_per_second": 1.686,
232
+ "step": 1600
233
+ },
234
+ {
235
+ "epoch": 3.38,
236
+ "learning_rate": 4.5887594242631945e-05,
237
+ "loss": 1.2393,
238
+ "step": 1700
239
+ },
240
+ {
241
+ "epoch": 3.38,
242
+ "eval_loss": 1.4608376026153564,
243
+ "eval_runtime": 33.2094,
244
+ "eval_samples_per_second": 11.774,
245
+ "eval_steps_per_second": 1.686,
246
+ "step": 1700
247
+ },
248
+ {
249
+ "epoch": 3.58,
250
+ "learning_rate": 4.554489376285127e-05,
251
+ "loss": 1.2599,
252
+ "step": 1800
253
+ },
254
+ {
255
+ "epoch": 3.58,
256
+ "eval_loss": 1.4582923650741577,
257
+ "eval_runtime": 33.2003,
258
+ "eval_samples_per_second": 11.777,
259
+ "eval_steps_per_second": 1.687,
260
+ "step": 1800
261
+ },
262
+ {
263
+ "epoch": 3.78,
264
+ "learning_rate": 4.52021932830706e-05,
265
+ "loss": 1.2336,
266
+ "step": 1900
267
+ },
268
+ {
269
+ "epoch": 3.78,
270
+ "eval_loss": 1.455262303352356,
271
+ "eval_runtime": 33.2047,
272
+ "eval_samples_per_second": 11.775,
273
+ "eval_steps_per_second": 1.687,
274
+ "step": 1900
275
+ },
276
+ {
277
+ "epoch": 3.98,
278
+ "learning_rate": 4.485949280328993e-05,
279
+ "loss": 1.2374,
280
+ "step": 2000
281
+ },
282
+ {
283
+ "epoch": 3.98,
284
+ "eval_loss": 1.4536148309707642,
285
+ "eval_runtime": 33.2036,
286
+ "eval_samples_per_second": 11.776,
287
+ "eval_steps_per_second": 1.687,
288
+ "step": 2000
289
+ },
290
+ {
291
+ "epoch": 4.17,
292
+ "learning_rate": 4.451679232350926e-05,
293
+ "loss": 1.2171,
294
+ "step": 2100
295
+ },
296
+ {
297
+ "epoch": 4.17,
298
+ "eval_loss": 1.4514210224151611,
299
+ "eval_runtime": 33.2199,
300
+ "eval_samples_per_second": 11.77,
301
+ "eval_steps_per_second": 1.686,
302
+ "step": 2100
303
+ },
304
+ {
305
+ "epoch": 4.37,
306
+ "learning_rate": 4.417409184372858e-05,
307
+ "loss": 1.2154,
308
+ "step": 2200
309
+ },
310
+ {
311
+ "epoch": 4.37,
312
+ "eval_loss": 1.450243592262268,
313
+ "eval_runtime": 33.2384,
314
+ "eval_samples_per_second": 11.763,
315
+ "eval_steps_per_second": 1.685,
316
+ "step": 2200
317
+ },
318
+ {
319
+ "epoch": 4.57,
320
+ "learning_rate": 4.383139136394791e-05,
321
+ "loss": 1.2247,
322
+ "step": 2300
323
+ },
324
+ {
325
+ "epoch": 4.57,
326
+ "eval_loss": 1.4490052461624146,
327
+ "eval_runtime": 33.2063,
328
+ "eval_samples_per_second": 11.775,
329
+ "eval_steps_per_second": 1.686,
330
+ "step": 2300
331
+ },
332
+ {
333
+ "epoch": 4.77,
334
+ "learning_rate": 4.348869088416724e-05,
335
+ "loss": 1.2234,
336
+ "step": 2400
337
+ },
338
+ {
339
+ "epoch": 4.77,
340
+ "eval_loss": 1.4472484588623047,
341
+ "eval_runtime": 33.2181,
342
+ "eval_samples_per_second": 11.771,
343
+ "eval_steps_per_second": 1.686,
344
+ "step": 2400
345
+ },
346
+ {
347
+ "epoch": 4.97,
348
+ "learning_rate": 4.314599040438657e-05,
349
+ "loss": 1.2152,
350
+ "step": 2500
351
+ },
352
+ {
353
+ "epoch": 4.97,
354
+ "eval_loss": 1.4455540180206299,
355
+ "eval_runtime": 33.205,
356
+ "eval_samples_per_second": 11.775,
357
+ "eval_steps_per_second": 1.686,
358
+ "step": 2500
359
+ },
360
+ {
361
+ "epoch": 5.17,
362
+ "learning_rate": 4.280328992460589e-05,
363
+ "loss": 1.1883,
364
+ "step": 2600
365
+ },
366
+ {
367
+ "epoch": 5.17,
368
+ "eval_loss": 1.4456433057785034,
369
+ "eval_runtime": 33.2111,
370
+ "eval_samples_per_second": 11.773,
371
+ "eval_steps_per_second": 1.686,
372
+ "step": 2600
373
+ },
374
+ {
375
+ "epoch": 5.37,
376
+ "learning_rate": 4.246058944482523e-05,
377
+ "loss": 1.1947,
378
+ "step": 2700
379
+ },
380
+ {
381
+ "epoch": 5.37,
382
+ "eval_loss": 1.4448508024215698,
383
+ "eval_runtime": 33.1978,
384
+ "eval_samples_per_second": 11.778,
385
+ "eval_steps_per_second": 1.687,
386
+ "step": 2700
387
+ },
388
+ {
389
+ "epoch": 5.57,
390
+ "learning_rate": 4.211788896504456e-05,
391
+ "loss": 1.2127,
392
+ "step": 2800
393
+ },
394
+ {
395
+ "epoch": 5.57,
396
+ "eval_loss": 1.4433377981185913,
397
+ "eval_runtime": 33.2104,
398
+ "eval_samples_per_second": 11.773,
399
+ "eval_steps_per_second": 1.686,
400
+ "step": 2800
401
+ },
402
+ {
403
+ "epoch": 5.77,
404
+ "learning_rate": 4.177518848526388e-05,
405
+ "loss": 1.1959,
406
+ "step": 2900
407
+ },
408
+ {
409
+ "epoch": 5.77,
410
+ "eval_loss": 1.4412949085235596,
411
+ "eval_runtime": 33.2197,
412
+ "eval_samples_per_second": 11.77,
413
+ "eval_steps_per_second": 1.686,
414
+ "step": 2900
415
+ },
416
+ {
417
+ "epoch": 5.96,
418
+ "learning_rate": 4.143248800548321e-05,
419
+ "loss": 1.1913,
420
+ "step": 3000
421
+ },
422
+ {
423
+ "epoch": 5.96,
424
+ "eval_loss": 1.4401354789733887,
425
+ "eval_runtime": 33.21,
426
+ "eval_samples_per_second": 11.774,
427
+ "eval_steps_per_second": 1.686,
428
+ "step": 3000
429
+ },
430
+ {
431
+ "epoch": 6.16,
432
+ "learning_rate": 4.108978752570254e-05,
433
+ "loss": 1.1966,
434
+ "step": 3100
435
+ },
436
+ {
437
+ "epoch": 6.16,
438
+ "eval_loss": 1.4406956434249878,
439
+ "eval_runtime": 33.2007,
440
+ "eval_samples_per_second": 11.777,
441
+ "eval_steps_per_second": 1.687,
442
+ "step": 3100
443
+ },
444
+ {
445
+ "epoch": 6.36,
446
+ "learning_rate": 4.0747087045921863e-05,
447
+ "loss": 1.1798,
448
+ "step": 3200
449
+ },
450
+ {
451
+ "epoch": 6.36,
452
+ "eval_loss": 1.4382696151733398,
453
+ "eval_runtime": 33.2065,
454
+ "eval_samples_per_second": 11.775,
455
+ "eval_steps_per_second": 1.686,
456
+ "step": 3200
457
+ },
458
+ {
459
+ "epoch": 6.56,
460
+ "learning_rate": 4.040438656614119e-05,
461
+ "loss": 1.1736,
462
+ "step": 3300
463
+ },
464
+ {
465
+ "epoch": 6.56,
466
+ "eval_loss": 1.4385173320770264,
467
+ "eval_runtime": 33.2237,
468
+ "eval_samples_per_second": 11.769,
469
+ "eval_steps_per_second": 1.686,
470
+ "step": 3300
471
+ },
472
+ {
473
+ "epoch": 6.76,
474
+ "learning_rate": 4.006168608636052e-05,
475
+ "loss": 1.1661,
476
+ "step": 3400
477
+ },
478
+ {
479
+ "epoch": 6.76,
480
+ "eval_loss": 1.4370990991592407,
481
+ "eval_runtime": 33.2081,
482
+ "eval_samples_per_second": 11.774,
483
+ "eval_steps_per_second": 1.686,
484
+ "step": 3400
485
+ },
486
+ {
487
+ "epoch": 6.96,
488
+ "learning_rate": 3.971898560657985e-05,
489
+ "loss": 1.1802,
490
+ "step": 3500
491
+ },
492
+ {
493
+ "epoch": 6.96,
494
+ "eval_loss": 1.4352325201034546,
495
+ "eval_runtime": 33.2067,
496
+ "eval_samples_per_second": 11.775,
497
+ "eval_steps_per_second": 1.686,
498
+ "step": 3500
499
+ },
500
+ {
501
+ "epoch": 7.16,
502
+ "learning_rate": 3.9376285126799175e-05,
503
+ "loss": 1.1649,
504
+ "step": 3600
505
+ },
506
+ {
507
+ "epoch": 7.16,
508
+ "eval_loss": 1.436591386795044,
509
+ "eval_runtime": 33.2379,
510
+ "eval_samples_per_second": 11.764,
511
+ "eval_steps_per_second": 1.685,
512
+ "step": 3600
513
+ },
514
+ {
515
+ "epoch": 7.36,
516
+ "learning_rate": 3.9033584647018505e-05,
517
+ "loss": 1.1654,
518
+ "step": 3700
519
+ },
520
+ {
521
+ "epoch": 7.36,
522
+ "eval_loss": 1.4353548288345337,
523
+ "eval_runtime": 33.2162,
524
+ "eval_samples_per_second": 11.771,
525
+ "eval_steps_per_second": 1.686,
526
+ "step": 3700
527
+ },
528
+ {
529
+ "epoch": 7.55,
530
+ "learning_rate": 3.8690884167237835e-05,
531
+ "loss": 1.1604,
532
+ "step": 3800
533
+ },
534
+ {
535
+ "epoch": 7.55,
536
+ "eval_loss": 1.4346917867660522,
537
+ "eval_runtime": 33.2322,
538
+ "eval_samples_per_second": 11.766,
539
+ "eval_steps_per_second": 1.685,
540
+ "step": 3800
541
+ },
542
+ {
543
+ "epoch": 7.75,
544
+ "learning_rate": 3.8348183687457165e-05,
545
+ "loss": 1.1555,
546
+ "step": 3900
547
+ },
548
+ {
549
+ "epoch": 7.75,
550
+ "eval_loss": 1.4331263303756714,
551
+ "eval_runtime": 33.2124,
552
+ "eval_samples_per_second": 11.773,
553
+ "eval_steps_per_second": 1.686,
554
+ "step": 3900
555
+ },
556
+ {
557
+ "epoch": 7.95,
558
+ "learning_rate": 3.800548320767649e-05,
559
+ "loss": 1.1617,
560
+ "step": 4000
561
+ },
562
+ {
563
+ "epoch": 7.95,
564
+ "eval_loss": 1.4325422048568726,
565
+ "eval_runtime": 33.2006,
566
+ "eval_samples_per_second": 11.777,
567
+ "eval_steps_per_second": 1.687,
568
+ "step": 4000
569
+ }
570
+ ],
571
+ "max_steps": 15090,
572
+ "num_train_epochs": 30,
573
+ "total_flos": 8.075550264813158e+16,
574
+ "trial_name": null,
575
+ "trial_params": null
576
+ }
checkpoint-4000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73a3e18b94ee595e0628e947ea0948c27be0e6b9473c64a1eaa48716158a7d3e
3
+ size 3515