robertou2 committed on
Commit 69bcd4e · verified · 1 Parent(s): bf507f7

Upload folder using huggingface_hub
adapter_config.json CHANGED
@@ -10,20 +10,20 @@
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
-  "lora_alpha": 16,
-  "lora_dropout": 0.05,
+  "lora_alpha": 64,
+  "lora_dropout": 0.0001,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
-  "r": 8,
+  "r": 32,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "o_proj",
-    "gate_up_proj",
+    "down_proj",
     "qkv_proj",
-    "down_proj"
+    "gate_up_proj",
+    "o_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:09dd6cf05506f7922ad70f74fe6934c8336e697e67448b49f05a9b4a7a6566de
-size 50365768
+oid sha256:170e9283396f794ac39c141ef58fc732a915723bcc48acda06109764aede853c
+size 201361312
added_tokens.json CHANGED
@@ -2,7 +2,6 @@
   "<|assistant|>": 32001,
   "<|endoftext|>": 32000,
   "<|end|>": 32007,
-  "<|pad|>": 32011,
   "<|placeholder1|>": 32002,
   "<|placeholder2|>": 32003,
   "<|placeholder3|>": 32004,
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:31c2c06a3aa791be14685b5d59899a877c559219322620a519bd1b84ef483910
-size 100878458
+oid sha256:aba8d5f3fb425d691ad00a11ff612f2c4ce2ef2f2350b1aa78ac024098d151a6
+size 402868986
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:936b713051d3954452ec3bf4371217942ece2f8826b34ec15d76739514f6eb2c
+oid sha256:3df287feaf25c6bbc3e39d1e8402382f635590ca96adbe728944eb6f0edd1fc9
 size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a492b6d65e6e851d97e2025279f088a6a581867a0b4272350b086ae52aef4d06
+oid sha256:50d23b4f208a9403528cc4590d75da0ba9842779b9cd25a1b5978ffbe9bcceb1
 size 1064
special_tokens_map.json CHANGED
@@ -7,14 +7,14 @@
     "single_word": false
   },
   "eos_token": {
-    "content": "<|end|>",
+    "content": "<|endoftext|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "pad_token": {
-    "content": "<|pad|>",
+    "content": "<|endoftext|>",
    "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
@@ -1,21 +1,7 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 2048,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
-  "padding": {
-    "strategy": {
-      "Fixed": 2048
-    },
-    "direction": "Left",
-    "pad_to_multiple_of": null,
-    "pad_id": 32011,
-    "pad_type_id": 0,
-    "pad_token": "<|pad|>"
-  },
+  "truncation": null,
+  "padding": null,
   "added_tokens": [
     {
       "id": 0,
@@ -112,7 +98,7 @@
       "content": "<|end|>",
       "single_word": false,
       "lstrip": false,
-      "rstrip": false,
+      "rstrip": true,
       "normalized": false,
       "special": true
     },
@@ -142,15 +128,6 @@
       "rstrip": true,
       "normalized": false,
       "special": true
-    },
-    {
-      "id": 32011,
-      "content": "<|pad|>",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
     }
   ],
   "normalizer": {
tokenizer_config.json CHANGED
@@ -87,7 +87,7 @@
       "content": "<|end|>",
       "lstrip": false,
       "normalized": false,
-      "rstrip": false,
+      "rstrip": true,
       "single_word": false,
       "special": true
     },
@@ -114,24 +114,16 @@
       "rstrip": true,
       "single_word": false,
       "special": true
-    },
-    "32011": {
-      "content": "<|pad|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
     }
   },
   "bos_token": "<s>",
   "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
   "clean_up_tokenization_spaces": false,
-  "eos_token": "<|end|>",
+  "eos_token": "<|endoftext|>",
   "extra_special_tokens": {},
   "legacy": false,
   "model_max_length": 131072,
-  "pad_token": "<|pad|>",
+  "pad_token": "<|endoftext|>",
   "padding_side": "left",
   "sp_model_kwargs": {},
   "tokenizer_class": "LlamaTokenizer",
trainer_state.json CHANGED
@@ -1,50 +1,482 @@
 {
-  "best_metric": 0.7067741751670837,
-  "best_model_checkpoint": "./phi3_finetuned/checkpoint-16",
-  "epoch": 1.8235294117647058,
+  "best_metric": 0.6163371205329895,
+  "best_model_checkpoint": "//outputs/task7_microsoft/Phi-3.5-mini-instruct/checkpoint-200",
+  "epoch": 22.235294117647058,
   "eval_steps": 500,
-  "global_step": 16,
+  "global_step": 200,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
+    {
+      "epoch": 0.5882352941176471,
+      "grad_norm": 0.564612090587616,
+      "learning_rate": 5e-06,
+      "loss": 0.8053,
+      "step": 5
+    },
     {
       "epoch": 1.0,
-      "eval_loss": 0.7619180083274841,
-      "eval_runtime": 28.359,
-      "eval_samples_per_second": 0.529,
-      "eval_steps_per_second": 0.282,
+      "eval_loss": 0.7743130326271057,
+      "eval_runtime": 3.5233,
+      "eval_samples_per_second": 4.257,
+      "eval_steps_per_second": 0.568,
       "step": 9
     },
     {
       "epoch": 1.1176470588235294,
-      "grad_norm": 0.53853839635849,
-      "learning_rate": 0.000225,
-      "loss": 0.8601,
+      "grad_norm": 0.5536892414093018,
+      "learning_rate": 1e-05,
+      "loss": 0.7727,
       "step": 10
     },
     {
-      "epoch": 1.8235294117647058,
-      "eval_loss": 0.7067741751670837,
-      "eval_runtime": 28.3045,
-      "eval_samples_per_second": 0.53,
-      "eval_steps_per_second": 0.283,
-      "step": 16
+      "epoch": 1.7058823529411766,
+      "grad_norm": 0.3953665494918823,
+      "learning_rate": 9.98292246503335e-06,
+      "loss": 0.7726,
+      "step": 15
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.7348855137825012,
+      "eval_runtime": 3.3629,
+      "eval_samples_per_second": 4.46,
+      "eval_steps_per_second": 0.595,
+      "step": 18
+    },
+    {
+      "epoch": 2.235294117647059,
+      "grad_norm": 0.32548508048057556,
+      "learning_rate": 9.931806517013612e-06,
+      "loss": 0.7178,
+      "step": 20
+    },
+    {
+      "epoch": 2.8235294117647056,
+      "grad_norm": 0.27749133110046387,
+      "learning_rate": 9.847001329696653e-06,
+      "loss": 0.6607,
+      "step": 25
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.7112905383110046,
+      "eval_runtime": 3.3607,
+      "eval_samples_per_second": 4.463,
+      "eval_steps_per_second": 0.595,
+      "step": 27
+    },
+    {
+      "epoch": 3.3529411764705883,
+      "grad_norm": 0.28755590319633484,
+      "learning_rate": 9.729086208503174e-06,
+      "loss": 0.7081,
+      "step": 30
+    },
+    {
+      "epoch": 3.9411764705882355,
+      "grad_norm": 0.25980502367019653,
+      "learning_rate": 9.578866633275289e-06,
+      "loss": 0.6063,
+      "step": 35
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.6943528056144714,
+      "eval_runtime": 3.3629,
+      "eval_samples_per_second": 4.46,
+      "eval_steps_per_second": 0.595,
+      "step": 36
+    },
+    {
+      "epoch": 4.470588235294118,
+      "grad_norm": 0.23154301941394806,
+      "learning_rate": 9.397368756032445e-06,
+      "loss": 0.6561,
+      "step": 40
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.30559542775154114,
+      "learning_rate": 9.185832391312644e-06,
+      "loss": 0.6935,
+      "step": 45
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.6810200214385986,
+      "eval_runtime": 3.3611,
+      "eval_samples_per_second": 4.463,
+      "eval_steps_per_second": 0.595,
+      "step": 45
+    },
+    {
+      "epoch": 5.588235294117647,
+      "grad_norm": 0.21162718534469604,
+      "learning_rate": 8.94570254698197e-06,
+      "loss": 0.6829,
+      "step": 50
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 0.6704084277153015,
+      "eval_runtime": 3.3625,
+      "eval_samples_per_second": 4.461,
+      "eval_steps_per_second": 0.595,
+      "step": 54
+    },
+    {
+      "epoch": 6.117647058823529,
+      "grad_norm": 0.26222917437553406,
+      "learning_rate": 8.67861955336566e-06,
+      "loss": 0.6021,
+      "step": 55
+    },
+    {
+      "epoch": 6.705882352941177,
+      "grad_norm": 0.23411308228969574,
+      "learning_rate": 8.386407858128707e-06,
+      "loss": 0.6483,
+      "step": 60
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 0.6606718897819519,
+      "eval_runtime": 3.3601,
+      "eval_samples_per_second": 4.464,
+      "eval_steps_per_second": 0.595,
+      "step": 63
+    },
+    {
+      "epoch": 7.235294117647059,
+      "grad_norm": 0.18744103610515594,
+      "learning_rate": 8.071063563448341e-06,
+      "loss": 0.5817,
+      "step": 65
+    },
+    {
+      "epoch": 7.823529411764706,
+      "grad_norm": 0.18960484862327576,
+      "learning_rate": 7.734740790612137e-06,
+      "loss": 0.6352,
+      "step": 70
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 0.6521106958389282,
+      "eval_runtime": 3.3613,
+      "eval_samples_per_second": 4.463,
+      "eval_steps_per_second": 0.595,
+      "step": 72
+    },
+    {
+      "epoch": 8.352941176470589,
+      "grad_norm": 0.15531951189041138,
+      "learning_rate": 7.379736965185369e-06,
+      "loss": 0.5719,
+      "step": 75
+    },
+    {
+      "epoch": 8.941176470588236,
+      "grad_norm": 0.34726396203041077,
+      "learning_rate": 7.008477123264849e-06,
+      "loss": 0.6186,
+      "step": 80
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 0.6448661088943481,
+      "eval_runtime": 3.3624,
+      "eval_samples_per_second": 4.461,
+      "eval_steps_per_second": 0.595,
+      "step": 81
+    },
+    {
+      "epoch": 9.470588235294118,
+      "grad_norm": 0.1773035228252411,
+      "learning_rate": 6.6234973460234184e-06,
+      "loss": 0.6052,
+      "step": 85
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.2170713096857071,
+      "learning_rate": 6.227427435703997e-06,
+      "loss": 0.5415,
+      "step": 90
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 0.6390407681465149,
+      "eval_runtime": 3.3658,
+      "eval_samples_per_second": 4.457,
+      "eval_steps_per_second": 0.594,
+      "step": 90
+    },
+    {
+      "epoch": 10.588235294117647,
+      "grad_norm": 0.2540779709815979,
+      "learning_rate": 5.82297295140367e-06,
+      "loss": 0.6305,
+      "step": 95
+    },
+    {
+      "epoch": 11.0,
+      "eval_loss": 0.6332173943519592,
+      "eval_runtime": 3.3622,
+      "eval_samples_per_second": 4.461,
+      "eval_steps_per_second": 0.595,
+      "step": 99
+    },
+    {
+      "epoch": 11.117647058823529,
+      "grad_norm": 0.2432163953781128,
+      "learning_rate": 5.412896727361663e-06,
+      "loss": 0.5547,
+      "step": 100
+    },
+    {
+      "epoch": 11.705882352941176,
+      "grad_norm": 0.2414003312587738,
+      "learning_rate": 5e-06,
+      "loss": 0.5385,
+      "step": 105
+    },
+    {
+      "epoch": 12.0,
+      "eval_loss": 0.6285383701324463,
+      "eval_runtime": 3.3638,
+      "eval_samples_per_second": 4.459,
+      "eval_steps_per_second": 0.595,
+      "step": 108
+    },
+    {
+      "epoch": 12.235294117647058,
+      "grad_norm": 0.2067604809999466,
+      "learning_rate": 4.587103272638339e-06,
+      "loss": 0.536,
+      "step": 110
+    },
+    {
+      "epoch": 12.823529411764707,
+      "grad_norm": 0.29979485273361206,
+      "learning_rate": 4.17702704859633e-06,
+      "loss": 0.5896,
+      "step": 115
+    },
+    {
+      "epoch": 13.0,
+      "eval_loss": 0.6254769563674927,
+      "eval_runtime": 3.3694,
+      "eval_samples_per_second": 4.452,
+      "eval_steps_per_second": 0.594,
+      "step": 117
+    },
+    {
+      "epoch": 13.352941176470589,
+      "grad_norm": 0.1513441950082779,
+      "learning_rate": 3.7725725642960047e-06,
+      "loss": 0.5415,
+      "step": 120
+    },
+    {
+      "epoch": 13.941176470588236,
+      "grad_norm": 0.2250215709209442,
+      "learning_rate": 3.3765026539765832e-06,
+      "loss": 0.5612,
+      "step": 125
+    },
+    {
+      "epoch": 14.0,
+      "eval_loss": 0.6232194900512695,
+      "eval_runtime": 3.3613,
+      "eval_samples_per_second": 4.463,
+      "eval_steps_per_second": 0.595,
+      "step": 126
+    },
+    {
+      "epoch": 14.470588235294118,
+      "grad_norm": 0.21195632219314575,
+      "learning_rate": 2.991522876735154e-06,
+      "loss": 0.5624,
+      "step": 130
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 0.4384087026119232,
+      "learning_rate": 2.6202630348146323e-06,
+      "loss": 0.5871,
+      "step": 135
+    },
+    {
+      "epoch": 15.0,
+      "eval_loss": 0.6213398575782776,
+      "eval_runtime": 3.3593,
+      "eval_samples_per_second": 4.465,
+      "eval_steps_per_second": 0.595,
+      "step": 135
+    },
+    {
+      "epoch": 15.588235294117647,
+      "grad_norm": 0.23890897631645203,
+      "learning_rate": 2.265259209387867e-06,
+      "loss": 0.5352,
+      "step": 140
+    },
+    {
+      "epoch": 16.0,
+      "eval_loss": 0.6193457841873169,
+      "eval_runtime": 3.3601,
+      "eval_samples_per_second": 4.464,
+      "eval_steps_per_second": 0.595,
+      "step": 144
+    },
+    {
+      "epoch": 16.11764705882353,
+      "grad_norm": 0.24785251915454865,
+      "learning_rate": 1.928936436551661e-06,
+      "loss": 0.5998,
+      "step": 145
+    },
+    {
+      "epoch": 16.705882352941178,
+      "grad_norm": 0.21428382396697998,
+      "learning_rate": 1.6135921418712959e-06,
+      "loss": 0.5564,
+      "step": 150
+    },
+    {
+      "epoch": 17.0,
+      "eval_loss": 0.618452787399292,
+      "eval_runtime": 3.3625,
+      "eval_samples_per_second": 4.461,
+      "eval_steps_per_second": 0.595,
+      "step": 153
+    },
+    {
+      "epoch": 17.235294117647058,
+      "grad_norm": 0.19924059510231018,
+      "learning_rate": 1.321380446634342e-06,
+      "loss": 0.4868,
+      "step": 155
+    },
+    {
+      "epoch": 17.823529411764707,
+      "grad_norm": 0.16416364908218384,
+      "learning_rate": 1.0542974530180327e-06,
+      "loss": 0.6029,
+      "step": 160
+    },
+    {
+      "epoch": 18.0,
+      "eval_loss": 0.6172903776168823,
+      "eval_runtime": 3.3616,
+      "eval_samples_per_second": 4.462,
+      "eval_steps_per_second": 0.595,
+      "step": 162
+    },
+    {
+      "epoch": 18.352941176470587,
+      "grad_norm": 0.21794988214969635,
+      "learning_rate": 8.141676086873574e-07,
+      "loss": 0.4832,
+      "step": 165
+    },
+    {
+      "epoch": 18.941176470588236,
+      "grad_norm": 0.27910733222961426,
+      "learning_rate": 6.026312439675553e-07,
+      "loss": 0.5107,
+      "step": 170
+    },
+    {
+      "epoch": 19.0,
+      "eval_loss": 0.617369532585144,
+      "eval_runtime": 3.3609,
+      "eval_samples_per_second": 4.463,
+      "eval_steps_per_second": 0.595,
+      "step": 171
+    },
+    {
+      "epoch": 19.470588235294116,
+      "grad_norm": 0.21645767986774445,
+      "learning_rate": 4.211333667247125e-07,
+      "loss": 0.5692,
+      "step": 175
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 0.39115971326828003,
+      "learning_rate": 2.7091379149682683e-07,
+      "loss": 0.5808,
+      "step": 180
+    },
+    {
+      "epoch": 20.0,
+      "eval_loss": 0.6167533993721008,
+      "eval_runtime": 3.3616,
+      "eval_samples_per_second": 4.462,
+      "eval_steps_per_second": 0.595,
+      "step": 180
+    },
+    {
+      "epoch": 20.58823529411765,
+      "grad_norm": 0.26653149724006653,
+      "learning_rate": 1.5299867030334815e-07,
+      "loss": 0.5835,
+      "step": 185
+    },
+    {
+      "epoch": 21.0,
+      "eval_loss": 0.6167729496955872,
+      "eval_runtime": 3.3615,
+      "eval_samples_per_second": 4.462,
+      "eval_steps_per_second": 0.595,
+      "step": 189
+    },
+    {
+      "epoch": 21.11764705882353,
+      "grad_norm": 0.27125898003578186,
+      "learning_rate": 6.819348298638839e-08,
+      "loss": 0.5515,
+      "step": 190
+    },
+    {
+      "epoch": 21.705882352941178,
+      "grad_norm": 0.20525327324867249,
+      "learning_rate": 1.7077534966650767e-08,
+      "loss": 0.5211,
+      "step": 195
+    },
+    {
+      "epoch": 22.0,
+      "eval_loss": 0.6173871159553528,
+      "eval_runtime": 3.3629,
+      "eval_samples_per_second": 4.46,
+      "eval_steps_per_second": 0.595,
+      "step": 198
+    },
+    {
+      "epoch": 22.235294117647058,
+      "grad_norm": 0.19269497692584991,
+      "learning_rate": 0.0,
+      "loss": 0.5147,
+      "step": 200
     },
     {
-      "epoch": 1.8235294117647058,
-      "step": 16,
-      "total_flos": 1.1336724214972416e+16,
-      "train_loss": 0.8289451897144318,
-      "train_runtime": 1583.6827,
-      "train_samples_per_second": 0.17,
-      "train_steps_per_second": 0.01
+      "epoch": 22.235294117647058,
+      "eval_loss": 0.6163371205329895,
+      "eval_runtime": 3.3629,
+      "eval_samples_per_second": 4.46,
+      "eval_steps_per_second": 0.595,
+      "step": 200
     }
   ],
-  "logging_steps": 10,
-  "max_steps": 16,
+  "logging_steps": 5,
+  "max_steps": 200,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 2,
+  "num_train_epochs": 25,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -58,7 +490,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.1336724214972416e+16,
+  "total_flos": 1.0471069689549005e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:08d02580fa61a9f76991a83deddc72fb2290b7a3aa24cadbb04f537cefeeef71
-size 5304
+oid sha256:354b637532320af3e4fc7a75a7a30ab3076b3ef28de912201c92125d861c2822
+size 5624