quyanh commited on
Commit
0c07ad8
·
1 Parent(s): a40a5fe

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -16,12 +16,12 @@
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
- "v_proj",
20
  "lm_head",
21
- "q_proj",
22
  "down_proj",
23
- "k_proj",
24
  "up_proj",
 
 
25
  "gate_proj",
26
  "o_proj"
27
  ],
 
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
 
19
  "lm_head",
 
20
  "down_proj",
21
+ "q_proj",
22
  "up_proj",
23
+ "k_proj",
24
+ "v_proj",
25
  "gate_proj",
26
  "o_proj"
27
  ],
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c57cc792c2d04958a7adb1d1eeb063030e0b3db7decb679cd387e8c51dfdb21
3
  size 85100592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6efe7d5921665fda917c906f1427a1880a0089b02cd0f7eb51b99a6d43957ac
3
  size 85100592
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c8efa25384b32ed5e79edff75108806386b469115e4f287d779f0e1cb05ffb4
3
- size 43126695
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5600b3956d0427774a54d7f3bf3b7938a23d41b7b69dc207ab3e78e2479d7f37
3
+ size 43127132
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e5e7f0ce4eef3ed4815bac2b344952bea148a18b69ca09c3b646335d6476562
3
- size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad9dc56dfc01b966639cac8cd8f049d1b8d912aa8e90419adc5b16ff7e57382f
3
+ size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:890a05fec8ca27fb18cd86c380b6b21cc3b117cc9b2e5175e533b6b8ebd64c7b
3
- size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eedfec4bc6f5a00a07b48568793720af057cf404e35f584cd69e8f806039a34d
3
+ size 1064
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.849707912904939,
5
  "eval_steps": 20,
6
  "global_step": 400,
7
  "is_hyper_param_search": false,
@@ -11,286 +11,287 @@
11
  {
12
  "epoch": 0.04,
13
  "learning_rate": 1.9325842696629215e-05,
14
- "loss": 1.8168,
15
  "step": 20
16
  },
17
  {
18
  "epoch": 0.04,
19
- "eval_loss": 1.768841028213501,
20
- "eval_runtime": 215.902,
21
- "eval_samples_per_second": 1.931,
22
- "eval_steps_per_second": 0.195,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.08,
27
  "learning_rate": 1.8426966292134835e-05,
28
- "loss": 1.7432,
29
  "step": 40
30
  },
31
  {
32
- "epoch": 0.08,
33
- "eval_loss": 1.7144243717193604,
34
- "eval_runtime": 215.6923,
35
- "eval_samples_per_second": 1.933,
36
- "eval_steps_per_second": 0.195,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.13,
41
  "learning_rate": 1.752808988764045e-05,
42
- "loss": 1.7052,
43
  "step": 60
44
  },
45
  {
46
  "epoch": 0.13,
47
- "eval_loss": 1.678654432296753,
48
- "eval_runtime": 215.6797,
49
- "eval_samples_per_second": 1.933,
50
- "eval_steps_per_second": 0.195,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.17,
55
  "learning_rate": 1.662921348314607e-05,
56
- "loss": 1.6851,
57
  "step": 80
58
  },
59
  {
60
  "epoch": 0.17,
61
- "eval_loss": 1.6587953567504883,
62
- "eval_runtime": 215.657,
63
- "eval_samples_per_second": 1.934,
64
- "eval_steps_per_second": 0.195,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.21,
69
  "learning_rate": 1.5730337078651687e-05,
70
- "loss": 1.6265,
71
  "step": 100
72
  },
73
  {
74
  "epoch": 0.21,
75
- "eval_loss": 1.6458046436309814,
76
- "eval_runtime": 215.6322,
77
- "eval_samples_per_second": 1.934,
78
- "eval_steps_per_second": 0.195,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.25,
83
  "learning_rate": 1.4831460674157305e-05,
84
- "loss": 1.6617,
85
  "step": 120
86
  },
87
  {
88
- "epoch": 0.25,
89
- "eval_loss": 1.636366844177246,
90
- "eval_runtime": 215.6707,
91
- "eval_samples_per_second": 1.934,
92
- "eval_steps_per_second": 0.195,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3,
97
  "learning_rate": 1.3932584269662923e-05,
98
- "loss": 1.651,
99
  "step": 140
100
  },
101
  {
102
  "epoch": 0.3,
103
- "eval_loss": 1.6294023990631104,
104
- "eval_runtime": 215.621,
105
- "eval_samples_per_second": 1.934,
106
- "eval_steps_per_second": 0.195,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.34,
111
  "learning_rate": 1.303370786516854e-05,
112
- "loss": 1.6218,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 0.34,
117
- "eval_loss": 1.6229923963546753,
118
- "eval_runtime": 215.6539,
119
- "eval_samples_per_second": 1.934,
120
- "eval_steps_per_second": 0.195,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.38,
125
  "learning_rate": 1.213483146067416e-05,
126
- "loss": 1.625,
127
  "step": 180
128
  },
129
  {
130
  "epoch": 0.38,
131
- "eval_loss": 1.6182714700698853,
132
- "eval_runtime": 215.6429,
133
- "eval_samples_per_second": 1.934,
134
- "eval_steps_per_second": 0.195,
135
  "step": 180
136
  },
137
  {
138
- "epoch": 0.42,
139
  "learning_rate": 1.1235955056179778e-05,
140
- "loss": 1.6358,
141
  "step": 200
142
  },
143
  {
144
- "epoch": 0.42,
145
- "eval_loss": 1.6151902675628662,
146
- "eval_runtime": 215.6354,
147
- "eval_samples_per_second": 1.934,
148
- "eval_steps_per_second": 0.195,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.47,
153
  "learning_rate": 1.0337078651685396e-05,
154
- "loss": 1.6118,
155
  "step": 220
156
  },
157
  {
158
  "epoch": 0.47,
159
- "eval_loss": 1.6117970943450928,
160
- "eval_runtime": 215.6265,
161
- "eval_samples_per_second": 1.934,
162
- "eval_steps_per_second": 0.195,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.51,
167
  "learning_rate": 9.438202247191012e-06,
168
- "loss": 1.5914,
169
  "step": 240
170
  },
171
  {
172
  "epoch": 0.51,
173
- "eval_loss": 1.6088885068893433,
174
- "eval_runtime": 215.6483,
175
- "eval_samples_per_second": 1.934,
176
- "eval_steps_per_second": 0.195,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.55,
181
  "learning_rate": 8.53932584269663e-06,
182
- "loss": 1.6321,
183
  "step": 260
184
  },
185
  {
186
  "epoch": 0.55,
187
- "eval_loss": 1.6063815355300903,
188
- "eval_runtime": 215.612,
189
- "eval_samples_per_second": 1.934,
190
- "eval_steps_per_second": 0.195,
191
  "step": 260
192
  },
193
  {
194
- "epoch": 0.59,
195
  "learning_rate": 7.640449438202247e-06,
196
- "loss": 1.6139,
197
  "step": 280
198
  },
199
  {
200
- "epoch": 0.59,
201
- "eval_loss": 1.6046371459960938,
202
- "eval_runtime": 215.6502,
203
- "eval_samples_per_second": 1.934,
204
- "eval_steps_per_second": 0.195,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.64,
209
  "learning_rate": 6.741573033707865e-06,
210
- "loss": 1.5742,
211
  "step": 300
212
  },
213
  {
214
  "epoch": 0.64,
215
- "eval_loss": 1.6029813289642334,
216
- "eval_runtime": 215.6664,
217
- "eval_samples_per_second": 1.934,
218
- "eval_steps_per_second": 0.195,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.68,
223
  "learning_rate": 5.842696629213483e-06,
224
- "loss": 1.5757,
225
  "step": 320
226
  },
227
  {
228
  "epoch": 0.68,
229
- "eval_loss": 1.6015688180923462,
230
- "eval_runtime": 215.6424,
231
- "eval_samples_per_second": 1.934,
232
- "eval_steps_per_second": 0.195,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.72,
237
  "learning_rate": 4.943820224719101e-06,
238
- "loss": 1.577,
239
  "step": 340
240
  },
241
  {
242
  "epoch": 0.72,
243
- "eval_loss": 1.6001653671264648,
244
- "eval_runtime": 215.6467,
245
- "eval_samples_per_second": 1.934,
246
- "eval_steps_per_second": 0.195,
247
  "step": 340
248
  },
249
  {
250
- "epoch": 0.76,
251
  "learning_rate": 4.04494382022472e-06,
252
- "loss": 1.6077,
253
  "step": 360
254
  },
255
  {
256
- "epoch": 0.76,
257
- "eval_loss": 1.5988893508911133,
258
- "eval_runtime": 215.6429,
259
- "eval_samples_per_second": 1.934,
260
- "eval_steps_per_second": 0.195,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.81,
265
  "learning_rate": 3.146067415730337e-06,
266
- "loss": 1.6011,
267
  "step": 380
268
  },
269
  {
270
  "epoch": 0.81,
271
- "eval_loss": 1.597815752029419,
272
- "eval_runtime": 215.6307,
273
- "eval_samples_per_second": 1.934,
274
- "eval_steps_per_second": 0.195,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.85,
279
  "learning_rate": 2.2471910112359554e-06,
280
- "loss": 1.5982,
281
  "step": 400
282
  },
283
  {
284
  "epoch": 0.85,
285
- "eval_loss": 1.5970968008041382,
286
- "eval_runtime": 215.6746,
287
- "eval_samples_per_second": 1.933,
288
- "eval_steps_per_second": 0.195,
289
  "step": 400
290
  }
291
  ],
292
  "logging_steps": 20,
293
  "max_steps": 450,
 
294
  "num_train_epochs": 1,
295
  "save_steps": 20,
296
  "total_flos": 2.804385205714944e+17,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8506113769271664,
5
  "eval_steps": 20,
6
  "global_step": 400,
7
  "is_hyper_param_search": false,
 
11
  {
12
  "epoch": 0.04,
13
  "learning_rate": 1.9325842696629215e-05,
14
+ "loss": 1.8391,
15
  "step": 20
16
  },
17
  {
18
  "epoch": 0.04,
19
+ "eval_loss": 1.7458518743515015,
20
+ "eval_runtime": 218.3179,
21
+ "eval_samples_per_second": 1.928,
22
+ "eval_steps_per_second": 0.197,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.09,
27
  "learning_rate": 1.8426966292134835e-05,
28
+ "loss": 1.7536,
29
  "step": 40
30
  },
31
  {
32
+ "epoch": 0.09,
33
+ "eval_loss": 1.693428635597229,
34
+ "eval_runtime": 218.0486,
35
+ "eval_samples_per_second": 1.931,
36
+ "eval_steps_per_second": 0.197,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.13,
41
  "learning_rate": 1.752808988764045e-05,
42
+ "loss": 1.7346,
43
  "step": 60
44
  },
45
  {
46
  "epoch": 0.13,
47
+ "eval_loss": 1.6598807573318481,
48
+ "eval_runtime": 218.036,
49
+ "eval_samples_per_second": 1.931,
50
+ "eval_steps_per_second": 0.197,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.17,
55
  "learning_rate": 1.662921348314607e-05,
56
+ "loss": 1.6951,
57
  "step": 80
58
  },
59
  {
60
  "epoch": 0.17,
61
+ "eval_loss": 1.639635443687439,
62
+ "eval_runtime": 218.0597,
63
+ "eval_samples_per_second": 1.931,
64
+ "eval_steps_per_second": 0.197,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.21,
69
  "learning_rate": 1.5730337078651687e-05,
70
+ "loss": 1.6508,
71
  "step": 100
72
  },
73
  {
74
  "epoch": 0.21,
75
+ "eval_loss": 1.626449465751648,
76
+ "eval_runtime": 218.0237,
77
+ "eval_samples_per_second": 1.931,
78
+ "eval_steps_per_second": 0.197,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.26,
83
  "learning_rate": 1.4831460674157305e-05,
84
+ "loss": 1.6245,
85
  "step": 120
86
  },
87
  {
88
+ "epoch": 0.26,
89
+ "eval_loss": 1.616517186164856,
90
+ "eval_runtime": 218.0987,
91
+ "eval_samples_per_second": 1.93,
92
+ "eval_steps_per_second": 0.197,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.3,
97
  "learning_rate": 1.3932584269662923e-05,
98
+ "loss": 1.6023,
99
  "step": 140
100
  },
101
  {
102
  "epoch": 0.3,
103
+ "eval_loss": 1.6096081733703613,
104
+ "eval_runtime": 218.0759,
105
+ "eval_samples_per_second": 1.931,
106
+ "eval_steps_per_second": 0.197,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.34,
111
  "learning_rate": 1.303370786516854e-05,
112
+ "loss": 1.6259,
113
  "step": 160
114
  },
115
  {
116
  "epoch": 0.34,
117
+ "eval_loss": 1.6037150621414185,
118
+ "eval_runtime": 218.0228,
119
+ "eval_samples_per_second": 1.931,
120
+ "eval_steps_per_second": 0.197,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.38,
125
  "learning_rate": 1.213483146067416e-05,
126
+ "loss": 1.6116,
127
  "step": 180
128
  },
129
  {
130
  "epoch": 0.38,
131
+ "eval_loss": 1.599488377571106,
132
+ "eval_runtime": 218.0297,
133
+ "eval_samples_per_second": 1.931,
134
+ "eval_steps_per_second": 0.197,
135
  "step": 180
136
  },
137
  {
138
+ "epoch": 0.43,
139
  "learning_rate": 1.1235955056179778e-05,
140
+ "loss": 1.6288,
141
  "step": 200
142
  },
143
  {
144
+ "epoch": 0.43,
145
+ "eval_loss": 1.595850944519043,
146
+ "eval_runtime": 218.1168,
147
+ "eval_samples_per_second": 1.93,
148
+ "eval_steps_per_second": 0.197,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.47,
153
  "learning_rate": 1.0337078651685396e-05,
154
+ "loss": 1.6328,
155
  "step": 220
156
  },
157
  {
158
  "epoch": 0.47,
159
+ "eval_loss": 1.5929853916168213,
160
+ "eval_runtime": 218.0481,
161
+ "eval_samples_per_second": 1.931,
162
+ "eval_steps_per_second": 0.197,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.51,
167
  "learning_rate": 9.438202247191012e-06,
168
+ "loss": 1.622,
169
  "step": 240
170
  },
171
  {
172
  "epoch": 0.51,
173
+ "eval_loss": 1.590191125869751,
174
+ "eval_runtime": 218.0232,
175
+ "eval_samples_per_second": 1.931,
176
+ "eval_steps_per_second": 0.197,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.55,
181
  "learning_rate": 8.53932584269663e-06,
182
+ "loss": 1.5966,
183
  "step": 260
184
  },
185
  {
186
  "epoch": 0.55,
187
+ "eval_loss": 1.5878028869628906,
188
+ "eval_runtime": 218.0617,
189
+ "eval_samples_per_second": 1.931,
190
+ "eval_steps_per_second": 0.197,
191
  "step": 260
192
  },
193
  {
194
+ "epoch": 0.6,
195
  "learning_rate": 7.640449438202247e-06,
196
+ "loss": 1.6094,
197
  "step": 280
198
  },
199
  {
200
+ "epoch": 0.6,
201
+ "eval_loss": 1.5858081579208374,
202
+ "eval_runtime": 218.1219,
203
+ "eval_samples_per_second": 1.93,
204
+ "eval_steps_per_second": 0.197,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.64,
209
  "learning_rate": 6.741573033707865e-06,
210
+ "loss": 1.5934,
211
  "step": 300
212
  },
213
  {
214
  "epoch": 0.64,
215
+ "eval_loss": 1.584080696105957,
216
+ "eval_runtime": 218.0609,
217
+ "eval_samples_per_second": 1.931,
218
+ "eval_steps_per_second": 0.197,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.68,
223
  "learning_rate": 5.842696629213483e-06,
224
+ "loss": 1.59,
225
  "step": 320
226
  },
227
  {
228
  "epoch": 0.68,
229
+ "eval_loss": 1.5824154615402222,
230
+ "eval_runtime": 218.0845,
231
+ "eval_samples_per_second": 1.93,
232
+ "eval_steps_per_second": 0.197,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.72,
237
  "learning_rate": 4.943820224719101e-06,
238
+ "loss": 1.6134,
239
  "step": 340
240
  },
241
  {
242
  "epoch": 0.72,
243
+ "eval_loss": 1.5810182094573975,
244
+ "eval_runtime": 218.1022,
245
+ "eval_samples_per_second": 1.93,
246
+ "eval_steps_per_second": 0.197,
247
  "step": 340
248
  },
249
  {
250
+ "epoch": 0.77,
251
  "learning_rate": 4.04494382022472e-06,
252
+ "loss": 1.5954,
253
  "step": 360
254
  },
255
  {
256
+ "epoch": 0.77,
257
+ "eval_loss": 1.5799251794815063,
258
+ "eval_runtime": 218.1537,
259
+ "eval_samples_per_second": 1.93,
260
+ "eval_steps_per_second": 0.197,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.81,
265
  "learning_rate": 3.146067415730337e-06,
266
+ "loss": 1.5973,
267
  "step": 380
268
  },
269
  {
270
  "epoch": 0.81,
271
+ "eval_loss": 1.579064130783081,
272
+ "eval_runtime": 218.0366,
273
+ "eval_samples_per_second": 1.931,
274
+ "eval_steps_per_second": 0.197,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.85,
279
  "learning_rate": 2.2471910112359554e-06,
280
+ "loss": 1.5468,
281
  "step": 400
282
  },
283
  {
284
  "epoch": 0.85,
285
+ "eval_loss": 1.5782713890075684,
286
+ "eval_runtime": 218.0535,
287
+ "eval_samples_per_second": 1.931,
288
+ "eval_steps_per_second": 0.197,
289
  "step": 400
290
  }
291
  ],
292
  "logging_steps": 20,
293
  "max_steps": 450,
294
+ "num_input_tokens_seen": 0,
295
  "num_train_epochs": 1,
296
  "save_steps": 20,
297
  "total_flos": 2.804385205714944e+17,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab3305b7dcf9288a7ba6e67a27b1b835f0e809d24726ea214c0d4a17cbd03386
3
- size 4155
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18f22170b63f557ce8ed503ea5362cfa3b2bf6a88a049f98dbc25116e2a6df00
3
+ size 4664