danwils committed
Commit e14f681 · verified · 1 Parent(s): 2699d86

Upload folder using huggingface_hub

README.md ADDED
@@ -0,0 +1,344 @@
1
+ ---
2
+ license: apache-2.0
3
+ library_name: peft
4
+ tags:
5
+ - generated_from_trainer
6
+ base_model: sail/Sailor-7B
7
+ model-index:
8
+ - name: Sailor-7B-toba
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
16
+ <details><summary>See axolotl config</summary>
17
+
18
+ axolotl version: `0.4.0`
19
+ ```yaml
20
+ base_model: sail/Sailor-7B
21
+ model_type: AutoModelForCausalLM
22
+ tokenizer_type: AutoTokenizer
23
+ is_mistral_derived_model: false
24
+
25
+ load_in_8bit: false
26
+ load_in_4bit: true
27
+ strict: false
28
+
29
+ datasets:
30
+ #we used a small dataset to teach the model function calling abilities
31
+ - path: ./echonettobatrain.jsonl
32
+ ds_type: json
33
+ type: sharegpt
34
+
35
+ dataset_prepared_path: last_run_function_call
36
+ #0.05
37
+ val_set_size: 0.02
38
+ output_dir: ./Sailor-7B-toba
39
+
40
+ adapter: qlora
41
+ lora_model_dir:
42
+
43
+ sequence_len: 8192
44
+ sample_packing: false
45
+ eval_sample_packing: true
46
+ pad_to_sequence_len: true
47
+
48
+ # important: to get the same number of trainable parameters as a QLoRA run with lora_r=32 and lora_alpha=16, adjust lora_r according to the number of filtered layers you keep. With top_n=4 you would use lora_r=256
49
+
50
+ lora_r: 64
51
+ lora_alpha: 16
52
+ lora_dropout: 0.05
53
+ lora_target_linear: false
54
+ lora_fan_in_fan_out:
55
+ lora_target_modules:
56
+ - layers.0.self_attn.v_proj
57
+ - layers.1.self_attn.v_proj
58
+ - layers.2.self_attn.v_proj
59
+ - layers.3.self_attn.v_proj
60
+ - layers.4.self_attn.v_proj
61
+ - layers.5.self_attn.v_proj
62
+ - layers.6.self_attn.v_proj
63
+ - layers.7.self_attn.v_proj
64
+ - layers.8.self_attn.v_proj
65
+ - layers.9.self_attn.v_proj
66
+ - layers.10.self_attn.v_proj
67
+ - layers.11.self_attn.v_proj
68
+ - layers.12.self_attn.v_proj
69
+ - layers.13.self_attn.v_proj
70
+ - layers.14.self_attn.v_proj
71
+ - layers.15.self_attn.v_proj
72
+ - layers.16.self_attn.v_proj
73
+ - layers.17.self_attn.v_proj
74
+ - layers.18.self_attn.v_proj
75
+ - layers.19.self_attn.v_proj
76
+ - layers.20.self_attn.v_proj
77
+ - layers.21.self_attn.v_proj
78
+ - layers.22.self_attn.v_proj
79
+ - layers.23.self_attn.v_proj
80
+ - layers.24.self_attn.v_proj
81
+ - layers.25.self_attn.v_proj
82
+ - layers.26.self_attn.v_proj
83
+ - layers.27.self_attn.v_proj
84
+ - layers.28.self_attn.v_proj
85
+ - layers.29.self_attn.v_proj
86
+ - layers.30.self_attn.v_proj
87
+ - layers.31.self_attn.v_proj
88
+ - layers.0.self_attn.k_proj
89
+ - layers.1.self_attn.k_proj
90
+ - layers.2.self_attn.k_proj
91
+ - layers.3.self_attn.k_proj
92
+ - layers.4.self_attn.k_proj
93
+ - layers.5.self_attn.k_proj
94
+ - layers.6.self_attn.k_proj
95
+ - layers.7.self_attn.k_proj
96
+ - layers.8.self_attn.k_proj
97
+ - layers.9.self_attn.k_proj
98
+ - layers.10.self_attn.k_proj
99
+ - layers.11.self_attn.k_proj
100
+ - layers.12.self_attn.k_proj
101
+ - layers.13.self_attn.k_proj
102
+ - layers.14.self_attn.k_proj
103
+ - layers.15.self_attn.k_proj
104
+ - layers.16.self_attn.k_proj
105
+ - layers.17.self_attn.k_proj
106
+ - layers.18.self_attn.k_proj
107
+ - layers.19.self_attn.k_proj
108
+ - layers.20.self_attn.k_proj
109
+ - layers.21.self_attn.k_proj
110
+ - layers.22.self_attn.k_proj
111
+ - layers.23.self_attn.k_proj
112
+ - layers.24.self_attn.k_proj
113
+ - layers.25.self_attn.k_proj
114
+ - layers.26.self_attn.k_proj
115
+ - layers.27.self_attn.k_proj
116
+ - layers.28.self_attn.k_proj
117
+ - layers.29.self_attn.k_proj
118
+ - layers.30.self_attn.k_proj
119
+ - layers.31.self_attn.k_proj
120
+ - layers.0.self_attn.o_proj
121
+ - layers.1.self_attn.o_proj
122
+ - layers.2.self_attn.o_proj
123
+ - layers.3.self_attn.o_proj
124
+ - layers.4.self_attn.o_proj
125
+ - layers.5.self_attn.o_proj
126
+ - layers.6.self_attn.o_proj
127
+ - layers.7.self_attn.o_proj
128
+ - layers.8.self_attn.o_proj
129
+ - layers.9.self_attn.o_proj
130
+ - layers.10.self_attn.o_proj
131
+ - layers.11.self_attn.o_proj
132
+ - layers.12.self_attn.o_proj
133
+ - layers.13.self_attn.o_proj
134
+ - layers.14.self_attn.o_proj
135
+ - layers.15.self_attn.o_proj
136
+ - layers.16.self_attn.o_proj
137
+ - layers.17.self_attn.o_proj
138
+ - layers.18.self_attn.o_proj
139
+ - layers.19.self_attn.o_proj
140
+ - layers.20.self_attn.o_proj
141
+ - layers.21.self_attn.o_proj
142
+ - layers.22.self_attn.o_proj
143
+ - layers.23.self_attn.o_proj
144
+ - layers.24.self_attn.o_proj
145
+ - layers.25.self_attn.o_proj
146
+ - layers.26.self_attn.o_proj
147
+ - layers.27.self_attn.o_proj
148
+ - layers.28.self_attn.o_proj
149
+ - layers.29.self_attn.o_proj
150
+ - layers.30.self_attn.o_proj
151
+ - layers.31.self_attn.o_proj
152
+ - layers.0.self_attn.q_proj
153
+ - layers.1.self_attn.q_proj
154
+ - layers.2.self_attn.q_proj
155
+ - layers.3.self_attn.q_proj
156
+ - layers.4.self_attn.q_proj
157
+ - layers.5.self_attn.q_proj
158
+ - layers.6.self_attn.q_proj
159
+ - layers.7.self_attn.q_proj
160
+ - layers.8.self_attn.q_proj
161
+ - layers.9.self_attn.q_proj
162
+ - layers.10.self_attn.q_proj
163
+ - layers.11.self_attn.q_proj
164
+ - layers.12.self_attn.q_proj
165
+ - layers.13.self_attn.q_proj
166
+ - layers.14.self_attn.q_proj
167
+ - layers.15.self_attn.q_proj
168
+ - layers.16.self_attn.q_proj
169
+ - layers.17.self_attn.q_proj
170
+ - layers.18.self_attn.q_proj
171
+ - layers.19.self_attn.q_proj
172
+ - layers.20.self_attn.q_proj
173
+ - layers.21.self_attn.q_proj
174
+ - layers.22.self_attn.q_proj
175
+ - layers.23.self_attn.q_proj
176
+ - layers.24.self_attn.q_proj
177
+ - layers.25.self_attn.q_proj
178
+ - layers.26.self_attn.q_proj
179
+ - layers.27.self_attn.q_proj
180
+ - layers.28.self_attn.q_proj
181
+ - layers.29.self_attn.q_proj
182
+ - layers.30.self_attn.q_proj
183
+ - layers.31.self_attn.q_proj
184
+ - layers.0.mlp.down_proj
185
+ - layers.1.mlp.down_proj
186
+ - layers.2.mlp.down_proj
187
+ - layers.3.mlp.down_proj
188
+ - layers.4.mlp.down_proj
189
+ - layers.5.mlp.down_proj
190
+ - layers.6.mlp.down_proj
191
+ - layers.7.mlp.down_proj
192
+ - layers.8.mlp.down_proj
193
+ - layers.9.mlp.down_proj
194
+ - layers.10.mlp.down_proj
195
+ - layers.11.mlp.down_proj
196
+ - layers.12.mlp.down_proj
197
+ - layers.13.mlp.down_proj
198
+ - layers.14.mlp.down_proj
199
+ - layers.15.mlp.down_proj
200
+ - layers.16.mlp.down_proj
201
+ - layers.17.mlp.down_proj
202
+ - layers.18.mlp.down_proj
203
+ - layers.19.mlp.down_proj
204
+ - layers.20.mlp.down_proj
205
+ - layers.21.mlp.down_proj
206
+ - layers.22.mlp.down_proj
207
+ - layers.23.mlp.down_proj
208
+ - layers.24.mlp.down_proj
209
+ - layers.25.mlp.down_proj
210
+ - layers.26.mlp.down_proj
211
+ - layers.27.mlp.down_proj
212
+ - layers.28.mlp.down_proj
213
+ - layers.29.mlp.down_proj
214
+ - layers.30.mlp.down_proj
215
+ - layers.31.mlp.down_proj
216
+ - layers.0.mlp.up_proj
217
+ - layers.1.mlp.up_proj
218
+ - layers.2.mlp.up_proj
219
+ - layers.3.mlp.up_proj
220
+ - layers.4.mlp.up_proj
221
+ - layers.5.mlp.up_proj
222
+ - layers.6.mlp.up_proj
223
+ - layers.7.mlp.up_proj
224
+ - layers.8.mlp.up_proj
225
+ - layers.9.mlp.up_proj
226
+ - layers.10.mlp.up_proj
227
+ - layers.11.mlp.up_proj
228
+ - layers.12.mlp.up_proj
229
+ - layers.13.mlp.up_proj
230
+ - layers.14.mlp.up_proj
231
+ - layers.15.mlp.up_proj
232
+ - layers.16.mlp.up_proj
233
+ - layers.17.mlp.up_proj
234
+ - layers.18.mlp.up_proj
235
+ - layers.19.mlp.up_proj
236
+ - layers.20.mlp.up_proj
237
+ - layers.21.mlp.up_proj
238
+ - layers.22.mlp.up_proj
239
+ - layers.23.mlp.up_proj
240
+ - layers.24.mlp.up_proj
241
+ - layers.25.mlp.up_proj
242
+ - layers.26.mlp.up_proj
243
+ - layers.27.mlp.up_proj
244
+ - layers.28.mlp.up_proj
245
+ - layers.29.mlp.up_proj
246
+ - layers.30.mlp.up_proj
247
+ - layers.31.mlp.up_proj
248
+ # important: you need to unfreeze the lm.head
249
+ - lm.head
250
+
251
+ wandb_project: axolotl-sailor7b-toba
252
+ wandb_entity:
253
+ wandb_watch:
254
+ wandb_run_id:
255
+ wandb_log_model:
256
+
257
+ gradient_accumulation_steps: 2
258
+ micro_batch_size: 2
259
+ num_epochs: 3
260
+ optimizer: adamw_bnb_8bit
261
+ lr_scheduler: cosine
262
+ learning_rate: 0.00025
263
+
264
+ train_on_inputs: false
265
+ group_by_length: false
266
+ bf16: true
267
+ fp16: false
268
+ tf32: false
269
+
270
+ gradient_checkpointing: true
271
+ early_stopping_patience:
272
+ resume_from_checkpoint:
273
+ local_rank:
274
+ logging_steps: 1
275
+ xformers_attention:
276
+ flash_attention: true
277
+
278
+ warmup_steps: 100
279
+ eval_steps: 0.2
280
+ eval_table_size:
281
+ eval_table_max_new_tokens: 128
282
+ save_steps:
283
+ debug:
284
+ deepspeed:
285
+ weight_decay: 0.0
286
+ fsdp:
287
+ fsdp_config:
288
+ special_tokens:
289
+ ```
290
+
291
+ </details><br>
292
+
293
+ # Sailor-7B-toba
294
+
295
+ This model is a fine-tuned version of [sail/Sailor-7B](https://huggingface.co/sail/Sailor-7B) on the `echonettobatrain.jsonl` dataset referenced in the axolotl config above.
296
+ It achieves the following results on the evaluation set:
297
+ - Loss: 1.3876
298
+
299
+ ## Model description
300
+
301
+ More information needed
302
+
303
+ ## Intended uses & limitations
304
+
305
+ More information needed
306
+
307
+ ## Training and evaluation data
308
+
309
+ More information needed
310
+
311
+ ## Training procedure
312
+
313
+ ### Training hyperparameters
314
+
315
+ The following hyperparameters were used during training:
316
+ - learning_rate: 0.00025
317
+ - train_batch_size: 2
318
+ - eval_batch_size: 2
319
+ - seed: 42
320
+ - gradient_accumulation_steps: 2
321
+ - total_train_batch_size: 4
322
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
323
+ - lr_scheduler_type: cosine
324
+ - lr_scheduler_warmup_steps: 100
325
+ - num_epochs: 3
326
+
327
+ ### Training results
328
+
329
+ | Training Loss | Epoch | Step | Validation Loss |
330
+ |:-------------:|:-----:|:----:|:---------------:|
331
+ | 5.0998 | 0.0 | 1 | 5.1501 |
332
+ | 1.3477 | 0.6 | 622 | 1.6304 |
333
+ | 1.268 | 1.2 | 1244 | 1.4755 |
334
+ | 0.8714 | 1.8 | 1866 | 1.2799 |
335
+ | 0.4408 | 2.4 | 2488 | 1.3876 |
336
+
337
+
338
+ ### Framework versions
339
+
340
+ - PEFT 0.9.1.dev0
341
+ - Transformers 4.39.0.dev0
342
+ - Pytorch 2.1.2+cu118
343
+ - Datasets 2.18.0
344
+ - Tokenizers 0.15.0
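
The generated card stops at the framework versions and does not include a usage snippet. Below is a minimal inference sketch, not part of the committed files: it assumes the adapter is published under the repo id `danwils/Sailor-7B-toba` (inferred from the commit author and the `output_dir` above) and that the standard `transformers` + `peft` loading path applies; the ChatML-style prompt format is likewise an assumption based on the added tokens further down. Substitute a local path or the real repo id as needed.

```python
# Sketch only: load sail/Sailor-7B and attach this QLoRA adapter for inference.
# The adapter repo id and prompt format below are assumptions, not confirmed
# by this commit; a local directory with adapter_config.json also works.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

BASE_ID = "sail/Sailor-7B"
ADAPTER_ID = "danwils/Sailor-7B-toba"  # assumption: published adapter repo id

tokenizer = AutoTokenizer.from_pretrained(BASE_ID)
base = AutoModelForCausalLM.from_pretrained(
    BASE_ID,
    torch_dtype=torch.bfloat16,  # the run above trained in bf16
    device_map="auto",
)
model = PeftModel.from_pretrained(base, ADAPTER_ID)  # attach the LoRA weights
model.eval()

# ChatML-style prompt (format assumed from the added special tokens)
prompt = "<|im_start|>user\nHalo!<|im_end|>\n<|im_start|>assistant\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

For deployment without the `peft` dependency, `model.merge_and_unload()` folds the LoRA weights into the base model, which can then be saved with `save_pretrained`.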
adapter_config.json ADDED
@@ -0,0 +1,219 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "sail/Sailor-7B",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.05,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 64,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "layers.3.self_attn.v_proj",
23
+ "layers.30.self_attn.k_proj",
24
+ "layers.12.self_attn.o_proj",
25
+ "layers.3.mlp.up_proj",
26
+ "layers.1.self_attn.k_proj",
27
+ "layers.10.self_attn.o_proj",
28
+ "layers.28.self_attn.o_proj",
29
+ "layers.4.self_attn.v_proj",
30
+ "layers.3.self_attn.o_proj",
31
+ "layers.10.mlp.down_proj",
32
+ "layers.6.self_attn.o_proj",
33
+ "layers.1.self_attn.q_proj",
34
+ "layers.15.self_attn.q_proj",
35
+ "layers.7.mlp.up_proj",
36
+ "layers.31.mlp.up_proj",
37
+ "layers.27.self_attn.k_proj",
38
+ "layers.31.self_attn.k_proj",
39
+ "layers.30.mlp.down_proj",
40
+ "layers.17.mlp.down_proj",
41
+ "layers.14.self_attn.v_proj",
42
+ "layers.28.mlp.down_proj",
43
+ "layers.9.mlp.down_proj",
44
+ "layers.31.mlp.down_proj",
45
+ "layers.5.self_attn.o_proj",
46
+ "layers.6.mlp.down_proj",
47
+ "layers.17.mlp.up_proj",
48
+ "layers.26.self_attn.v_proj",
49
+ "layers.2.self_attn.v_proj",
50
+ "layers.7.self_attn.o_proj",
51
+ "layers.2.mlp.down_proj",
52
+ "layers.6.self_attn.v_proj",
53
+ "layers.18.self_attn.v_proj",
54
+ "layers.18.mlp.up_proj",
55
+ "layers.25.self_attn.v_proj",
56
+ "layers.0.self_attn.k_proj",
57
+ "layers.13.self_attn.v_proj",
58
+ "layers.10.self_attn.k_proj",
59
+ "layers.27.self_attn.o_proj",
60
+ "layers.0.self_attn.v_proj",
61
+ "layers.23.mlp.down_proj",
62
+ "layers.11.self_attn.v_proj",
63
+ "layers.30.mlp.up_proj",
64
+ "layers.20.self_attn.o_proj",
65
+ "layers.15.mlp.up_proj",
66
+ "layers.4.self_attn.k_proj",
67
+ "layers.9.mlp.up_proj",
68
+ "layers.29.self_attn.o_proj",
69
+ "layers.5.mlp.down_proj",
70
+ "layers.21.self_attn.q_proj",
71
+ "layers.16.self_attn.o_proj",
72
+ "layers.12.self_attn.v_proj",
73
+ "layers.14.self_attn.q_proj",
74
+ "layers.29.self_attn.v_proj",
75
+ "layers.16.self_attn.v_proj",
76
+ "layers.28.self_attn.k_proj",
77
+ "layers.13.self_attn.o_proj",
78
+ "layers.13.self_attn.k_proj",
79
+ "layers.5.self_attn.v_proj",
80
+ "layers.7.self_attn.q_proj",
81
+ "layers.13.mlp.up_proj",
82
+ "layers.25.mlp.up_proj",
83
+ "layers.8.mlp.down_proj",
84
+ "layers.19.self_attn.k_proj",
85
+ "layers.26.self_attn.o_proj",
86
+ "layers.14.mlp.up_proj",
87
+ "layers.6.self_attn.k_proj",
88
+ "layers.23.self_attn.v_proj",
89
+ "layers.3.self_attn.q_proj",
90
+ "layers.1.mlp.up_proj",
91
+ "layers.26.mlp.down_proj",
92
+ "layers.7.self_attn.k_proj",
93
+ "layers.26.mlp.up_proj",
94
+ "layers.12.self_attn.q_proj",
95
+ "layers.21.self_attn.v_proj",
96
+ "layers.11.mlp.down_proj",
97
+ "layers.14.self_attn.o_proj",
98
+ "layers.8.self_attn.k_proj",
99
+ "layers.19.self_attn.q_proj",
100
+ "layers.23.self_attn.o_proj",
101
+ "layers.8.mlp.up_proj",
102
+ "layers.11.mlp.up_proj",
103
+ "layers.25.self_attn.q_proj",
104
+ "layers.29.mlp.up_proj",
105
+ "layers.20.self_attn.v_proj",
106
+ "layers.18.self_attn.o_proj",
107
+ "layers.17.self_attn.k_proj",
108
+ "layers.0.self_attn.o_proj",
109
+ "layers.1.self_attn.v_proj",
110
+ "layers.7.mlp.down_proj",
111
+ "layers.5.self_attn.k_proj",
112
+ "layers.29.self_attn.k_proj",
113
+ "layers.21.self_attn.o_proj",
114
+ "layers.20.self_attn.q_proj",
115
+ "layers.24.mlp.up_proj",
116
+ "layers.4.self_attn.q_proj",
117
+ "layers.22.mlp.down_proj",
118
+ "layers.14.self_attn.k_proj",
119
+ "layers.0.mlp.up_proj",
120
+ "layers.15.self_attn.o_proj",
121
+ "layers.18.mlp.down_proj",
122
+ "layers.8.self_attn.o_proj",
123
+ "layers.2.mlp.up_proj",
124
+ "layers.4.mlp.up_proj",
125
+ "layers.6.self_attn.q_proj",
126
+ "layers.16.mlp.up_proj",
127
+ "layers.23.mlp.up_proj",
128
+ "layers.12.mlp.up_proj",
129
+ "layers.30.self_attn.v_proj",
130
+ "layers.5.self_attn.q_proj",
131
+ "layers.25.self_attn.k_proj",
132
+ "layers.4.self_attn.o_proj",
133
+ "layers.0.self_attn.q_proj",
134
+ "layers.11.self_attn.q_proj",
135
+ "layers.3.mlp.down_proj",
136
+ "layers.27.mlp.down_proj",
137
+ "layers.27.self_attn.q_proj",
138
+ "layers.15.mlp.down_proj",
139
+ "layers.12.self_attn.k_proj",
140
+ "layers.16.self_attn.k_proj",
141
+ "layers.13.mlp.down_proj",
142
+ "layers.9.self_attn.q_proj",
143
+ "layers.13.self_attn.q_proj",
144
+ "layers.12.mlp.down_proj",
145
+ "layers.22.self_attn.o_proj",
146
+ "layers.22.self_attn.q_proj",
147
+ "layers.1.self_attn.o_proj",
148
+ "layers.30.self_attn.q_proj",
149
+ "layers.24.self_attn.k_proj",
150
+ "layers.19.mlp.up_proj",
151
+ "layers.10.mlp.up_proj",
152
+ "layers.28.self_attn.v_proj",
153
+ "layers.31.self_attn.o_proj",
154
+ "layers.15.self_attn.v_proj",
155
+ "layers.22.self_attn.v_proj",
156
+ "layers.16.self_attn.q_proj",
157
+ "layers.27.mlp.up_proj",
158
+ "layers.20.self_attn.k_proj",
159
+ "layers.17.self_attn.o_proj",
160
+ "layers.29.self_attn.q_proj",
161
+ "layers.29.mlp.down_proj",
162
+ "layers.4.mlp.down_proj",
163
+ "layers.8.self_attn.q_proj",
164
+ "layers.25.mlp.down_proj",
165
+ "layers.11.self_attn.k_proj",
166
+ "layers.17.self_attn.q_proj",
167
+ "layers.2.self_attn.q_proj",
168
+ "layers.5.mlp.up_proj",
169
+ "layers.0.mlp.down_proj",
170
+ "layers.2.self_attn.o_proj",
171
+ "layers.16.mlp.down_proj",
172
+ "layers.20.mlp.up_proj",
173
+ "layers.27.self_attn.v_proj",
174
+ "layers.22.self_attn.k_proj",
175
+ "layers.26.self_attn.k_proj",
176
+ "layers.7.self_attn.v_proj",
177
+ "layers.9.self_attn.v_proj",
178
+ "layers.31.self_attn.v_proj",
179
+ "layers.3.self_attn.k_proj",
180
+ "layers.15.self_attn.k_proj",
181
+ "layers.19.self_attn.v_proj",
182
+ "layers.21.mlp.down_proj",
183
+ "layers.9.self_attn.k_proj",
184
+ "layers.19.self_attn.o_proj",
185
+ "layers.10.self_attn.v_proj",
186
+ "layers.9.self_attn.o_proj",
187
+ "layers.25.self_attn.o_proj",
188
+ "layers.24.mlp.down_proj",
189
+ "layers.31.self_attn.q_proj",
190
+ "layers.10.self_attn.q_proj",
191
+ "layers.18.self_attn.q_proj",
192
+ "layers.19.mlp.down_proj",
193
+ "layers.21.self_attn.k_proj",
194
+ "layers.20.mlp.down_proj",
195
+ "layers.24.self_attn.o_proj",
196
+ "layers.26.self_attn.q_proj",
197
+ "layers.11.self_attn.o_proj",
198
+ "layers.14.mlp.down_proj",
199
+ "layers.24.self_attn.q_proj",
200
+ "layers.28.mlp.up_proj",
201
+ "layers.23.self_attn.k_proj",
202
+ "layers.6.mlp.up_proj",
203
+ "layers.22.mlp.up_proj",
204
+ "layers.24.self_attn.v_proj",
205
+ "layers.30.self_attn.o_proj",
206
+ "layers.1.mlp.down_proj",
207
+ "layers.17.self_attn.v_proj",
208
+ "layers.18.self_attn.k_proj",
209
+ "layers.28.self_attn.q_proj",
210
+ "layers.21.mlp.up_proj",
211
+ "lm.head",
212
+ "layers.8.self_attn.v_proj",
213
+ "layers.23.self_attn.q_proj",
214
+ "layers.2.self_attn.k_proj"
215
+ ],
216
+ "task_type": "CAUSAL_LM",
217
+ "use_dora": false,
218
+ "use_rslora": false
219
+ }
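
As a quick sanity check (not part of this commit), the saved configuration can be loaded with `peft` to confirm the rank, alpha, and the per-layer projection list before attaching the adapter; the repo id is again the assumed one from above, and a local path to this folder works as well.

```python
# Sketch: inspect the committed adapter_config.json via peft.
from peft import PeftConfig

# assumption: published repo id; a local checkout of this repo also works
cfg = PeftConfig.from_pretrained("danwils/Sailor-7B-toba")

print(cfg.base_model_name_or_path)  # sail/Sailor-7B
print(cfg.r, cfg.lora_alpha)        # 64, 16
print(len(cfg.target_modules))      # 193: per-layer q/k/v/o and mlp up/down projections plus lm.head
```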
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58df9a56c186e2463bd3ed8be94206423219163cfe53ffff3f0a9777e6b51752
3
+ size 516038346
added_tokens.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
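
added_tokens.json registers the ChatML control tokens used by the Qwen/Sailor family. Assuming the sharegpt training data was rendered with the standard ChatML template (the template itself is not shown in this commit), a prompt for the fine-tuned adapter would be assembled roughly as in the sketch below.

```python
# Sketch of ChatML prompt construction with the tokens added above.
# The exact template used during training is an assumption, not confirmed here.
def build_chatml_prompt(system: str, user: str) -> str:
    return (
        f"<|im_start|>system\n{system}<|im_end|>\n"
        f"<|im_start|>user\n{user}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

print(build_chatml_prompt("You are a helpful assistant.", "Halo!"))
# Generation should stop at <|im_end|> (id 151645) or <|endoftext|> (id 151643).
```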
checkpoint-1036/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ library_name: peft
3
+ base_model: sail/Sailor-7B
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.9.1.dev0
checkpoint-1036/adapter_config.json ADDED
@@ -0,0 +1,219 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "sail/Sailor-7B",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.05,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 64,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "layers.3.self_attn.v_proj",
23
+ "layers.30.self_attn.k_proj",
24
+ "layers.12.self_attn.o_proj",
25
+ "layers.3.mlp.up_proj",
26
+ "layers.1.self_attn.k_proj",
27
+ "layers.10.self_attn.o_proj",
28
+ "layers.28.self_attn.o_proj",
29
+ "layers.4.self_attn.v_proj",
30
+ "layers.3.self_attn.o_proj",
31
+ "layers.10.mlp.down_proj",
32
+ "layers.6.self_attn.o_proj",
33
+ "layers.1.self_attn.q_proj",
34
+ "layers.15.self_attn.q_proj",
35
+ "layers.7.mlp.up_proj",
36
+ "layers.31.mlp.up_proj",
37
+ "layers.27.self_attn.k_proj",
38
+ "layers.31.self_attn.k_proj",
39
+ "layers.30.mlp.down_proj",
40
+ "layers.17.mlp.down_proj",
41
+ "layers.14.self_attn.v_proj",
42
+ "layers.28.mlp.down_proj",
43
+ "layers.9.mlp.down_proj",
44
+ "layers.31.mlp.down_proj",
45
+ "layers.5.self_attn.o_proj",
46
+ "layers.6.mlp.down_proj",
47
+ "layers.17.mlp.up_proj",
48
+ "layers.26.self_attn.v_proj",
49
+ "layers.2.self_attn.v_proj",
50
+ "layers.7.self_attn.o_proj",
51
+ "layers.2.mlp.down_proj",
52
+ "layers.6.self_attn.v_proj",
53
+ "layers.18.self_attn.v_proj",
54
+ "layers.18.mlp.up_proj",
55
+ "layers.25.self_attn.v_proj",
56
+ "layers.0.self_attn.k_proj",
57
+ "layers.13.self_attn.v_proj",
58
+ "layers.10.self_attn.k_proj",
59
+ "layers.27.self_attn.o_proj",
60
+ "layers.0.self_attn.v_proj",
61
+ "layers.23.mlp.down_proj",
62
+ "layers.11.self_attn.v_proj",
63
+ "layers.30.mlp.up_proj",
64
+ "layers.20.self_attn.o_proj",
65
+ "layers.15.mlp.up_proj",
66
+ "layers.4.self_attn.k_proj",
67
+ "layers.9.mlp.up_proj",
68
+ "layers.29.self_attn.o_proj",
69
+ "layers.5.mlp.down_proj",
70
+ "layers.21.self_attn.q_proj",
71
+ "layers.16.self_attn.o_proj",
72
+ "layers.12.self_attn.v_proj",
73
+ "layers.14.self_attn.q_proj",
74
+ "layers.29.self_attn.v_proj",
75
+ "layers.16.self_attn.v_proj",
76
+ "layers.28.self_attn.k_proj",
77
+ "layers.13.self_attn.o_proj",
78
+ "layers.13.self_attn.k_proj",
79
+ "layers.5.self_attn.v_proj",
80
+ "layers.7.self_attn.q_proj",
81
+ "layers.13.mlp.up_proj",
82
+ "layers.25.mlp.up_proj",
83
+ "layers.8.mlp.down_proj",
84
+ "layers.19.self_attn.k_proj",
85
+ "layers.26.self_attn.o_proj",
86
+ "layers.14.mlp.up_proj",
87
+ "layers.6.self_attn.k_proj",
88
+ "layers.23.self_attn.v_proj",
89
+ "layers.3.self_attn.q_proj",
90
+ "layers.1.mlp.up_proj",
91
+ "layers.26.mlp.down_proj",
92
+ "layers.7.self_attn.k_proj",
93
+ "layers.26.mlp.up_proj",
94
+ "layers.12.self_attn.q_proj",
95
+ "layers.21.self_attn.v_proj",
96
+ "layers.11.mlp.down_proj",
97
+ "layers.14.self_attn.o_proj",
98
+ "layers.8.self_attn.k_proj",
99
+ "layers.19.self_attn.q_proj",
100
+ "layers.23.self_attn.o_proj",
101
+ "layers.8.mlp.up_proj",
102
+ "layers.11.mlp.up_proj",
103
+ "layers.25.self_attn.q_proj",
104
+ "layers.29.mlp.up_proj",
105
+ "layers.20.self_attn.v_proj",
106
+ "layers.18.self_attn.o_proj",
107
+ "layers.17.self_attn.k_proj",
108
+ "layers.0.self_attn.o_proj",
109
+ "layers.1.self_attn.v_proj",
110
+ "layers.7.mlp.down_proj",
111
+ "layers.5.self_attn.k_proj",
112
+ "layers.29.self_attn.k_proj",
113
+ "layers.21.self_attn.o_proj",
114
+ "layers.20.self_attn.q_proj",
115
+ "layers.24.mlp.up_proj",
116
+ "layers.4.self_attn.q_proj",
117
+ "layers.22.mlp.down_proj",
118
+ "layers.14.self_attn.k_proj",
119
+ "layers.0.mlp.up_proj",
120
+ "layers.15.self_attn.o_proj",
121
+ "layers.18.mlp.down_proj",
122
+ "layers.8.self_attn.o_proj",
123
+ "layers.2.mlp.up_proj",
124
+ "layers.4.mlp.up_proj",
125
+ "layers.6.self_attn.q_proj",
126
+ "layers.16.mlp.up_proj",
127
+ "layers.23.mlp.up_proj",
128
+ "layers.12.mlp.up_proj",
129
+ "layers.30.self_attn.v_proj",
130
+ "layers.5.self_attn.q_proj",
131
+ "layers.25.self_attn.k_proj",
132
+ "layers.4.self_attn.o_proj",
133
+ "layers.0.self_attn.q_proj",
134
+ "layers.11.self_attn.q_proj",
135
+ "layers.3.mlp.down_proj",
136
+ "layers.27.mlp.down_proj",
137
+ "layers.27.self_attn.q_proj",
138
+ "layers.15.mlp.down_proj",
139
+ "layers.12.self_attn.k_proj",
140
+ "layers.16.self_attn.k_proj",
141
+ "layers.13.mlp.down_proj",
142
+ "layers.9.self_attn.q_proj",
143
+ "layers.13.self_attn.q_proj",
144
+ "layers.12.mlp.down_proj",
145
+ "layers.22.self_attn.o_proj",
146
+ "layers.22.self_attn.q_proj",
147
+ "layers.1.self_attn.o_proj",
148
+ "layers.30.self_attn.q_proj",
149
+ "layers.24.self_attn.k_proj",
150
+ "layers.19.mlp.up_proj",
151
+ "layers.10.mlp.up_proj",
152
+ "layers.28.self_attn.v_proj",
153
+ "layers.31.self_attn.o_proj",
154
+ "layers.15.self_attn.v_proj",
155
+ "layers.22.self_attn.v_proj",
156
+ "layers.16.self_attn.q_proj",
157
+ "layers.27.mlp.up_proj",
158
+ "layers.20.self_attn.k_proj",
159
+ "layers.17.self_attn.o_proj",
160
+ "layers.29.self_attn.q_proj",
161
+ "layers.29.mlp.down_proj",
162
+ "layers.4.mlp.down_proj",
163
+ "layers.8.self_attn.q_proj",
164
+ "layers.25.mlp.down_proj",
165
+ "layers.11.self_attn.k_proj",
166
+ "layers.17.self_attn.q_proj",
167
+ "layers.2.self_attn.q_proj",
168
+ "layers.5.mlp.up_proj",
169
+ "layers.0.mlp.down_proj",
170
+ "layers.2.self_attn.o_proj",
171
+ "layers.16.mlp.down_proj",
172
+ "layers.20.mlp.up_proj",
173
+ "layers.27.self_attn.v_proj",
174
+ "layers.22.self_attn.k_proj",
175
+ "layers.26.self_attn.k_proj",
176
+ "layers.7.self_attn.v_proj",
177
+ "layers.9.self_attn.v_proj",
178
+ "layers.31.self_attn.v_proj",
179
+ "layers.3.self_attn.k_proj",
180
+ "layers.15.self_attn.k_proj",
181
+ "layers.19.self_attn.v_proj",
182
+ "layers.21.mlp.down_proj",
183
+ "layers.9.self_attn.k_proj",
184
+ "layers.19.self_attn.o_proj",
185
+ "layers.10.self_attn.v_proj",
186
+ "layers.9.self_attn.o_proj",
187
+ "layers.25.self_attn.o_proj",
188
+ "layers.24.mlp.down_proj",
189
+ "layers.31.self_attn.q_proj",
190
+ "layers.10.self_attn.q_proj",
191
+ "layers.18.self_attn.q_proj",
192
+ "layers.19.mlp.down_proj",
193
+ "layers.21.self_attn.k_proj",
194
+ "layers.20.mlp.down_proj",
195
+ "layers.24.self_attn.o_proj",
196
+ "layers.26.self_attn.q_proj",
197
+ "layers.11.self_attn.o_proj",
198
+ "layers.14.mlp.down_proj",
199
+ "layers.24.self_attn.q_proj",
200
+ "layers.28.mlp.up_proj",
201
+ "layers.23.self_attn.k_proj",
202
+ "layers.6.mlp.up_proj",
203
+ "layers.22.mlp.up_proj",
204
+ "layers.24.self_attn.v_proj",
205
+ "layers.30.self_attn.o_proj",
206
+ "layers.1.mlp.down_proj",
207
+ "layers.17.self_attn.v_proj",
208
+ "layers.18.self_attn.k_proj",
209
+ "layers.28.self_attn.q_proj",
210
+ "layers.21.mlp.up_proj",
211
+ "lm.head",
212
+ "layers.8.self_attn.v_proj",
213
+ "layers.23.self_attn.q_proj",
214
+ "layers.2.self_attn.k_proj"
215
+ ],
216
+ "task_type": "CAUSAL_LM",
217
+ "use_dora": false,
218
+ "use_rslora": false
219
+ }
checkpoint-1036/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01b05dc849debef9d2fa3d53eb605c9ef6717dc69515f123150e532319ffcc78
3
+ size 515951312
checkpoint-1036/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9740c97011df0478673668cd53ef61cc4884abb165f3a2681fa62d43f6b063c7
3
+ size 258900244
checkpoint-1036/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d767e07218851864a3e29461588b6dfafd7c9c108116d6d6a78ac0df4990f92c
3
+ size 14244
checkpoint-1036/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd4e9e65b91e22f9fcd460c806a30334c2b2d47322a24d41e33a5bca2987066e
3
+ size 1064
checkpoint-1036/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1036/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fa70061cdbb7600b8f24d24febb8ee89ee01acb653af286c3ad5a4afa0f96e6
3
+ size 5560
checkpoint-2072/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ library_name: peft
3
+ base_model: sail/Sailor-7B
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.9.1.dev0
checkpoint-2072/adapter_config.json ADDED
@@ -0,0 +1,219 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "sail/Sailor-7B",
5
+ "bias": "none",
6
+ "fan_in_fan_out": null,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "loftq_config": {},
12
+ "lora_alpha": 16,
13
+ "lora_dropout": 0.05,
14
+ "megatron_config": null,
15
+ "megatron_core": "megatron.core",
16
+ "modules_to_save": null,
17
+ "peft_type": "LORA",
18
+ "r": 64,
19
+ "rank_pattern": {},
20
+ "revision": null,
21
+ "target_modules": [
22
+ "layers.3.self_attn.v_proj",
23
+ "layers.30.self_attn.k_proj",
24
+ "layers.12.self_attn.o_proj",
25
+ "layers.3.mlp.up_proj",
26
+ "layers.1.self_attn.k_proj",
27
+ "layers.10.self_attn.o_proj",
28
+ "layers.28.self_attn.o_proj",
29
+ "layers.4.self_attn.v_proj",
30
+ "layers.3.self_attn.o_proj",
31
+ "layers.10.mlp.down_proj",
32
+ "layers.6.self_attn.o_proj",
33
+ "layers.1.self_attn.q_proj",
34
+ "layers.15.self_attn.q_proj",
35
+ "layers.7.mlp.up_proj",
36
+ "layers.31.mlp.up_proj",
37
+ "layers.27.self_attn.k_proj",
38
+ "layers.31.self_attn.k_proj",
39
+ "layers.30.mlp.down_proj",
40
+ "layers.17.mlp.down_proj",
41
+ "layers.14.self_attn.v_proj",
42
+ "layers.28.mlp.down_proj",
43
+ "layers.9.mlp.down_proj",
44
+ "layers.31.mlp.down_proj",
45
+ "layers.5.self_attn.o_proj",
46
+ "layers.6.mlp.down_proj",
47
+ "layers.17.mlp.up_proj",
48
+ "layers.26.self_attn.v_proj",
49
+ "layers.2.self_attn.v_proj",
50
+ "layers.7.self_attn.o_proj",
51
+ "layers.2.mlp.down_proj",
52
+ "layers.6.self_attn.v_proj",
53
+ "layers.18.self_attn.v_proj",
54
+ "layers.18.mlp.up_proj",
55
+ "layers.25.self_attn.v_proj",
56
+ "layers.0.self_attn.k_proj",
57
+ "layers.13.self_attn.v_proj",
58
+ "layers.10.self_attn.k_proj",
59
+ "layers.27.self_attn.o_proj",
60
+ "layers.0.self_attn.v_proj",
61
+ "layers.23.mlp.down_proj",
62
+ "layers.11.self_attn.v_proj",
63
+ "layers.30.mlp.up_proj",
64
+ "layers.20.self_attn.o_proj",
65
+ "layers.15.mlp.up_proj",
66
+ "layers.4.self_attn.k_proj",
67
+ "layers.9.mlp.up_proj",
68
+ "layers.29.self_attn.o_proj",
69
+ "layers.5.mlp.down_proj",
70
+ "layers.21.self_attn.q_proj",
71
+ "layers.16.self_attn.o_proj",
72
+ "layers.12.self_attn.v_proj",
73
+ "layers.14.self_attn.q_proj",
74
+ "layers.29.self_attn.v_proj",
75
+ "layers.16.self_attn.v_proj",
76
+ "layers.28.self_attn.k_proj",
77
+ "layers.13.self_attn.o_proj",
78
+ "layers.13.self_attn.k_proj",
79
+ "layers.5.self_attn.v_proj",
80
+ "layers.7.self_attn.q_proj",
81
+ "layers.13.mlp.up_proj",
82
+ "layers.25.mlp.up_proj",
83
+ "layers.8.mlp.down_proj",
84
+ "layers.19.self_attn.k_proj",
85
+ "layers.26.self_attn.o_proj",
86
+ "layers.14.mlp.up_proj",
87
+ "layers.6.self_attn.k_proj",
88
+ "layers.23.self_attn.v_proj",
89
+ "layers.3.self_attn.q_proj",
90
+ "layers.1.mlp.up_proj",
91
+ "layers.26.mlp.down_proj",
92
+ "layers.7.self_attn.k_proj",
93
+ "layers.26.mlp.up_proj",
94
+ "layers.12.self_attn.q_proj",
95
+ "layers.21.self_attn.v_proj",
96
+ "layers.11.mlp.down_proj",
97
+ "layers.14.self_attn.o_proj",
98
+ "layers.8.self_attn.k_proj",
99
+ "layers.19.self_attn.q_proj",
100
+ "layers.23.self_attn.o_proj",
101
+ "layers.8.mlp.up_proj",
102
+ "layers.11.mlp.up_proj",
103
+ "layers.25.self_attn.q_proj",
104
+ "layers.29.mlp.up_proj",
105
+ "layers.20.self_attn.v_proj",
106
+ "layers.18.self_attn.o_proj",
107
+ "layers.17.self_attn.k_proj",
108
+ "layers.0.self_attn.o_proj",
109
+ "layers.1.self_attn.v_proj",
110
+ "layers.7.mlp.down_proj",
111
+ "layers.5.self_attn.k_proj",
112
+ "layers.29.self_attn.k_proj",
113
+ "layers.21.self_attn.o_proj",
114
+ "layers.20.self_attn.q_proj",
115
+ "layers.24.mlp.up_proj",
116
+ "layers.4.self_attn.q_proj",
117
+ "layers.22.mlp.down_proj",
118
+ "layers.14.self_attn.k_proj",
119
+ "layers.0.mlp.up_proj",
120
+ "layers.15.self_attn.o_proj",
121
+ "layers.18.mlp.down_proj",
122
+ "layers.8.self_attn.o_proj",
123
+ "layers.2.mlp.up_proj",
124
+ "layers.4.mlp.up_proj",
125
+ "layers.6.self_attn.q_proj",
126
+ "layers.16.mlp.up_proj",
127
+ "layers.23.mlp.up_proj",
128
+ "layers.12.mlp.up_proj",
129
+ "layers.30.self_attn.v_proj",
130
+ "layers.5.self_attn.q_proj",
131
+ "layers.25.self_attn.k_proj",
132
+ "layers.4.self_attn.o_proj",
133
+ "layers.0.self_attn.q_proj",
134
+ "layers.11.self_attn.q_proj",
135
+ "layers.3.mlp.down_proj",
136
+ "layers.27.mlp.down_proj",
137
+ "layers.27.self_attn.q_proj",
138
+ "layers.15.mlp.down_proj",
139
+ "layers.12.self_attn.k_proj",
140
+ "layers.16.self_attn.k_proj",
141
+ "layers.13.mlp.down_proj",
142
+ "layers.9.self_attn.q_proj",
143
+ "layers.13.self_attn.q_proj",
144
+ "layers.12.mlp.down_proj",
145
+ "layers.22.self_attn.o_proj",
146
+ "layers.22.self_attn.q_proj",
147
+ "layers.1.self_attn.o_proj",
148
+ "layers.30.self_attn.q_proj",
149
+ "layers.24.self_attn.k_proj",
150
+ "layers.19.mlp.up_proj",
151
+ "layers.10.mlp.up_proj",
152
+ "layers.28.self_attn.v_proj",
153
+ "layers.31.self_attn.o_proj",
154
+ "layers.15.self_attn.v_proj",
155
+ "layers.22.self_attn.v_proj",
156
+ "layers.16.self_attn.q_proj",
157
+ "layers.27.mlp.up_proj",
158
+ "layers.20.self_attn.k_proj",
159
+ "layers.17.self_attn.o_proj",
160
+ "layers.29.self_attn.q_proj",
161
+ "layers.29.mlp.down_proj",
162
+ "layers.4.mlp.down_proj",
163
+ "layers.8.self_attn.q_proj",
164
+ "layers.25.mlp.down_proj",
165
+ "layers.11.self_attn.k_proj",
166
+ "layers.17.self_attn.q_proj",
167
+ "layers.2.self_attn.q_proj",
168
+ "layers.5.mlp.up_proj",
169
+ "layers.0.mlp.down_proj",
170
+ "layers.2.self_attn.o_proj",
171
+ "layers.16.mlp.down_proj",
172
+ "layers.20.mlp.up_proj",
173
+ "layers.27.self_attn.v_proj",
174
+ "layers.22.self_attn.k_proj",
175
+ "layers.26.self_attn.k_proj",
176
+ "layers.7.self_attn.v_proj",
177
+ "layers.9.self_attn.v_proj",
178
+ "layers.31.self_attn.v_proj",
179
+ "layers.3.self_attn.k_proj",
180
+ "layers.15.self_attn.k_proj",
181
+ "layers.19.self_attn.v_proj",
182
+ "layers.21.mlp.down_proj",
183
+ "layers.9.self_attn.k_proj",
184
+ "layers.19.self_attn.o_proj",
185
+ "layers.10.self_attn.v_proj",
186
+ "layers.9.self_attn.o_proj",
187
+ "layers.25.self_attn.o_proj",
188
+ "layers.24.mlp.down_proj",
189
+ "layers.31.self_attn.q_proj",
190
+ "layers.10.self_attn.q_proj",
191
+ "layers.18.self_attn.q_proj",
192
+ "layers.19.mlp.down_proj",
193
+ "layers.21.self_attn.k_proj",
194
+ "layers.20.mlp.down_proj",
195
+ "layers.24.self_attn.o_proj",
196
+ "layers.26.self_attn.q_proj",
197
+ "layers.11.self_attn.o_proj",
198
+ "layers.14.mlp.down_proj",
199
+ "layers.24.self_attn.q_proj",
200
+ "layers.28.mlp.up_proj",
201
+ "layers.23.self_attn.k_proj",
202
+ "layers.6.mlp.up_proj",
203
+ "layers.22.mlp.up_proj",
204
+ "layers.24.self_attn.v_proj",
205
+ "layers.30.self_attn.o_proj",
206
+ "layers.1.mlp.down_proj",
207
+ "layers.17.self_attn.v_proj",
208
+ "layers.18.self_attn.k_proj",
209
+ "layers.28.self_attn.q_proj",
210
+ "layers.21.mlp.up_proj",
211
+ "lm.head",
212
+ "layers.8.self_attn.v_proj",
213
+ "layers.23.self_attn.q_proj",
214
+ "layers.2.self_attn.k_proj"
215
+ ],
216
+ "task_type": "CAUSAL_LM",
217
+ "use_dora": false,
218
+ "use_rslora": false
219
+ }
checkpoint-2072/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5249e878d0457fb9d6911b5154a443b832af001fb934d6e32d01b69390c81d4f
3
+ size 515951312
checkpoint-2072/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48fa210ffd7d345400f86fb2df26418e676f25786299f937c1db3eb5692034ee
3
+ size 258900244
checkpoint-2072/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fd0d6adc5021f17a6d77ac6b68fb70ca1d9fd5ead58ce37c1721ea2ef66aa83
3
+ size 14244
checkpoint-2072/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:898d015d6b4cc63de52c6917a769d45a319b1d8b3784c078d7dfd8535a874791
3
+ size 1064
checkpoint-2072/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2072/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fa70061cdbb7600b8f24d24febb8ee89ee01acb653af286c3ad5a4afa0f96e6
3
+ size 5560
checkpoint-3108/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ library_name: peft
3
+ base_model: sail/Sailor-7B
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.9.1.dev0
checkpoint-3108/adapter_config.json ADDED
@@ -0,0 +1,219 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "sail/Sailor-7B",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 64,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "layers.3.self_attn.v_proj",
+ "layers.30.self_attn.k_proj",
+ "layers.12.self_attn.o_proj",
+ "layers.3.mlp.up_proj",
+ "layers.1.self_attn.k_proj",
+ "layers.10.self_attn.o_proj",
+ "layers.28.self_attn.o_proj",
+ "layers.4.self_attn.v_proj",
+ "layers.3.self_attn.o_proj",
+ "layers.10.mlp.down_proj",
+ "layers.6.self_attn.o_proj",
+ "layers.1.self_attn.q_proj",
+ "layers.15.self_attn.q_proj",
+ "layers.7.mlp.up_proj",
+ "layers.31.mlp.up_proj",
+ "layers.27.self_attn.k_proj",
+ "layers.31.self_attn.k_proj",
+ "layers.30.mlp.down_proj",
+ "layers.17.mlp.down_proj",
+ "layers.14.self_attn.v_proj",
+ "layers.28.mlp.down_proj",
+ "layers.9.mlp.down_proj",
+ "layers.31.mlp.down_proj",
+ "layers.5.self_attn.o_proj",
+ "layers.6.mlp.down_proj",
+ "layers.17.mlp.up_proj",
+ "layers.26.self_attn.v_proj",
+ "layers.2.self_attn.v_proj",
+ "layers.7.self_attn.o_proj",
+ "layers.2.mlp.down_proj",
+ "layers.6.self_attn.v_proj",
+ "layers.18.self_attn.v_proj",
+ "layers.18.mlp.up_proj",
+ "layers.25.self_attn.v_proj",
+ "layers.0.self_attn.k_proj",
+ "layers.13.self_attn.v_proj",
+ "layers.10.self_attn.k_proj",
+ "layers.27.self_attn.o_proj",
+ "layers.0.self_attn.v_proj",
+ "layers.23.mlp.down_proj",
+ "layers.11.self_attn.v_proj",
+ "layers.30.mlp.up_proj",
+ "layers.20.self_attn.o_proj",
+ "layers.15.mlp.up_proj",
+ "layers.4.self_attn.k_proj",
+ "layers.9.mlp.up_proj",
+ "layers.29.self_attn.o_proj",
+ "layers.5.mlp.down_proj",
+ "layers.21.self_attn.q_proj",
+ "layers.16.self_attn.o_proj",
+ "layers.12.self_attn.v_proj",
+ "layers.14.self_attn.q_proj",
+ "layers.29.self_attn.v_proj",
+ "layers.16.self_attn.v_proj",
+ "layers.28.self_attn.k_proj",
+ "layers.13.self_attn.o_proj",
+ "layers.13.self_attn.k_proj",
+ "layers.5.self_attn.v_proj",
+ "layers.7.self_attn.q_proj",
+ "layers.13.mlp.up_proj",
+ "layers.25.mlp.up_proj",
+ "layers.8.mlp.down_proj",
+ "layers.19.self_attn.k_proj",
+ "layers.26.self_attn.o_proj",
+ "layers.14.mlp.up_proj",
+ "layers.6.self_attn.k_proj",
+ "layers.23.self_attn.v_proj",
+ "layers.3.self_attn.q_proj",
+ "layers.1.mlp.up_proj",
+ "layers.26.mlp.down_proj",
+ "layers.7.self_attn.k_proj",
+ "layers.26.mlp.up_proj",
+ "layers.12.self_attn.q_proj",
+ "layers.21.self_attn.v_proj",
+ "layers.11.mlp.down_proj",
+ "layers.14.self_attn.o_proj",
+ "layers.8.self_attn.k_proj",
+ "layers.19.self_attn.q_proj",
+ "layers.23.self_attn.o_proj",
+ "layers.8.mlp.up_proj",
+ "layers.11.mlp.up_proj",
+ "layers.25.self_attn.q_proj",
+ "layers.29.mlp.up_proj",
+ "layers.20.self_attn.v_proj",
+ "layers.18.self_attn.o_proj",
+ "layers.17.self_attn.k_proj",
+ "layers.0.self_attn.o_proj",
+ "layers.1.self_attn.v_proj",
+ "layers.7.mlp.down_proj",
+ "layers.5.self_attn.k_proj",
+ "layers.29.self_attn.k_proj",
+ "layers.21.self_attn.o_proj",
+ "layers.20.self_attn.q_proj",
+ "layers.24.mlp.up_proj",
+ "layers.4.self_attn.q_proj",
+ "layers.22.mlp.down_proj",
+ "layers.14.self_attn.k_proj",
+ "layers.0.mlp.up_proj",
+ "layers.15.self_attn.o_proj",
+ "layers.18.mlp.down_proj",
+ "layers.8.self_attn.o_proj",
+ "layers.2.mlp.up_proj",
+ "layers.4.mlp.up_proj",
+ "layers.6.self_attn.q_proj",
+ "layers.16.mlp.up_proj",
+ "layers.23.mlp.up_proj",
+ "layers.12.mlp.up_proj",
+ "layers.30.self_attn.v_proj",
+ "layers.5.self_attn.q_proj",
+ "layers.25.self_attn.k_proj",
+ "layers.4.self_attn.o_proj",
+ "layers.0.self_attn.q_proj",
+ "layers.11.self_attn.q_proj",
+ "layers.3.mlp.down_proj",
+ "layers.27.mlp.down_proj",
+ "layers.27.self_attn.q_proj",
+ "layers.15.mlp.down_proj",
+ "layers.12.self_attn.k_proj",
+ "layers.16.self_attn.k_proj",
+ "layers.13.mlp.down_proj",
+ "layers.9.self_attn.q_proj",
+ "layers.13.self_attn.q_proj",
+ "layers.12.mlp.down_proj",
+ "layers.22.self_attn.o_proj",
+ "layers.22.self_attn.q_proj",
+ "layers.1.self_attn.o_proj",
+ "layers.30.self_attn.q_proj",
+ "layers.24.self_attn.k_proj",
+ "layers.19.mlp.up_proj",
+ "layers.10.mlp.up_proj",
+ "layers.28.self_attn.v_proj",
+ "layers.31.self_attn.o_proj",
+ "layers.15.self_attn.v_proj",
+ "layers.22.self_attn.v_proj",
+ "layers.16.self_attn.q_proj",
+ "layers.27.mlp.up_proj",
+ "layers.20.self_attn.k_proj",
+ "layers.17.self_attn.o_proj",
+ "layers.29.self_attn.q_proj",
+ "layers.29.mlp.down_proj",
+ "layers.4.mlp.down_proj",
+ "layers.8.self_attn.q_proj",
+ "layers.25.mlp.down_proj",
+ "layers.11.self_attn.k_proj",
+ "layers.17.self_attn.q_proj",
+ "layers.2.self_attn.q_proj",
+ "layers.5.mlp.up_proj",
+ "layers.0.mlp.down_proj",
+ "layers.2.self_attn.o_proj",
+ "layers.16.mlp.down_proj",
+ "layers.20.mlp.up_proj",
+ "layers.27.self_attn.v_proj",
+ "layers.22.self_attn.k_proj",
+ "layers.26.self_attn.k_proj",
+ "layers.7.self_attn.v_proj",
+ "layers.9.self_attn.v_proj",
+ "layers.31.self_attn.v_proj",
+ "layers.3.self_attn.k_proj",
+ "layers.15.self_attn.k_proj",
+ "layers.19.self_attn.v_proj",
+ "layers.21.mlp.down_proj",
+ "layers.9.self_attn.k_proj",
+ "layers.19.self_attn.o_proj",
+ "layers.10.self_attn.v_proj",
+ "layers.9.self_attn.o_proj",
+ "layers.25.self_attn.o_proj",
+ "layers.24.mlp.down_proj",
+ "layers.31.self_attn.q_proj",
+ "layers.10.self_attn.q_proj",
+ "layers.18.self_attn.q_proj",
+ "layers.19.mlp.down_proj",
+ "layers.21.self_attn.k_proj",
+ "layers.20.mlp.down_proj",
+ "layers.24.self_attn.o_proj",
+ "layers.26.self_attn.q_proj",
+ "layers.11.self_attn.o_proj",
+ "layers.14.mlp.down_proj",
+ "layers.24.self_attn.q_proj",
+ "layers.28.mlp.up_proj",
+ "layers.23.self_attn.k_proj",
+ "layers.6.mlp.up_proj",
+ "layers.22.mlp.up_proj",
+ "layers.24.self_attn.v_proj",
+ "layers.30.self_attn.o_proj",
+ "layers.1.mlp.down_proj",
+ "layers.17.self_attn.v_proj",
+ "layers.18.self_attn.k_proj",
+ "layers.28.self_attn.q_proj",
+ "layers.21.mlp.up_proj",
+ "lm.head",
+ "layers.8.self_attn.v_proj",
+ "layers.23.self_attn.q_proj",
+ "layers.2.self_attn.k_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
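
The adapter config above fully describes the QLoRA adapter (r=64, lora_alpha=16, dropout 0.05, with per-layer attention and MLP projections as targets), so nothing has to be re-specified at load time. The snippet below is a minimal sketch and is not part of this repo; the local `./checkpoint-3108` adapter path is an assumption — point it at wherever the adapter files actually live (or at the Hub repo id).

```python
# Illustrative sketch only (not repo code): attach the LoRA adapter described
# by checkpoint-3108/adapter_config.json to its base model with PEFT.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "sail/Sailor-7B"         # matches "base_model_name_or_path" above
adapter_dir = "./checkpoint-3108"  # assumption: local copy of the adapter files

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(base_id, device_map="auto")

# r, lora_alpha, lora_dropout and target_modules are all read from
# adapter_config.json, so they are not repeated here.
model = PeftModel.from_pretrained(base, adapter_dir)
model.eval()
```
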
checkpoint-3108/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:647d7c7ea41254629468f756b800db8c8b990cbd387cb76ab944a056c8449dbe
+ size 515951312
checkpoint-3108/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3048c513a709a3326e5ddcd2e903784e7957f5ab54aea44536cf7e823a4d4676
+ size 258900244
checkpoint-3108/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aaa9fec4ce6d77cc70ba336c0db662fb1d386591699c634fa7636fcbb2bc84ad
+ size 14244
checkpoint-3108/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ece357ba5008202c6875696f25f985008223cc82e98ab7fc6c1e42dea66cd7dd
+ size 1064
checkpoint-3108/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-3108/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1fa70061cdbb7600b8f24d24febb8ee89ee01acb653af286c3ad5a4afa0f96e6
+ size 5560
config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_name_or_path": "sail/Sailor-7B",
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "eos_token_id": 151643,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 32768,
+ "max_window_layers": 28,
+ "model_type": "qwen2",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 32,
+ "quantization_config": {
+ "_load_in_4bit": true,
+ "_load_in_8bit": false,
+ "bnb_4bit_compute_dtype": "bfloat16",
+ "bnb_4bit_quant_type": "nf4",
+ "bnb_4bit_use_double_quant": true,
+ "llm_int8_enable_fp32_cpu_offload": false,
+ "llm_int8_has_fp16_weight": false,
+ "llm_int8_skip_modules": null,
+ "llm_int8_threshold": 6.0,
+ "load_in_4bit": true,
+ "load_in_8bit": false,
+ "quant_method": "bitsandbytes"
+ },
+ "rms_norm_eps": 1e-06,
+ "rope_theta": 1000000.0,
+ "sliding_window": 32768,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.39.0.dev0",
+ "use_cache": false,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+ }
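
The `quantization_config` block above records that the base weights were loaded through bitsandbytes in 4-bit NF4 with double quantization and bfloat16 compute. As a sketch only (not code shipped with this repo), the same settings map onto a `BitsAndBytesConfig` like this:

```python
# Sketch of a BitsAndBytesConfig mirroring the quantization_config block above;
# illustration only, not repo code.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # "load_in_4bit": true
    bnb_4bit_quant_type="nf4",              # "bnb_4bit_quant_type": "nf4"
    bnb_4bit_use_double_quant=True,         # "bnb_4bit_use_double_quant": true
    bnb_4bit_compute_dtype=torch.bfloat16,  # "bnb_4bit_compute_dtype": "bfloat16"
)

model = AutoModelForCausalLM.from_pretrained(
    "sail/Sailor-7B",
    quantization_config=bnb_config,
    device_map="auto",
)
```
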
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>"
+ ],
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>"
+ ],
+ "bos_token": null,
+ "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "model_max_length": 32768,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
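
The `chat_template` above is the ChatML format used by Qwen2/Sailor: each turn is wrapped in `<|im_start|>role ... <|im_end|>`, and generation is prompted with a bare `<|im_start|>assistant`. A small sketch of applying it (the messages are invented examples, and the local tokenizer path is an assumption):

```python
# Sketch of rendering a conversation with the ChatML chat_template above;
# not repo code, the example messages are made up.
from transformers import AutoTokenizer

# Assumption: run from a local clone containing this tokenizer_config.json.
tokenizer = AutoTokenizer.from_pretrained(".")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Halo, apa kabar?"},
]

prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Halo, apa kabar?<|im_end|>
# <|im_start|>assistant
```
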
vocab.json ADDED
The diff for this file is too large to render. See raw diff