williambarberjr committed
Commit 10e6adc · verified · 1 Parent(s): 7778f85

Upload functionCallingSpectrum.yml with huggingface_hub
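
The commit message notes the file was uploaded with huggingface_hub. A minimal sketch of what such an upload typically looks like; the repo_id below is a placeholder, not the actual destination repo:

from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="functionCallingSpectrum.yml",  # local path to the config
    path_in_repo="functionCallingSpectrum.yml",     # destination path inside the repo
    repo_id="<username>/<repo>",                    # placeholder repo id
    commit_message="Upload functionCallingSpectrum.yml with huggingface_hub",
)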

Files changed (1)
  1. functionCallingSpectrum.yml +227 -0
functionCallingSpectrum.yml ADDED
@@ -0,0 +1,227 @@
base_model: meta-llama/Meta-Llama-3.1-8B
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

trust_remote_code: true

datasets:
  - path: /workspace/datasets/openhermes_200k_unfiltered/Open_Hermes_200k_unfiltered.jsonl
    type: sharegpt
    conversation: chatml
  - path: /workspace/datasets/magpie_function_calling/magpie_function_calling.jsonl
    type: sharegpt
    conversation: chatml

unfrozen_parameters:
- ^lm_head.weight$
- ^model.embed_tokens.weight$
# input_layernorm layers
- model.layers.0.input_layernorm
- model.layers.1.input_layernorm
- model.layers.2.input_layernorm
- model.layers.3.input_layernorm
- model.layers.4.input_layernorm
- model.layers.5.input_layernorm
- model.layers.6.input_layernorm
- model.layers.7.input_layernorm
- model.layers.8.input_layernorm
- model.layers.9.input_layernorm
- model.layers.10.input_layernorm
- model.layers.11.input_layernorm
- model.layers.12.input_layernorm
- model.layers.13.input_layernorm
- model.layers.14.input_layernorm
- model.layers.15.input_layernorm
# lm_head layers
# mlp.down_proj layers
- model.layers.1.mlp.down_proj
- model.layers.0.mlp.down_proj
- model.layers.30.mlp.down_proj
- model.layers.2.mlp.down_proj
- model.layers.21.mlp.down_proj
- model.layers.22.mlp.down_proj
- model.layers.29.mlp.down_proj
- model.layers.5.mlp.down_proj
- model.layers.4.mlp.down_proj
- model.layers.20.mlp.down_proj
- model.layers.23.mlp.down_proj
- model.layers.19.mlp.down_proj
- model.layers.3.mlp.down_proj
- model.layers.17.mlp.down_proj
- model.layers.6.mlp.down_proj
- model.layers.31.mlp.down_proj
# mlp.gate_proj layers
- model.layers.1.mlp.gate_proj
- model.layers.2.mlp.gate_proj
- model.layers.3.mlp.gate_proj
- model.layers.4.mlp.gate_proj
- model.layers.0.mlp.gate_proj
- model.layers.25.mlp.gate_proj
- model.layers.26.mlp.gate_proj
- model.layers.5.mlp.gate_proj
- model.layers.24.mlp.gate_proj
- model.layers.28.mlp.gate_proj
- model.layers.23.mlp.gate_proj
- model.layers.27.mlp.gate_proj
- model.layers.21.mlp.gate_proj
- model.layers.22.mlp.gate_proj
- model.layers.29.mlp.gate_proj
- model.layers.20.mlp.gate_proj
# mlp.up_proj layers
- model.layers.4.mlp.up_proj
- model.layers.3.mlp.up_proj
- model.layers.0.mlp.up_proj
- model.layers.5.mlp.up_proj
- model.layers.7.mlp.up_proj
- model.layers.6.mlp.up_proj
- model.layers.2.mlp.up_proj
- model.layers.1.mlp.up_proj
- model.layers.8.mlp.up_proj
- model.layers.12.mlp.up_proj
- model.layers.14.mlp.up_proj
- model.layers.9.mlp.up_proj
- model.layers.15.mlp.up_proj
- model.layers.17.mlp.up_proj
- model.layers.13.mlp.up_proj
- model.layers.19.mlp.up_proj
# model.embed_tokens layers
# model.norm layers
# post_attention_layernorm layers
- model.layers.0.post_attention_layernorm
- model.layers.1.post_attention_layernorm
- model.layers.2.post_attention_layernorm
- model.layers.3.post_attention_layernorm
- model.layers.4.post_attention_layernorm
- model.layers.5.post_attention_layernorm
- model.layers.6.post_attention_layernorm
- model.layers.7.post_attention_layernorm
- model.layers.8.post_attention_layernorm
- model.layers.9.post_attention_layernorm
- model.layers.10.post_attention_layernorm
- model.layers.11.post_attention_layernorm
- model.layers.12.post_attention_layernorm
- model.layers.13.post_attention_layernorm
- model.layers.14.post_attention_layernorm
- model.layers.15.post_attention_layernorm
# self_attn.k_proj layers
- model.layers.29.self_attn.k_proj
- model.layers.25.self_attn.k_proj
- model.layers.23.self_attn.k_proj
- model.layers.28.self_attn.k_proj
- model.layers.21.self_attn.k_proj
- model.layers.19.self_attn.k_proj
- model.layers.22.self_attn.k_proj
- model.layers.20.self_attn.k_proj
- model.layers.24.self_attn.k_proj
- model.layers.31.self_attn.k_proj
- model.layers.27.self_attn.k_proj
- model.layers.26.self_attn.k_proj
- model.layers.17.self_attn.k_proj
- model.layers.11.self_attn.k_proj
- model.layers.18.self_attn.k_proj
- model.layers.14.self_attn.k_proj
# self_attn.o_proj layers
- model.layers.14.self_attn.o_proj
- model.layers.7.self_attn.o_proj
- model.layers.5.self_attn.o_proj
- model.layers.11.self_attn.o_proj
- model.layers.6.self_attn.o_proj
- model.layers.24.self_attn.o_proj
- model.layers.9.self_attn.o_proj
- model.layers.13.self_attn.o_proj
- model.layers.10.self_attn.o_proj
- model.layers.12.self_attn.o_proj
- model.layers.8.self_attn.o_proj
- model.layers.25.self_attn.o_proj
- model.layers.21.self_attn.o_proj
- model.layers.23.self_attn.o_proj
- model.layers.15.self_attn.o_proj
- model.layers.16.self_attn.o_proj
# self_attn.q_proj layers
- model.layers.8.self_attn.q_proj
- model.layers.13.self_attn.q_proj
- model.layers.9.self_attn.q_proj
- model.layers.14.self_attn.q_proj
- model.layers.10.self_attn.q_proj
- model.layers.11.self_attn.q_proj
- model.layers.0.self_attn.q_proj
- model.layers.15.self_attn.q_proj
- model.layers.1.self_attn.q_proj
- model.layers.6.self_attn.q_proj
- model.layers.5.self_attn.q_proj
- model.layers.7.self_attn.q_proj
- model.layers.12.self_attn.q_proj
- model.layers.16.self_attn.q_proj
- model.layers.17.self_attn.q_proj
- model.layers.26.self_attn.q_proj
# self_attn.v_proj layers
- model.layers.26.self_attn.v_proj
- model.layers.17.self_attn.v_proj
- model.layers.3.self_attn.v_proj
- model.layers.28.self_attn.v_proj
- model.layers.29.self_attn.v_proj
- model.layers.21.self_attn.v_proj
- model.layers.15.self_attn.v_proj
- model.layers.16.self_attn.v_proj
- model.layers.20.self_attn.v_proj
- model.layers.25.self_attn.v_proj
- model.layers.6.self_attn.v_proj
- model.layers.23.self_attn.v_proj
- model.layers.4.self_attn.v_proj
- model.layers.1.self_attn.v_proj
- model.layers.22.self_attn.v_proj
- model.layers.14.self_attn.v_proj


chat_template: chatml
val_set_size: 0.01
output_dir: llama_3.1_8b_function_calling
data_seed: 49
seed: 49

sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true

wandb_project: function-calling-spectrum
wandb_entity: therealagi-reviewramp
wandb_watch: gradients
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: linear # linear is theoretically optimal... https://arxiv.org/abs/2310.07831
learning_rate: 5e-6

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
auto_resume_from_checkpoints: true

warmup_steps: 10
evals_per_epoch: 2
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 2
save_total_limit: 2
debug:
deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16_cpuoffload_params.json
weight_decay: 0.05
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|endoftext|>"
  eos_token: "<|im_end|>"
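
For reference, a config like this is normally passed to Axolotl's standard training entry point. A minimal sketch of that launch, assuming accelerate, axolotl, and deepspeed are installed and the /workspace dataset and deepspeed paths above exist on the training machine:

import subprocess

# Launch Axolotl training with the uploaded config via the documented CLI entry point.
subprocess.run(
    ["accelerate", "launch", "-m", "axolotl.cli.train", "functionCallingSpectrum.yml"],
    check=True,
)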