ptrdvn committed on
Commit 6c94cf4
1 Parent(s): 2d08a71

Upload README.md with huggingface_hub

Files changed (1)
  1. README.md +157 -0
README.md ADDED
@@ -0,0 +1,157 @@
---
license: apache-2.0
library_name: peft
tags:
- generated_from_trainer
base_model: mistral-community/Mixtral-8x22B-v0.1
model-index:
- name: qlora-out-2048-multiling
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.4.0`
```yaml
base_model: mistral-community/Mixtral-8x22B-v0.1
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true

load_in_8bit: false
load_in_4bit: true
strict: false

datasets:
  - path: lightblue/gpt4_conversations_multilingual
    type: sharegpt
    conversation: mistral
dataset_prepared_path: ./prepared_dataset_2048-multiling
val_set_size: 0
output_dir: ./qlora-out-2048-multiling

## You can optionally freeze the entire model and unfreeze a subset of parameters
unfrozen_parameters:
# - ^lm_head.weight$
# - ^model.embed_tokens.weight$[:32000]
# - model.layers.2[0-9]+.block_sparse_moe.gate
# - model.layers.2[0-9]+.block_sparse_moe.experts
# - model.layers.3[0-9]+.block_sparse_moe.gate
# - model.layers.3[0-9]+.block_sparse_moe.experts

model_config:
  output_router_logits: true

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
#lora_target_modules:
# - gate
# - q_proj
# - k_proj
# - v_proj
# - o_proj
# - w1
# - w2
# - w3

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

use_wandb: true
wandb_project: axolotl
wandb_entity: peterd
wandb_name: mixtral_8x22b_test

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 0
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 5
debug:
deepspeed: /workspace/axolotl/deepspeed_configs/zero2.json
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
```

</details><br>

# qlora-out-2048-multiling

This model is a QLoRA (4-bit LoRA) adapter for [mistral-community/Mixtral-8x22B-v0.1](https://huggingface.co/mistral-community/Mixtral-8x22B-v0.1), fine-tuned on the lightblue/gpt4_conversations_multilingual dataset (see the axolotl config above).

## Model description

This repository contains QLoRA adapter weights (PEFT), not a full set of model weights. The adapter was trained with rank 16, alpha 16, and dropout 0.05 on all linear layers of Mixtral-8x22B-v0.1, using a 2048-token sequence length with sample packing and the Mistral conversation format.
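
For reference, the LoRA options in the config above (`lora_r: 16`, `lora_alpha: 16`, `lora_dropout: 0.05`, `lora_target_linear: true`) map roughly onto the following PEFT `LoraConfig`. This is an illustrative sketch of that mapping, not the exact object axolotl constructs internally.

```python
# Rough PEFT equivalent of the LoRA settings in the axolotl config above.
# Assumption: "lora_target_linear: true" means "apply LoRA to every linear layer",
# which PEFT expresses as target_modules="all-linear".
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,                         # lora_r
    lora_alpha=16,                # lora_alpha
    lora_dropout=0.05,            # lora_dropout
    target_modules="all-linear",  # lora_target_linear: true
    task_type="CAUSAL_LM",
)
```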

## Intended uses & limitations

Based on the training data (multilingual GPT-4-style conversations), the adapter is intended for multilingual, chat-style instruction following on top of the Mixtral-8x22B base model, using the Mistral `[INST] ... [/INST]` prompt format. The adapter has not been evaluated (no validation split was used during training), so quality, safety, and factuality are untested; run your own evaluation before any serious use. A minimal loading sketch follows.
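
The sketch below loads the 4-bit base model and attaches this adapter. The adapter path shown is the training `output_dir` from the config (`./qlora-out-2048-multiling`); replace it with wherever these adapter weights actually live (a local directory or a Hub repo id). The NF4/bfloat16 quantization settings mirror the usual QLoRA setup but are assumptions, since the exact bitsandbytes parameters are not recorded in this card.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

BASE_MODEL = "mistral-community/Mixtral-8x22B-v0.1"
ADAPTER_PATH = "./qlora-out-2048-multiling"  # placeholder: the training output_dir; point this at the adapter weights

# Assumed 4-bit settings (standard QLoRA defaults), not values recorded in the config.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, ADAPTER_PATH)

# The adapter was trained on Mistral-format conversations, so prompt with [INST] ... [/INST].
prompt = "[INST] Summarise what a LoRA adapter is, in French. [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Even in 4 bits, the 8x22B base model needs on the order of 70-80 GB of GPU memory, so expect to shard it across several GPUs with `device_map="auto"`.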

## Training and evaluation data

Training used the lightblue/gpt4_conversations_multilingual dataset in ShareGPT format, rendered with the Mistral conversation template and packed into 2048-token sequences (see the axolotl config above). No validation split was held out (`val_set_size: 0`), so no evaluation data was used.

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 0.0002
- train_batch_size: 1
- eval_batch_size: 1
- seed: 42
- distributed_type: multi-GPU
- num_devices: 4
- gradient_accumulation_steps: 2
- total_train_batch_size: 8
- total_eval_batch_size: 4
- optimizer: 8-bit AdamW (`adamw_bnb_8bit`) with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: cosine
- lr_scheduler_warmup_steps: 10
- num_epochs: 1
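
The reported `total_train_batch_size` follows directly from the per-device batch size, gradient accumulation, and GPU count above; a quick check:

```python
# Effective global batch size = per-device batch * gradient accumulation steps * number of GPUs
train_batch_size = 1
gradient_accumulation_steps = 2
num_devices = 4

total_train_batch_size = train_batch_size * gradient_accumulation_steps * num_devices
print(total_train_batch_size)  # 8, matching the value listed above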

### Training results

No evaluation was run during training (`val_set_size: 0` and `evals_per_epoch: 0`), so there are no intermediate results to report.

### Framework versions

- PEFT 0.10.0
- Transformers 4.40.0.dev0
- Pytorch 2.1.2+cu121
- Datasets 2.18.0
- Tokenizers 0.15.0