---
license: apache-2.0
---

Based on Meta-Llama-3-8B-Instruct and governed by the Meta Llama 3 License agreement:
https://huggingface.co/cognitivecomputations/dolphin-2.9-llama3-8b/blob/main/LICENSE

We have not benchmarked this model yet, so we don't know how good it is, but we are happy for anyone to try it out and give feedback.
You can try this model on our API at https://www.awanllm.com/

Trained on a 2048-token sequence length, while the base model uses an 8192-token context. From our testing it still handles the full 8192 context just fine.

Trained using Cognitive Computations / Eric Hartford's https://huggingface.co/datasets/cognitivecomputations/dolphin dataset, as we've had great results from their Dolphin models on previous Llama generations.

Trained for 2 days on 2x RTX 3090 on our own machine, using 4-bit loading and QLoRA with rank 64 and alpha 128, resulting in ~2% trainable weights.
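
As a rough sanity check on that ~2% figure, here is a back-of-the-envelope sketch assuming rank-64 adapters on every linear projection of Llama-3-8B (the `lora_target_linear: true` setting in the config below) and the standard Llama-3-8B dimensions; the exact count depends on which modules the trainer actually wraps:

```python
# Hedged estimate: LoRA parameter count for rank-64 adapters on all linear
# projections of Llama-3-8B (hidden 4096, intermediate 14336, KV dim 1024,
# 32 layers). A LoRA adapter on an (out x in) weight adds rank * (out + in) params.
rank = 64
hidden, intermediate, kv_dim, layers = 4096, 14336, 1024, 32

proj_shapes = [
    (hidden, hidden),        # q_proj
    (kv_dim, hidden),        # k_proj
    (kv_dim, hidden),        # v_proj
    (hidden, hidden),        # o_proj
    (intermediate, hidden),  # gate_proj
    (intermediate, hidden),  # up_proj
    (hidden, intermediate),  # down_proj
]
lora_params = layers * sum(rank * (out + inp) for out, inp in proj_shapes)
base_params = 8.03e9  # approximate Llama-3-8B parameter count

print(f"~{lora_params / 1e6:.0f}M LoRA params "
      f"(~{100 * lora_params / base_params:.1f}% of the base model)")
# -> ~168M LoRA params (~2.1% of the base model)
```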

The goal for this model is to be less censored and great at general tasks, like the previous Dolphin models by Eric Hartford.
We started training this BEFORE they launched their own full-weight-trained Llama-3-8B-Dolphin-2.9 with their own curated datasets and the newer "Dolphin 2.9" dataset:
https://huggingface.co/cognitivecomputations/dolphin-2.9-llama3-8b

The difference is that we train this using Meta's new Llama 3 instruct format rather than the regular ChatML format that Dolphin models are usually trained on, because we think it might perform better with the format it was originally trained on.
Instruct format:
```
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|>

{{ user_message_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{{ model_answer_1 }}<|eot_id|><|start_header_id|>user<|end_header_id|>

{{ user_message_2 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
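
You normally don't need to assemble this prompt by hand: if the FP16 repo listed under "Quants" below keeps the stock Llama 3 Instruct chat template in its tokenizer config (an assumption, not something stated here), `transformers` can build it for you. A minimal sketch:

```python
from transformers import AutoTokenizer

# Assumes the FP16 repo under "Quants" ships the stock Llama 3 chat template.
tokenizer = AutoTokenizer.from_pretrained("AwanLLM/Meta-Llama-3-8B-Instruct-Dolphin-Lite")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize QLoRA in one sentence."},
]

# add_generation_prompt=True appends the assistant header so the model
# continues with its own answer rather than a new user turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```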

Quants:

GGUF: https://huggingface.co/AwanLLM/Meta-Llama-3-8B-Dolphin-Lite-v0.1-GGUF

FP16: https://huggingface.co/AwanLLM/Meta-Llama-3-8B-Instruct-Dolphin-Lite
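
The individual GGUF filenames aren't listed here, so if you want to fetch a quant programmatically it is safer to discover them than to hard-code one; a minimal sketch using `huggingface_hub`:

```python
from huggingface_hub import hf_hub_download, list_repo_files

repo = "AwanLLM/Meta-Llama-3-8B-Dolphin-Lite-v0.1-GGUF"

# Discover the available quant files instead of guessing a filename.
gguf_files = sorted(f for f in list_repo_files(repo) if f.endswith(".gguf"))
print(gguf_files)

# Download one of them for use with llama.cpp, llama-cpp-python, or similar runtimes.
local_path = hf_hub_download(repo, gguf_files[0])
print(local_path)
```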

[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)

Axolotl Config:
```
base_model: Meta-Llama-3-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

train_on_inputs: false
group_by_length: false
load_in_8bit: false
load_in_4bit: true
strict: false
sequence_len: 2048
bf16: true
fp16: false
tf32: false
flash_attention: true

# Data
datasets:
  - path: flan1m-universal-uncensored-system-2048.jsonl
    type:
      system_prompt: ""
      system_format: "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
      field_system: system
      field_instruction: input
      field_output: output
      format: "{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
      no_input_format: "{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

warmup_steps: 10
dataset_prepared_path: ./last_run_prepared

# Iterations
num_epochs: 1
saves_per_epoch: 4

# Evaluation
val_set_size: 0.01
eval_table_size:
eval_table_max_new_tokens:
eval_sample_packing: false
evals_per_epoch: 4

# LoRA
output_dir: ./qlora-out
adapter: qlora
lora_model_dir:
lora_r: 64
lora_alpha: 128
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
save_safetensors: true

# Sampling
sample_packing: true
pad_to_sequence_len: true

# Batching
gradient_accumulation_steps: 32
micro_batch_size: 4
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true

# Optimizer
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

# Misc
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
debug:
deepspeed: zero3_bf16.json
weight_decay: 0.1
special_tokens:
  pad_token: <|end_of_text|>
```
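
For reference, a config like this is normally launched with Axolotl's standard CLI (`accelerate launch -m axolotl.cli.train <config>.yaml`). The run writes the QLoRA adapter to `./qlora-out`; one way (not necessarily how the FP16 repo above was produced) to merge that adapter into a standalone checkpoint is with `peft`, sketched below:

```python
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Load the base model referenced by the adapter config and attach the QLoRA weights.
model = AutoPeftModelForCausalLM.from_pretrained("./qlora-out", torch_dtype=torch.bfloat16)

# Fold the LoRA deltas into the base weights and save a plain BF16 checkpoint.
# "./merged-model" is a hypothetical output path.
merged = model.merge_and_unload()
merged.save_pretrained("./merged-model", safe_serialization=True)

# Keep the tokenizer (and its chat template) alongside the merged weights,
# assuming the trainer saved a tokenizer into the output directory.
AutoTokenizer.from_pretrained("./qlora-out").save_pretrained("./merged-model")
```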