AuriAetherwiing committed
Commit 1701067
1 Parent(s): 85b97dd

Create README.md

Files changed (1): README.md +145 -0
README.md ADDED
@@ -0,0 +1,145 @@
---
license: gemma
datasets:
- Mielikki/Erebus-87k
- allura-org/r_shortstories_24k
base_model:
- UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3
pipeline_tag: text-generation
---

# Gemma-2-9B Sugarquill v0

An experimental continued pretrain of Gemma-2-9B-It-SPPO-Iter3 on assorted short story data from the web.
I was trying to diversify Gemma's prose without completely destroying its smarts. I think I half-succeeded? This model could have used another epoch of training, but even as it is, it's already more creative and descriptive than its base model, without becoming too silly. It doesn't seem to have degraded much in terms of core abilities, either.
Should be usable both for RP and raw-completion storywriting.
I originally planned to use this in a merge, but I feel like this model is interesting enough to be released on its own as well.

**Training notes.**

This model was trained for 2 epochs on 10k rows (~18.7M tokens), taken equally from the Erebus-87k and r_shortstories_24k datasets. It was trained on an 8xH100 SXM node for 30 minutes with rsLoRA.
I got complete nonsense reported to my wandb during this run, and logging stopped altogether after step 13 for some reason. This seems to be directly related to Gemma, as my training setup worked flawlessly for Qwen.

**Format**

The model responds to Gemma instruct formatting, exactly like its base model.

```
<bos>
<start_of_turn>user{user message}<end_of_turn>
<start_of_turn>model{response}<end_of_turn>
<eos>
```
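
For convenience, here is a minimal `transformers` sketch that applies this format through the tokenizer's chat template. The model path is a placeholder and the sampling settings are just example values, not recommendations from this card.

```python
# Minimal inference sketch -- the model path below is a placeholder,
# point it at wherever the model weights actually live.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path/to/Gemma-2-9B-Sugarquill-v0"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.bfloat16, device_map="auto"
)

# The Gemma chat template wraps the conversation in <start_of_turn>/<end_of_turn> tags for us.
messages = [{"role": "user", "content": "Write a short story about a lighthouse keeper."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# Example sampling settings -- adjust to taste.
output = model.generate(input_ids, max_new_tokens=512, do_sample=True, temperature=0.8)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```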

**Training config**
<details><summary>See LLaMA-Factory config</summary>

```yaml
### Model
model_name_or_path: UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3
#ref_model: # Reference model for RL (optional, for everything besides SimPO, which doesn't take it at all)
#ref_model_quantization_bit: 8 # 8 or 4

### Method
stage: pt # pt, sft, rm, ppo, kto, dpo (includes orpo and simpo)
do_train: true
finetuning_type: lora # full, freeze or lora
lora_target: all
#pref_beta: 0.1
#pref_loss: simpo # sigmoid (dpo), orpo, simpo, ipo, hinge

### Reward model
#reward_model: RLHFlow/ArmoRM-Llama3-8B-v0.1 # or sfairXC/FsfairX-Gemma2-RM-v0.1 or nvidia/Llama-3.1-Nemotron-70B-Reward-HF
#reward_model_type: full # full, lora, api
#reward_model_adapters: # Path to RM LoRA adapter(s) if using a LoRA RM
#reward_model_quantization_bit: 8 # 4 or 8

### Freeze
#freeze_trainable_layers: # The number of trainable layers for freeze (partial-parameter) fine-tuning. Positive number means n last layers to train, negative - n first layers to train
#freeze_trainable_modules: # Name(s) of trainable modules for freeze (partial-parameter) fine-tuning. Use commas to separate
#freeze_extra_modules: # Name(s) of modules apart from hidden layers to be set as trainable. Use commas to separate

### LoRA
#loraplus_lr_ratio: 8.0
#loraplus_lr_embedding:
use_dora: false
use_rslora: true
lora_rank: 64 # 64 is optimal for most trains on instruct, if training on base - use rslora or dora
lora_alpha: 32
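# rsLoRA scales the LoRA update by alpha / sqrt(rank) instead of alpha / rank,
# so with rank 64 and alpha 32 the effective scale is 32/8 = 4 rather than 32/64 = 0.5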
lora_dropout: 0.05
#pissa_init: true
#pissa_iter: 16
#pissa_convert: true

### QLoRA
quantization_bit: 8 # 2,3,4,5,6,8 in HQQ, 4 or 8 in bnb
quantization_method: hqq # bitsandbytes or hqq

### DeepSpeed
deepspeed: examples/deepspeed/ds_z2_config.json # ds_z3_config.json or ds_z2_config.json which is required for HQQ on multigpu

### Dataset
dataset: sugarquill-10k # define in data/dataset_info.json
cutoff_len: 8192
max_samples: 10000
overwrite_cache: true
preprocessing_num_workers: 16
#template: chatml

### Output
output_dir: saves/gemma/lora/sugarquill-1
logging_steps: 3
save_steps: 50
plot_loss: true
compute_accuracy: true
overwrite_output_dir: true

### Train
per_device_train_batch_size: 1 # Effective b/s == per-device b/s * grad accum steps * number of GPUs
gradient_accumulation_steps: 8
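# On the 8xH100 node used for this run (assuming plain data parallelism across all 8 GPUs),
# that works out to an effective batch of 1 * 8 * 8 = 64 packed sequences per optimizer step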
learning_rate: 3.0e-5
optim: paged_adamw_8bit # paged_adamw_8bit or adamw_torch usually
num_train_epochs: 2.0
lr_scheduler_type: cosine # cosine, constant or linear
warmup_ratio: 0.05
bf16: true
ddp_timeout: 180000000
packing: true
max_grad_norm: 1.0

### Opts
flash_attn: fa2 # auto, disabled, sdpa, fa2 | Gemma will fallback to eager
enable_liger_kernel: true # Pretty much must have if it works
#use_unsloth: true # May not work with multigpu idk
#use_adam_mini: true # Comment optim if using this

### Eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 0.05
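# A value below 1 is treated as a fraction of total training steps, i.e. evaluate roughly every 5% of the run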

### Misc
include_num_input_tokens_seen: true
ddp_find_unused_parameters: false # Stupid thing tries to start distributed training otherwise
upcast_layernorm: true

### Inference for PPO
#max_new_tokens: 512
#temperature: 0.8
#top_k: 0
#top_p: 0.8

### Tracking
report_to: wandb # or tensorboard or mlflow | LOGIN BEFORE STARTING TRAIN OR ELSE IT WILL CRASH
run_name: G2-9B-Sugarquill-1

### Merge Adapter
#export_dir: models/G2-9B-Sugarquill
#export_size: 4
#export_device: gpu
#export_legacy_format: false

```

</details>
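
The `dataset: sugarquill-10k` entry in the config has to be registered in LLaMA-Factory's `data/dataset_info.json`. As a rough sketch of what that registration could look like (the file name and column mapping below are assumptions for illustration, not the actual files used for this run):

```python
# Hypothetical dataset_info.json entry for the pretraining corpus.
# Both the file name and the column mapping are assumed, not taken from this run.
import json

dataset_info = {
    "sugarquill-10k": {
        "file_name": "sugarquill_10k.json",  # assumed local file with the 10k sampled rows
        "columns": {"prompt": "text"},       # plain-text column, as used by the `pt` (pretraining) stage
    }
}

# Print the entry so it can be merged into data/dataset_info.json by hand
# (don't overwrite the whole file, it also holds the built-in dataset entries).
print(json.dumps(dataset_info, indent=2))
```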