import gradio as gr
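
# Gradio front end for assembling the configuration of an Axolotl fine-tuning run.

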
class Main:
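    # Placeholder training entry point: it currently just collects every value
    # wired in from the UI widgets and echoes it back, which makes it easy to
    # verify the Gradio wiring before a real training backend is attached.
    # The gr.Progress(track_tqdm=True) argument lets Gradio surface tqdm
    # progress bars once actual training runs inside this coroutine.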
    async def train_model(self, max_steps, base_model, model_type, tokenizer_type, is_llama_derived_model,
                          strict, datasets_path, dataset_format, shards,
                          val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
                          pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
                          lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
                          micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
                          group_by_length, bf16, fp16, tf32, gradient_checkpointing,
                          resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
                          load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
                          debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
                          wandb_name, wandb_log_model, last_tab, progress=gr.Progress(track_tqdm=True)):
        config_values = [base_model, model_type, tokenizer_type, is_llama_derived_model,
                         strict, datasets_path, dataset_format, shards,
                         val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
                         pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
                         lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
                         micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
                         group_by_length, bf16, fp16, tf32, gradient_checkpointing,
                         resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
                         load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
                         debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
                         wandb_name, wandb_log_model, last_tab]
        # Return a string so the single "Training Output" textbox displays it predictably.
        return str(config_values)
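
    # A minimal sketch (hypothetical, not called anywhere yet) of how the
    # collected values could be serialized into an Axolotl-style YAML config,
    # assuming PyYAML (`pip install pyyaml`) is available.
    @staticmethod
    def build_config_yaml(**options):
        import yaml  # imported locally so the UI still runs without PyYAML
        # Drop UI-only keys such as last_tab before dumping.
        options.pop("last_tab", None)
        return yaml.safe_dump(options, sort_keys=False)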

    def initiate_userInterface(self):
        with gr.Blocks() as self.app:
            gr.Markdown("### Axolotl UI")
            # Finetuning tab
            with gr.Tab("FineTuning UI"):
                base_model = gr.Dropdown(choices=["NousResearch/Llama-2-7b-hf", "mistralai/Mistral-7B-Instruct-v0.2"], label="Select Model", value="NousResearch/Llama-2-7b-hf")
                datasets_path = gr.Textbox(label="datasets_path", value="mhenrichsen/alpaca_2k_test")
                dataset_format = gr.Radio(choices=["Alpaca"], label="Dataset Format", value="Alpaca")
                shards = gr.Slider(minimum=0, maximum=20, step=1, label="shards", value=10)
                last_tab = gr.Checkbox(label="last_tab", value=False, visible=False)
                with gr.Accordion("Advanced Settings", open=False):
                    with gr.Tab("YAML Configuration"):
                        model_type = gr.Radio(label="model_type", choices=["MistralForCausalLM", "LlamaForCausalLM"], value="LlamaForCausalLM")
                        tokenizer_type = gr.Textbox(label="tokenizer_type", value="LlamaTokenizer", visible=False)
                        is_llama_derived_model = gr.Checkbox(label="is_llama_derived_model", value=True, info="Determines the padding strategy based on the model's parent architecture")
                        strict = gr.Checkbox(label="strict", value=False, visible=False)
                        val_set_size = gr.Slider(minimum=0, maximum=1, step=0.01, label="val_set_size", value=0.05, info="Fraction of the training data held out for validation")
                        output_dir = gr.Textbox(label="output_dir", value="./finetune-out", info="Output directory for the fine-tuned model")
                        adapter = gr.Radio(choices=["qlora", "lora"], label="adapter", value="qlora", info="Parameter-efficient fine-tuning strategy")
                        lora_model_dir = gr.Textbox(label="lora_model_dir", info="Optional directory of an existing LoRA adapter to load", visible=False)
                        sequence_len = gr.Slider(minimum=512, maximum=4096, step=256, label="sequence_len", value=1024, info="Maximum input sequence length used for training")
                        sample_packing = gr.Checkbox(label="sample_packing", value=True, info="Packs multiple short examples into each sequence to speed up training; recommended false for small datasets")
                        pad_to_sequence_len = gr.Checkbox(label="pad_to_sequence_len", value=True, info="Pads inputs to the full sequence length to avoid memory fragmentation and out-of-memory errors. Recommended true")
                        # eval_sample_packing = gr.Checkbox(label="eval_sample_packing", value=False)
                        lora_r = gr.Slider(minimum=8, maximum=64, step=2, label="lora_r", value=32, info="Rank of the LoRA adaptation matrices; higher ranks add more trainable parameters")
                        lora_alpha = gr.Slider(minimum=8, maximum=64, step=0.1, label="lora_alpha", value=16, info="Scaling factor that controls how strongly the adapter weights affect the base model")
                        lora_dropout = gr.Slider(minimum=0, maximum=1, label="lora_dropout", value=0.05, step=0.01, info="Dropout probability applied within the LoRA layers")
                        lora_target_modules = gr.Textbox(label="lora_target_modules", value="q_proj, v_proj, k_proj", info="Comma-separated list of modules to adapt with LoRA")
                        lora_target_linear = gr.Checkbox(label="lora_target_linear", value=True, info="If true, lora_target_modules is ignored and all linear layers are adapted")
                        lora_fan_in_fan_out = gr.Textbox(label="lora_fan_in_fan_out", visible=False)
                        gradient_accumulation_steps = gr.Slider(minimum=4, maximum=64, step=1, label="gradient_accumulation_steps", value=4, info="Number of micro-batches accumulated before each optimizer step")
                        micro_batch_size = gr.Slider(minimum=1, maximum=64, step=1, label="micro_batch_size", value=2, info="Number of samples sent to each GPU per step")
                        num_epochs = gr.Slider(minimum=1, maximum=4, step=1, label="num_epochs", value=1)
                        max_steps = gr.Textbox(label="max_steps", value="1", info="Maximum number of training steps; overrides the number of epochs", visible=False)
                        optimizer = gr.Radio(choices=["adamw_hf", "adamw_torch", "adamw_torch_fused", "adamw_torch_xla", "adamw_apex_fused", "adafactor", "adamw_anyprecision", "sgd", "adagrad", "adamw_bnb_8bit", "lion_8bit", "lion_32bit", "paged_adamw_32bit", "paged_adamw_8bit", "paged_lion_32bit", "paged_lion_8bit"], value="paged_adamw_32bit", label="optimizer", info="Choose an optimizer that matches the model's quantization; the paged variants pair well with QLoRA")
                        lr_scheduler = gr.Radio(label="lr_scheduler", choices=["one_cycle", "log_sweep", "cosine"], value="cosine", info="Schedule that adjusts the learning rate over the course of training")
                        learning_rate = gr.Textbox(label="learning_rate", value="2e-5", info="Peak learning rate; the scheduler decays from this value")
                        train_on_inputs = gr.Checkbox(label="train_on_inputs", value=False, visible=False)
                        group_by_length = gr.Checkbox(label="group_by_length", value=False, visible=False)
                        bf16 = gr.Checkbox(label="bfloat16", value=False, info="Enable bfloat16 precision for tensors; supported only on Ampere or newer GPUs.")
                        fp16 = gr.Checkbox(label="Half Precision", value=True, info="Enable half-precision (FP16) for tensor processing.")
                        tf32 = gr.Checkbox(label="TensorFloat32", value=False, info="Enable TensorFloat32 precision for tensors; supported only on Ampere or newer GPUs.")
                        gradient_checkpointing = gr.Checkbox(label="gradient_checkpointing", value=True, visible=False)
                        resume_from_checkpoint = gr.Textbox(label="resume_from_checkpoint", visible=False)
                        local_rank = gr.Textbox(label="local_rank", visible=False)
                        logging_steps = gr.Slider(minimum=1, maximum=100, step=1, label="logging_steps", value=1, visible=False)
                        xformers_attention = gr.Checkbox(label="xformers_attention", value=False, visible=False)
                        flash_attention = gr.Checkbox(label="flash_attention", value=False, visible=False)
                        load_best_model_at_end = gr.Checkbox(label="load_best_model_at_end", value=False, visible=False)
                        warmup_steps = gr.Slider(minimum=1, maximum=100, step=1, label="warmup_steps", value=10, visible=False)
                        evals_per_epoch = gr.Slider(minimum=1, maximum=100, step=1, label="evals_per_epoch", value=4, info="Number of evaluations per epoch", visible=False)
                        eval_table_size = gr.Textbox(label="eval_table_size", visible=False)
                        saves_per_epoch = gr.Slider(minimum=1, maximum=100, step=1, label="saves_per_epoch", value=1, info="Number of checkpoints saved per epoch")
                        debug = gr.Checkbox(label="debug", value=False, visible=False)
                        weight_decay = gr.Number(label="weight_decay", value=0.0, visible=False)
                        wandb_watch = gr.Checkbox(label="wandb_watch", value=False, visible=False)
                        wandb_log_model = gr.Checkbox(label="wandb_log_model", value=False, visible=False)
                        wandb_project = gr.Textbox(label="wandb_project", visible=False)
                        wandb_entity = gr.Textbox(label="wandb_entity", visible=False)
                        wandb_name = gr.Textbox(label="wandb_name", visible=False)
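                        # The hidden fields above (Weights & Biases settings,
                        # debug flags, etc.) are still passed through the
                        # click() wiring so train_model always receives a
                        # complete set of configuration values.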
                train_btn = gr.Button("Start Training")
                # The inputs list must match the parameter order of train_model.
                train_btn.click(
                    self.train_model,
                    inputs=[max_steps, base_model, model_type, tokenizer_type, is_llama_derived_model,
                            strict, datasets_path, dataset_format, shards,
                            val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
                            pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
                            lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
                            micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
                            group_by_length, bf16, fp16, tf32, gradient_checkpointing,
                            resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
                            load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
                            debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
                            wandb_name, wandb_log_model, last_tab],
                    outputs=[gr.Textbox(label="Training Output", interactive=False)],
                )
        return self.app


if __name__ == "__main__":
    main = Main()
    app = main.initiate_userInterface()
    app.queue().launch(share=True, server_name="0.0.0.0")
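# Gradio serves on port 7860 by default; share=True additionally prints a
# temporary public *.gradio.live URL, and server_name="0.0.0.0" makes the app
# reachable from other machines on the network.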