import gradio as gr


class Main:
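    """Gradio front end for collecting Axolotl fine-tuning parameters."""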

    async def train_model(self, max_steps, base_model, model_type, tokenizer_type, is_llama_derived_model,
                          strict, datasets_path, dataset_format, shards,
                          val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
                          pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
                          lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
                          micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
                          group_by_length, bf16, fp16, tf32, gradient_checkpointing,
                          resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
                          load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
                          debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
                          wandb_name, wandb_log_model, last_tab, progress=gr.Progress(track_tqdm=True)):
        # Gather the configuration values received from the UI
        # (max_steps is passed in separately and not included in this list).
        config = [base_model, model_type, tokenizer_type, is_llama_derived_model,
                  strict, datasets_path, dataset_format, shards,
                  val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
                  pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
                  lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
                  micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
                  group_by_length, bf16, fp16, tf32, gradient_checkpointing,
                  resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
                  load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
                  debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
                  wandb_name, wandb_log_model, last_tab]
        # Return a single string so the result fits the single "Training Output" textbox.
        return "\n".join(str(value) for value in config)
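
    # Hedged sketch (not wired into the UI): one way the values gathered by
    # `train_model` could be serialized into an Axolotl-style YAML config before
    # launching a real training run. The helper name `config_to_yaml`, its
    # `keys` argument, and the PyYAML dependency are illustrative assumptions,
    # not part of the original script.
    @staticmethod
    def config_to_yaml(keys, values):
        import yaml  # assumed extra dependency (PyYAML)

        # Pair each config field name with its UI value, preserving order.
        return yaml.safe_dump(dict(zip(keys, values)), sort_keys=False)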

    def initiate_userInterface(self):
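        """Build the Gradio Blocks interface and return it."""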
        with gr.Blocks() as self.app:
            gr.Markdown("### Axolotl UI")

            with gr.Tab("FineTuning UI"):
                base_model = gr.Dropdown(choices=["NousResearch/Llama-2-7b-hf", "mistralai/Mistral-7B-Instruct-v0.2"], label="Select Model", value="NousResearch/Llama-2-7b-hf")
                datasets_path = gr.Textbox(label="datasets_path", value="mhenrichsen/alpaca_2k_test")
                dataset_format = gr.Radio(choices=["Alpaca"], label="Dataset Format", value="Alpaca")
                shards = gr.Slider(minimum=0, maximum=20, step=1, label="shards", value=10)
                last_tab = gr.Checkbox(label="last_tab", value=False, visible=False)

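                # Less frequently changed options are grouped below and hidden by default.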
                with gr.Accordion("Advanced Settings", open=False):
                    with gr.Tab("YAML Configuration"):
                        model_type = gr.Radio(label="model_type", choices=["MistralForCausalLM", "LlamaForCausalLM"], value="LlamaForCausalLM")
                        tokenizer_type = gr.Textbox(label="tokenizer_type", value="LlamaTokenizer", visible=False)
                        is_llama_derived_model = gr.Checkbox(label="is_llama_derived_model", value=True, info="Determines the padding strategy based on the parent type of the model")
                        strict = gr.Checkbox(label="strict", value=False, visible=False)
                        val_set_size = gr.Slider(minimum=0, maximum=1, step=0.1, label="val_set_size", value=0.05, info="Fraction of the training data held out for validation")
                        output_dir = gr.Textbox(label="output_dir", value="./finetune-out", info="Output directory for the fine-tuned model")
                        adapter = gr.Radio(choices=["qlora", "lora"], label="adapter", value="qlora", info="Parameter-efficient training strategy")
                        lora_model_dir = gr.Textbox(label="lora_model_dir", info="Directory of a custom adapter, if one is provided", visible=False)
                        sequence_len = gr.Slider(minimum=512, maximum=4096, step=10, label="sequence_len", value=1024, info="Maximum input sequence length used for training")
                        sample_packing = gr.Checkbox(label="sample_packing", value=True, info="Speeds up training by packing multiple samples into each sequence; recommended false for small datasets")
                        pad_to_sequence_len = gr.Checkbox(label="pad_to_sequence_len", value=True, info="Pads inputs to the full sequence length to avoid memory fragmentation and out-of-memory issues; recommended true")

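                        # LoRA adapter hyperparameters.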
                        lora_r = gr.Slider(minimum=8, maximum=64, step=2, label="lora_r", value=32, info="Rank of the LoRA adaptation matrices")
                        lora_alpha = gr.Slider(minimum=8, maximum=64, step=0.1, label="lora_alpha", value=16, info="Scaling factor controlling how strongly the adapter weights affect the base model")
                        lora_dropout = gr.Slider(minimum=0, maximum=1, step=0.01, label="lora_dropout", value=0.05, info="Dropout probability applied to the LoRA layers during training")
                        lora_target_modules = gr.Textbox(label="lora_target_modules", value="q_proj, v_proj, k_proj", info="Modules to adapt with LoRA; any dense layer can be targeted")
                        lora_target_linear = gr.Checkbox(label="lora_target_linear", value=True, info="If enabled, all linear layers are adapted and lora_target_modules is ignored")
                        lora_fan_in_fan_out = gr.Textbox(label="lora_fan_in_fan_out", visible=False)

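                        # Training schedule and optimizer settings.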
                        gradient_accumulation_steps = gr.Slider(minimum=4, maximum=64, step=1, label="gradient_accumulation_steps", value=4, info="Number of micro-batches accumulated before each weight update")
                        micro_batch_size = gr.Slider(minimum=1, maximum=64, step=2, label="micro_batch_size", value=2, info="Number of samples sent to each GPU per step")
                        num_epochs = gr.Slider(minimum=1, maximum=4, step=1, label="num_epochs", value=1)
                        max_steps = gr.Textbox(label="max_steps", value="1", info="Maximum number of training steps; overrides the number of epochs", visible=False)
                        optimizer = gr.Radio(choices=["adamw_hf", "adamw_torch", "adamw_torch_fused", "adamw_torch_xla", "adamw_apex_fused", "adafactor", "adamw_anyprecision", "sgd", "adagrad", "adamw_bnb_8bit", "lion_8bit", "lion_32bit", "paged_adamw_32bit", "paged_adamw_8bit", "paged_lion_32bit", "paged_lion_8bit"], value="paged_adamw_32bit", label="optimizer", info="Choose an optimizer that matches the model's quantization")
                        lr_scheduler = gr.Radio(label="lr_scheduler", choices=["one_cycle", "log_sweep", "cosine"], value="cosine", info="Schedule that adjusts the learning rate over the course of training")
                        learning_rate = gr.Textbox(label="max_learning_rate", value="2e-5")
                        train_on_inputs = gr.Checkbox(label="train_on_inputs", value=False, visible=False)
                        group_by_length = gr.Checkbox(label="group_by_length", value=False, visible=False)
                        bf16 = gr.Checkbox(label="bfloat16", value=False, info="Enable bfloat16 precision for tensors; supported only on Ampere or newer GPUs")
                        fp16 = gr.Checkbox(label="Half Precision", value=True, info="Enable half precision (FP16) for tensor processing")
                        tf32 = gr.Checkbox(label="TensorFloat32", value=False, info="Enable TensorFloat32 precision for tensors; supported only on Ampere or newer GPUs")
                        gradient_checkpointing = gr.Checkbox(label="gradient_checkpointing", value=True, visible=False)
                        resume_from_checkpoint = gr.Textbox(label="resume_from_checkpoint", visible=False)
                        local_rank = gr.Textbox(label="local_rank", visible=False)
                        logging_steps = gr.Slider(minimum=1, maximum=100, step=1, label="logging_steps", value=1, visible=False)
                        xformers_attention = gr.Checkbox(label="xformers_attention", value=False, visible=False)
                        flash_attention = gr.Checkbox(label="flash_attention", value=False, visible=False)
                        load_best_model_at_end = gr.Checkbox(label="load_best_model_at_end", value=False, visible=False)
                        warmup_steps = gr.Slider(minimum=1, maximum=100, step=1, label="warmup_steps", value=10, visible=False)
                        evals_per_epoch = gr.Slider(minimum=1, maximum=100, step=1, label="evals_per_epoch", value=4, info="Number of evaluations per epoch", visible=False)
                        eval_table_size = gr.Textbox(label="eval_table_size", visible=False)
                        saves_per_epoch = gr.Slider(minimum=1, maximum=100, step=1, label="saves_per_epoch", value=1, info="Number of checkpoints saved per epoch")

                        debug = gr.Checkbox(label="debug", value=False, visible=False)

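                        # Regularization and Weights & Biases logging options (all hidden by default).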
                        weight_decay = gr.Number(label="weight_decay", value=0.0, visible=False)
                        wandb_watch = gr.Checkbox(label="wandb_watch", value=False, visible=False)
                        wandb_log_model = gr.Checkbox(label="wandb_log_model", value=False, visible=False)
                        wandb_project = gr.Textbox(label="wandb_project", visible=False)
                        wandb_entity = gr.Textbox(label="wandb_entity", visible=False)
                        wandb_name = gr.Textbox(label="wandb_name", visible=False)

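                # Run train_model on click, passing every UI component above as an input.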
                train_btn = gr.Button("Start Training")
                training_output = gr.Textbox(label="Training Output", interactive=False)
                train_btn.click(
                    self.train_model,
                    inputs=[max_steps, base_model, model_type, tokenizer_type, is_llama_derived_model,
                            strict, datasets_path, dataset_format, shards,
                            val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
                            pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
                            lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
                            micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
                            group_by_length, bf16, fp16, tf32, gradient_checkpointing,
                            resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
                            load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
                            debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
                            wandb_name, wandb_log_model, last_tab],
                    outputs=[training_output],
                )

        return self.app


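# Launch the UI with a public share link, listening on all network interfaces.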
if __name__ == "__main__":
    main = Main()
    app = main.initiate_userInterface()
    app.queue().launch(share=True, server_name="0.0.0.0")