import gradio as gr
import os
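
# Gradio front end for configuring an Axolotl fine-tuning run: the controls
# below mirror common Axolotl YAML config fields, and the (currently stubbed)
# train_model handler receives their values when "Start Training" is clicked.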

class Main:
    async def train_model(self, max_steps, base_model, model_type, tokenizer_type, is_llama_derived_model,
                          strict, datasets_path, dataset_format, shards,
                          val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
                          pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
                          lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
                          micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
                          group_by_length, bf16, fp16, tf32, gradient_checkpointing,
                          resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
                          load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
                          debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
                          wandb_name, wandb_log_model, last_tab, progress=gr.Progress(track_tqdm=True)):
        a = [base_model, model_type, tokenizer_type, is_llama_derived_model,
             strict, datasets_path, dataset_format, shards,
             val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
             pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
             lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
             micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
             group_by_length, bf16, fp16, tf32, gradient_checkpointing,
             resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
             load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
             debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
             wandb_name, wandb_log_model, last_tab]
        return a
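
    # --- Illustrative sketch (not part of the original app, never called) ---
    # train_model above is a stub: it collects the UI values (note that
    # max_steps is accepted but not included in the echoed list) and returns
    # them unchanged. Given the "YAML Configuration" tab below, a plausible
    # next step would be serializing those values into an Axolotl-style YAML
    # file; the helper name and the key/value pairing here are assumptions.
    def build_config_yaml(self, keys, values):
        """Return a YAML string mapping config field names to their UI values."""
        import yaml  # PyYAML; assumed to be available alongside axolotl
        return yaml.safe_dump(dict(zip(keys, values)), sort_keys=False)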

    def initiate_userInterface(self):
        with gr.Blocks() as self.app:
            gr.Markdown("### Axolotl UI")
            # Finetuning Tab
            with gr.Tab("FineTuning UI"):
                base_model = gr.Dropdown(choices=["NousResearch/Llama-2-7b-hf", "mistralai/Mistral-7B-Instruct-v0.2"], label="Select Model", value="NousResearch/Llama-2-7b-hf")
                datasets_path = gr.Textbox(label="datasets_path", value="mhenrichsen/alpaca_2k_test")
                dataset_format = gr.Radio(choices=['Alpaca'], label="Dataset Format", value='Alpaca')
                shards = gr.Slider(minimum=0, maximum=20, step=1, label="shards", value=10)
                last_tab = gr.Checkbox(label='last_tab', value=False, visible=False)
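                # Everything below is optional tuning. Hidden controls
                # (visible=False) keep their defaults and are still passed to train_model.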
                with gr.Accordion("Advanced Settings", open=False):
                    with gr.Tab("YAML Configuration"):
                        # Model and data configuration
                        model_type = gr.Radio(label="model_type", choices=['MistralForCausalLM', 'LlamaForCausalLM'], value="LlamaForCausalLM")
                        tokenizer_type = gr.Textbox(label="tokenizer_type", value="LlamaTokenizer", visible=False)
                        is_llama_derived_model = gr.Checkbox(label="is_llama_derived_model", value=True, info="Determines the padding strategy based on the model's parent architecture")
                        strict = gr.Checkbox(label="strict", value=False, visible=False)
                        val_set_size = gr.Slider(minimum=0, maximum=1, step=0.1, label="val_set_size", value=0.05, info="Fraction of the training data held out for validation")
                        output_dir = gr.Textbox(label="output_dir", value="./finetune-out", info="Output directory for the fine-tuned model")
                        # LoRA / QLoRA adapter settings
                        adapter = gr.Radio(choices=["qlora", "lora"], label="adapter", value='qlora', info="Parameter-efficient fine-tuning strategy")
                        lora_model_dir = gr.Textbox(label="lora_model_dir", info="Optional directory of an existing LoRA adapter to load", visible=False)
                        sequence_len = gr.Slider(minimum=512, maximum=4096, step=10, label="sequence_len", value=1024, info="Maximum input sequence length used during training")
                        sample_packing = gr.Checkbox(label="sample_packing", value=True, info="Packs multiple short samples into one sequence for throughput; recommended false for small datasets")
                        pad_to_sequence_len = gr.Checkbox(label="pad_to_sequence_len", value=True, info="Pads inputs to the full sequence length to avoid memory fragmentation and out-of-memory errors. Recommended true")
                        # eval_sample_packing = gr.Checkbox(label="eval_sample_packing", value=False)
                        lora_r = gr.Slider(minimum=8, maximum=64, step=2, label="lora_r", value=32, info="Rank of the LoRA adaptation matrices; higher means more trainable parameters")
                        lora_alpha = gr.Slider(minimum=8, maximum=64, step=0.1, label="lora_alpha", value=16, info="Scaling factor for how strongly the adapter weights affect the base model")
                        lora_dropout = gr.Slider(minimum=0, maximum=1, label="lora_dropout", value=0.05, step=0.01, info="Dropout probability applied to the LoRA layers")
                        lora_target_modules = gr.Textbox(label="lora_target_modules", value="q_proj, v_proj, k_proj", info="Modules to adapt with LoRA; any dense layer can be targeted")
                        lora_target_linear = gr.Checkbox(label="lora_target_linear", value=True, info="If enabled, lora_target_modules is ignored and all linear layers are adapted")
                        lora_fan_in_fan_out = gr.Textbox(label="lora_fan_in_fan_out", visible=False)
                        # Training-loop hyperparameters
                        gradient_accumulation_steps = gr.Slider(minimum=4, maximum=64, step=1, label="gradient_accumulation_steps", value=4, info="Number of micro-batches accumulated before each weight update")
                        micro_batch_size = gr.Slider(minimum=1, maximum=64, step=2, label="micro_batch_size", value=2, info="Number of samples per GPU per step")
                        num_epochs = gr.Slider(minimum=1, maximum=4, step=1, label="num_epochs", value=1)
                        max_steps = gr.Textbox(label="max_steps", value='1', info="Maximum number of training steps; overrides num_epochs", visible=False)
                        optimizer = gr.Radio(choices=["adamw_hf", 'adamw_torch', 'adamw_torch_fused', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_anyprecision', 'sgd', 'adagrad', 'adamw_bnb_8bit', 'lion_8bit', 'lion_32bit', 'paged_adamw_32bit', 'paged_adamw_8bit', 'paged_lion_32bit', 'paged_lion_8bit'], value="paged_adamw_32bit", label='optimizer', info="Choose an optimizer compatible with the model's quantization")
                        lr_scheduler = gr.Radio(label="lr_scheduler", choices=['one_cycle', 'log_sweep', 'cosine'], value="cosine", info="Schedule that adjusts the learning rate over the course of training")
                        learning_rate = gr.Textbox(label="max_learning_rate", value="2e-5")
                        train_on_inputs = gr.Checkbox(label="train_on_inputs", value=False, visible=False)
                        group_by_length = gr.Checkbox(label="group_by_length", value=False, visible=False)
                        # Precision settings
                        bf16 = gr.Checkbox(label="bfloat16", value=False, info="Enable bfloat16 precision for tensors; supported only on Ampere or newer GPUs.")
                        fp16 = gr.Checkbox(label="Half Precision", value=True, info="Enable half precision (FP16) for tensor processing.")
                        tf32 = gr.Checkbox(label="TensorFloat32", value=False, info="Enable TensorFloat32 precision for tensors; supported only on Ampere or newer GPUs.")
                        gradient_checkpointing = gr.Checkbox(label="gradient_checkpointing", value=True, visible=False)
                        # Checkpointing, logging and evaluation (mostly hidden defaults)
                        resume_from_checkpoint = gr.Textbox(label="resume_from_checkpoint", visible=False)
                        local_rank = gr.Textbox(label="local_rank", visible=False)
                        logging_steps = gr.Slider(minimum=1, maximum=100, step=1, label="logging_steps", value=1, visible=False)
                        xformers_attention = gr.Checkbox(label="xformers_attention", value=False, visible=False)
                        flash_attention = gr.Checkbox(label="flash_attention", value=False, visible=False)
                        load_best_model_at_end = gr.Checkbox(label="load_best_model_at_end", value=False, visible=False)
                        warmup_steps = gr.Slider(minimum=1, maximum=100, step=1, label="warmup_steps", value=10, visible=False)
                        evals_per_epoch = gr.Slider(minimum=1, maximum=100, step=1, label="evals_per_epoch", value=4, info="Number of evaluations per epoch", visible=False)
                        eval_table_size = gr.Textbox(label="eval_table_size", visible=False)
                        saves_per_epoch = gr.Slider(minimum=1, maximum=100, step=1, label="saves_per_epoch", value=1, info="Number of checkpoints saved per epoch")
                        debug = gr.Checkbox(label="debug", value=False, visible=False)
                        weight_decay = gr.Number(label="weight_decay", value=0.0, visible=False)
                        # Weights & Biases logging (hidden)
                        wandb_watch = gr.Checkbox(label="wandb_watch", value=False, visible=False)
                        wandb_log_model = gr.Checkbox(label="wandb_log_model", value=False, visible=False)
                        wandb_project = gr.Textbox(label="wandb_project", visible=False)
                        wandb_entity = gr.Textbox(label="wandb_entity", visible=False)
                        wandb_name = gr.Textbox(label="wandb_name", visible=False)
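
                # The button passes every control above (hidden ones included) to
                # train_model, which currently just echoes the values into the output box.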
                train_btn = gr.Button("Start Training")
                train_btn.click(
                    self.train_model,
                    inputs=[max_steps, base_model, model_type, tokenizer_type, is_llama_derived_model,
                            strict, datasets_path, dataset_format, shards,
                            val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
                            pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
                            lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
                            micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
                            group_by_length, bf16, fp16, tf32, gradient_checkpointing,
                            resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
                            load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
                            debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
                            wandb_name, wandb_log_model, last_tab],
                    outputs=[gr.Textbox(label="Training Output", interactive=False)]
                )
        return self.app
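

# Running `python app.py` serves the UI on Gradio's default port (7860);
# share=True additionally prints a temporary public gradio.live link, and
# server_name='0.0.0.0' makes the server reachable from other machines.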
if __name__ == "__main__":
    main = Main()
    app = main.initiate_userInterface()
    app.queue().launch(share=True, server_name='0.0.0.0')