import gradio as gr


class Main:

    async def train_model(self, max_steps, base_model, model_type, tokenizer_type, is_llama_derived_model,
                          strict, datasets_path, dataset_format, shards,
                          val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
                          pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
                          lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
                          micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
                          group_by_length, bf16, fp16, tf32, gradient_checkpointing,
                          resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
                          load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
                          debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
                          wandb_name, wandb_log_model, last_tab, progress=gr.Progress(track_tqdm=True)):
        # Collect the UI values in a fixed order. Actual training is not wired
        # up yet; the handler currently just echoes the configuration back.
        config_values = [base_model, model_type, tokenizer_type, is_llama_derived_model,
                         strict, datasets_path, dataset_format, shards,
                         val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
                         pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
                         lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
                         micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
                         group_by_length, bf16, fp16, tf32, gradient_checkpointing,
                         resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
                         load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
                         debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
                         wandb_name, wandb_log_model, last_tab]

        # The click handler feeds a single output Textbox, so return a string
        # rather than the raw list.
        return str(config_values)
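
    # --- Hypothetical next step (not part of the original handler) -----------
    # A plausible follow-up is to serialize the collected values into an
    # Axolotl YAML config and launch training. The helpers below are a sketch:
    # the config keys are assumed to mirror the UI variable names, PyYAML is
    # assumed to be installed, and the launch command follows Axolotl's
    # documented `accelerate launch -m axolotl.cli.train <config>` usage.
    def write_config_yaml(self, config: dict, path: str = "finetune-config.yml") -> str:
        """Serialize a {parameter_name: value} mapping to a YAML file."""
        import yaml  # PyYAML, assumed available in the environment

        with open(path, "w") as f:
            yaml.safe_dump(config, f, sort_keys=False)
        return path

    def launch_training(self, config_path: str) -> int:
        """Run Axolotl's trainer as a subprocess and return its exit code."""
        import subprocess

        cmd = ["accelerate", "launch", "-m", "axolotl.cli.train", config_path]
        return subprocess.run(cmd, check=False).returncode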

    def initiate_userInterface(self):
        with gr.Blocks() as self.app:
            gr.Markdown("### Axolotl UI")

            # Finetuning Tab
            with gr.Tab("FineTuning UI"):
                base_model = gr.Dropdown(choices=["NousResearch/Llama-2-7b-hf", "mistralai/Mistral-7B-Instruct-v0.2"], label="Select Model", value="NousResearch/Llama-2-7b-hf")
                datasets_path = gr.Textbox(label="datasets_path", value="mhenrichsen/alpaca_2k_test")
                dataset_format = gr.Radio(choices=['Alpaca'], label="Dataset Format", value='Alpaca')
                shards = gr.Slider(minimum=0, maximum=20, step=1, label="shards", value=10)
                last_tab = gr.Checkbox(label='last_tab',value=False,visible=False)

                with gr.Accordion("Advanced Settings",open=False):
                    with gr.Tab("YAML Configuration"):
                        model_type = gr.Radio(label="model_type", choices=['MistralForCausalLM','LlamaForCausalLM'],info="",value="LlamaForCausalLM")
                        tokenizer_type = gr.Textbox(label="tokenizer_type", value="LlamaTokenizer",visible=False)
                        is_llama_derived_model = gr.Checkbox(label="is_llama_derived_model", value=True,info="Treat the model as a Llama derivative; determines the padding strategy used for the model family")
                        strict = gr.Checkbox(label="strict", value=False,visible=False)
                        val_set_size = gr.Slider(minimum=0, maximum=1, step=0.01, label="val_set_size", value=0.05,info="Fraction of the training data held out for validation")
                        output_dir = gr.Textbox(label="output_dir", value="./finetune-out",info="Output directory of the finetuned model")
                        adapter = gr.Radio(choices=["qlora", "lora"], label="adapter",value='qlora',info="Parameter efficient training strategy")
                        lora_model_dir = gr.Textbox(label="lora_model_dir",info="Directory of a custom adapter can be provided",visible=False)
                        sequence_len = gr.Slider(minimum=512, maximum=4096, step=64,label="sequence_len", value=1024,info="Maximum input sequence length used during training")
                        sample_packing = gr.Checkbox(label="sample_packing", value=True,info="Packs multiple short examples into one sequence to speed up training; recommended false for small datasets")
                        pad_to_sequence_len = gr.Checkbox(label="pad_to_sequence_len", value=True, info="Pads the input to match sequence length to avoid memory fragmentation and out of memory issues. Recommended true")
                        # eval_sample_packing = gr.Checkbox(label="eval_sample_packing", value=False)
                        lora_r = gr.Slider(minimum=8, maximum=64, step=2,label="lora_r", value=32,info="Rank of the LoRA adapter matrices; higher ranks add capacity and trainable parameters")
                        lora_alpha = gr.Slider(minimum=8, maximum=64, step=1,label="lora_alpha", value=16,info="Scaling factor controlling how strongly the adapter weights affect the base model's")
                        lora_dropout = gr.Slider(minimum=0, maximum=1, label="lora_dropout", value=0.05, step=0.01,info="Dropout probability applied to the LoRA layers during training")
                        lora_target_modules = gr.Textbox(label="lora_target_modules", value="q_proj, v_proj, k_proj",info="Comma-separated module names to adapt; any dense layer can be targeted")
                        lora_target_linear = gr.Checkbox(label="lora_target_linear", value=True,info="If checked, lora_target_modules is ignored and all linear layers are adapted")
                        lora_fan_in_fan_out = gr.Textbox(label="lora_fan_in_fan_out",visible=False)

                        gradient_accumulation_steps = gr.Slider(minimum=4, maximum=64, step=1,label="gradient_accumulation_steps", value=4,info="Number of micro-batches accumulated before each optimizer step")
                        micro_batch_size = gr.Slider(minimum=1, maximum=64, step=1,label="micro_batch_size", value=2,info="Per-device batch size (number of samples per GPU per step)")
                        num_epochs = gr.Slider(minimum=1, maximum=4, step=1,label="num_epochs", value=1)
                        max_steps = gr.Textbox(label="max_steps",value='1',info="Maximum number of training steps; overrides num_epochs when set",visible=False)
                        optimizer = gr.Radio(choices=["adamw_hf",'adamw_torch','adamw_torch_fused','adamw_torch_xla','adamw_apex_fused','adafactor','adamw_anyprecision','sgd','adagrad','adamw_bnb_8bit','lion_8bit','lion_32bit','paged_adamw_32bit','paged_adamw_8bit','paged_lion_32bit','paged_lion_8bit'], value="paged_adamw_32bit",label='optimizer',info="Choose an optimizer compatible with the model's quantization (e.g. a paged optimizer for QLoRA)")
                        lr_scheduler = gr.Radio(label="lr_scheduler", choices=['one_cycle', 'log_sweep', 'cosine'],value="cosine",info="Schedule that adjusts the learning rate over the course of training")
                        learning_rate = gr.Textbox(label="learning_rate", value="2e-5",info="Peak learning rate for the scheduler")
                        train_on_inputs = gr.Checkbox(label="train_on_inputs", value=False,visible=False)
                        group_by_length = gr.Checkbox(label="group_by_length", value=False,visible=False)
                        bf16 = gr.Checkbox(label="bfloat16", value=False,info="Enable bfloat16 precision for tensors; supported only on Ampere or newer GPUs.")
                        fp16 = gr.Checkbox(label="Half Precision", value=True,info="Enable half precision (FP16) for tensor processing.")
                        tf32 = gr.Checkbox(label="TensorFloat32", value=False,info="Enable TensorFloat32 precision for tensors; supported only on Ampere or newer GPUs.")
                        gradient_checkpointing = gr.Checkbox(label="gradient_checkpointing", value=True,info='',visible=False)
                        resume_from_checkpoint = gr.Textbox(label="resume_from_checkpoint",visible=False)
                        local_rank = gr.Textbox(label="local_rank",visible=False)
                        logging_steps = gr.Slider(minimum=1, maximum=100, step=1,label="logging_steps", value=1,info='',visible=False)
                        xformers_attention = gr.Checkbox(label="xformers_attention", value=False,visible=False)
                        flash_attention = gr.Checkbox(label="flash_attention", value=False,info='',visible=False)
                        load_best_model_at_end = gr.Checkbox(label="load_best_model_at_end", value=False,visible=False)
                        warmup_steps = gr.Slider(minimum=1, maximum=100, step=1,label="warmup_steps", value=10,visible=False)
                        evals_per_epoch = gr.Slider(minimum=1, maximum=100, step=1,label="evals_per_epoch", value=4,info='Number of evaluations per epoch',visible=False)
                        eval_table_size = gr.Textbox(label="eval_table_size",visible=False)
                        saves_per_epoch = gr.Slider(minimum=1, maximum=100, step=1,label="saves_per_epoch", value=1,info='Number of checkpoints saved per epoch')

                        debug = gr.Checkbox(label="debug", value=False,visible=False)

                        weight_decay = gr.Number(label="weight_decay", value=0.0,visible=False)
                        wandb_watch = gr.Checkbox(label="wandb_watch", value=False,visible=False)
                        wandb_log_model = gr.Checkbox(label="wandb_log_model", value=False,visible=False)
                        wandb_project = gr.Textbox(label="wandb_project",visible=False)
                        wandb_entity = gr.Textbox(label="wandb_entity",visible=False)
                        wandb_name = gr.Textbox(label="wandb_name",visible=False)


                train_btn = gr.Button("Start Training")
                # Define the output box up front rather than inline in click(),
                # so the component has a name and the wiring is explicit.
                training_output = gr.Textbox(label="Training Output", interactive=False)
                train_btn.click(
                    self.train_model,
                    inputs=[max_steps, base_model, model_type, tokenizer_type, is_llama_derived_model,
                            strict, datasets_path, dataset_format, shards,
                            val_set_size, output_dir, adapter, lora_model_dir, sequence_len, sample_packing,
                            pad_to_sequence_len, lora_r, lora_alpha, lora_dropout,
                            lora_target_modules, lora_target_linear, lora_fan_in_fan_out, gradient_accumulation_steps,
                            micro_batch_size, num_epochs, optimizer, lr_scheduler, learning_rate, train_on_inputs,
                            group_by_length, bf16, fp16, tf32, gradient_checkpointing,
                            resume_from_checkpoint, local_rank, logging_steps, xformers_attention, flash_attention,
                            load_best_model_at_end, warmup_steps, evals_per_epoch, eval_table_size, saves_per_epoch,
                            debug, weight_decay, wandb_project, wandb_entity, wandb_watch,
                            wandb_name, wandb_log_model, last_tab],
                    outputs=[training_output],
                )

        return self.app


if __name__ == "__main__":
    main = Main()
    app = main.initiate_userInterface()
    app.queue().launch(share=True, server_name='0.0.0.0')
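
# Usage note: with share=True, Gradio also prints a temporary public
# *.gradio.live URL in addition to serving the UI locally on the default
# port (http://0.0.0.0:7860).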