inoid committed
Commit 478d560
Parent: a3a731c

Add finetuning process configuration to model

Files changed (2):
  1. app.py +12 -33
  2. spanish_medica_llm.py +157 -3
app.py CHANGED
@@ -10,7 +10,7 @@ import sys
import torch


- from spanish_medica_llm import run_training, run_training_process
+ from spanish_medica_llm import run_training, run_training_process, run_finnetuning_process

import gradio as gr

@@ -45,41 +45,18 @@ def train_model(*inputs):
    if "IS_SHARED_UI" in os.environ:
        raise gr.Error("This Space only works in duplicated instances")

-     # args_general = argparse.Namespace(
-     #     image_captions_filename = True,
-     #     train_text_encoder = True,
-     #     #stop_text_encoder_training = stptxt,
-     #     save_n_steps = 0,
-     #     #pretrained_model_name_or_path = model_to_load,
-     #     instance_data_dir="instance_images",
-     #     #class_data_dir=class_data_dir,
-     #     output_dir="output_model",
-     #     instance_prompt="",
-     #     seed=42,
-     #     resolution=512,
-     #     mixed_precision="fp16",
-     #     train_batch_size=1,
-     #     gradient_accumulation_steps=1,
-     #     use_8bit_adam=True,
-     #     learning_rate=2e-6,
-     #     lr_scheduler="polynomial",
-     #     lr_warmup_steps = 0,
-     #     #max_train_steps=Training_Steps,
-     # )
-     # run_training(args_general)
-     # torch.cuda.empty_cache()
-     # #convert("output_model", "model.ckpt")
-     # #shutil.rmtree('instance_images')
-     # #shutil.make_archive("diffusers_model", 'zip', "output_model")
-     # #with zipfile.ZipFile('diffusers_model.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
-     # #    zipdir('output_model/', zipf)
-     # torch.cuda.empty_cache()
-     # return [gr.update(visible=True, value=["diffusers_model.zip"]), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
-     run_training_process()
- 
+     run_training_process()

    return f"Train Model Successful!!!"

+ def finnetuning_model(*inputs):
+     if "IS_SHARED_UI" in os.environ:
+         raise gr.Error("This Space only works in duplicated instances")
+ 
+     run_finnetuning_process()
+ 
+     return f"Finnetuning Model Successful!!!"
+ 
def stop_model(*input):
    return f"Model with Gradio!"

@@ -93,6 +70,8 @@ with gr.Blocks() as demo:
    btn_response.click(fn=generate_model, inputs=inp, outputs=out)
    btn_train = gr.Button("Train Model")
    btn_train.click(fn=train_model, inputs=[], outputs=out)
+     btn_finnetuning = gr.Button("Finnetuning Model")
+     btn_finnetuning.click(fn=finnetuning_model, inputs=[], outputs=out)
    btn_evaluate = gr.Button("Evaluate Model")
    btn_evaluate.click(fn=evaluate_model, inputs=[], outputs=out)
    btn_stop = gr.Button("Stop Model")
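The new wiring mirrors the existing buttons: a click with inputs=[] calls the handler with no arguments, and the returned string lands in the shared output textbox. A minimal standalone sketch of that pattern (the handler body is stubbed here; in the Space it guards on IS_SHARED_UI and then calls run_finnetuning_process):

    import gradio as gr

    def finnetuning_model(*inputs):
        # Stub: the Space's real handler raises gr.Error when IS_SHARED_UI
        # is set, then delegates to run_finnetuning_process().
        return "Finnetuning Model Successful!!!"

    with gr.Blocks() as demo:
        out = gr.Textbox(label="status")
        btn_finnetuning = gr.Button("Finnetuning Model")
        # inputs=[] means the callback receives no arguments; its return
        # value is written to the textbox.
        btn_finnetuning.click(fn=finnetuning_model, inputs=[], outputs=out)

    demo.launch()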
spanish_medica_llm.py CHANGED
@@ -331,6 +331,45 @@ MAX_TRAINING_STEPS = 2

TOKEN_NAME = TOKEN_MISTRAL_NAME

+ def get_chat_format(element):
+     """
+     Structures a single dataset sample for chat-style fine-tuning.
+ 
+     The sample's "raw_text" (a clinical case) is wrapped into an ordered list
+     of messages, each annotated with its role in the conversation: a fixed
+     system message, a user message built from the diagnosis prompt template,
+     and an assistant message carrying the expected answer ("topic").
+ 
+     Parameters
+     ----------
+     element : dict
+         A single dataset sample; it must contain the keys "raw_text" and "topic".
+ 
+     Returns
+     -------
+     dict
+         The same sample with "raw_text" replaced by the list of role-annotated
+         messages.
+     """
+ 
+     prompt_template = """A partir del caso clínico que se expone a continuación, tu tarea es la siguiente.
+     Como médico experto, tu tarea es la de diagnosticar al paciente en base al caso clínico. Responde únicamente con el diagnóstico para el paciente de forma concisa.
+     Caso clínico: {caso_clinico}
+     """
+ 
+     system_prompt = "Eres un experto en medicina que realiza diagnósticos en base a casos clínicos."
+ 
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": prompt_template.format(caso_clinico=element["raw_text"])},
+         {"role": "assistant", "content": element["topic"]},
+     ]
+ 
+     element["raw_text"] = messages
+     return element
+ 
def loadSpanishTokenizer():
    """

@@ -379,12 +418,32 @@ def splitDatasetInTestValid(dataset):
    return (dataset['train'], eval_dataset, test_dataset)

def loadSpanishDataset():
+ 
    spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
    spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] not in FILTER_CRITERIA)
    spanishMedicaLllmDataset = spanishMedicaLllmDataset.train_test_split(0.2, seed=203984)
    return spanishMedicaLllmDataset

+ def loadSpanishDatasetFinnetuning():
+ 
+     spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
+     spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] in FILTER_CRITERIA)
+     return spanishMedicaLllmDataset
+ 
##See Jupyter Notebook for change CONTEXT_LENGTH size
+ def applyChatInstructFormat(dataset, filterColumns = ['raw_text', 'topic']):
+     """
+     Apply the instruction chat template to every sample in the dataset.
+     """
+     if dataset is None:
+         return dataset
+     else:
+         dataset = dataset.remove_columns([col for col in dataset.features if col not in filterColumns])
+         return dataset.map(
+             get_chat_format,
+             batched=False,
+             num_proc=4
+         )

def accelerateConfigModel():
    """
@@ -483,6 +542,26 @@ def modelLoraConfigBioMistral(model):
    model = accelerator.prepare_model(model)
    return (model)

+ def getLoraConfiguration():
+     """
+     Build the LoRA adapter configuration used for the fine-tuning stage.
+     """
+     return LoraConfig(
+         r=8,
+         lora_alpha=16,
+         target_modules=[
+             "q_proj",
+             "k_proj",
+             "v_proj",
+             "o_proj",
+             "gate_proj",
+             "up_proj",
+             "down_proj",
+             "lm_head",
+         ],
+         bias="none",
+         lora_dropout=0.05,  # conventional default
+         task_type="CAUSAL_LM",
+     )

# A note on training. You can set the max_steps to be high initially, and examine at what step your
# model's performance starts to degrade. That is where you'll find a sweet spot for how many steps
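With r=8 and lora_alpha=16 the adapter update is scaled by lora_alpha / r = 2, and limiting training to the listed projection layers plus lm_head keeps the trainable fraction of a 7B model small. A sketch of attaching the same configuration outside the trainer, assuming BioMistral/BioMistral-7B as the base checkpoint (SFTTrainer does this internally when given peft_config=getLoraConfiguration()):

    from transformers import AutoModelForCausalLM
    from peft import get_peft_model

    from spanish_medica_llm import getLoraConfiguration

    # Assumed base checkpoint; the module's loadBaseModel wraps the real choice.
    base = AutoModelForCausalLM.from_pretrained("BioMistral/BioMistral-7B")
    peft_model = get_peft_model(base, getLoraConfiguration())
    # Reports trainable vs. total parameters, e.g. well under 1% trainable.
    peft_model.print_trainable_parameters()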
 
 
@@ -541,10 +620,85 @@ def configAndRunTraining(basemodel, dataset, eval_dataset, tokenizer):

    trainer.push_to_hub()

-
-
+ def configAndRunFineTuning(basemodel, dataset, eval_dataset, tokenizer):
+     if basemodel is None or dataset is None or tokenizer is None:
+         return None
+     else:
+         tokenizer.pad_token = tokenizer.eos_token
+ 
+         training_args = TrainingArguments(
+             output_dir=output_dir,
+             push_to_hub=True,
+             hub_private_repo=False,
+             hub_model_id=HUB_MODEL_ID,
+             warmup_steps=5,
+             per_device_train_batch_size=MICRO_BATCH_SIZE,
+             per_device_eval_batch_size=1,
+             #gradient_checkpointing=True,
+             gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
+             num_train_epochs=1,
+             learning_rate=2.5e-5,          # about 10x smaller than the Mistral pre-training learning rate
+             logging_steps=5,
+             optim="paged_adamw_8bit",
+             logging_dir="./logs",          # directory for storing logs
+             save_strategy="steps",         # save a checkpoint every save_steps
+             save_steps=50,                 # save checkpoints every 50 steps
+             evaluation_strategy="steps",   # evaluate every eval_steps
+             eval_steps=50,                 # evaluate every 50 steps
+             do_eval=True,                  # also evaluate at the end of training
+             save_total_limit=2,
+             remove_unused_columns=True,
+             report_to=None,                # set to "wandb" to log to Weights & Biases
+             run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # name of the W&B run (optional)
+             fp16=True,   # for a T4 GPU; on more capable hardware (e.g. A100) use fp16=False, bf16=True
+             bf16=False
+         )
+ 
+         trainer = SFTTrainer(
+             model=basemodel,
+             train_dataset=dataset,
+             eval_dataset=eval_dataset,
+             peft_config=getLoraConfiguration(),
+             dataset_text_field="raw_text",
+             max_seq_length=1024,
+             tokenizer=tokenizer,
+             args=training_args,
+             dataset_kwargs={
+                 "add_special_tokens": False,   # we template with special tokens
+                 "append_concat_token": False,  # no need for an additional separator token
+             },
+             packing=True
+         )
+         basemodel.config.use_cache = False  # silence the warnings; re-enable for inference
+         trainer.train()
+ 
+         trainer.push_to_hub()
+ 

def run_training_process():
+     # Log in to Hugging Face
+     login(token = os.environ.get('HG_FACE_TOKEN'))
+     os.environ['WANDB_DISABLED'] = 'true'
+     tokenizer = loadSpanishTokenizer()
+     medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning() )
+     medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
+ 
+     # train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
+     #     getTokenizedDataset( medicalSpanishDataset, tokenizer)
+     # )
+ 
+     train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
+ 
+     base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
+     base_model = modelLoraConfigBioMistral(base_model)
+ 
+     configAndRunTraining(base_model, train_dataset, eval_dataset, tokenizer)
+ 
+ def run_finnetuning_process():
    # Log in to Hugging Face
    login(token = os.environ.get('HG_FACE_TOKEN'))
    os.environ['WANDB_DISABLED'] = 'true'
@@ -554,7 +708,7 @@ def run_training_process():
        getTokenizedDataset( medicalSpanishDataset, tokenizer)
    )

-     base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
-     base_model = modelLoraConfigBioMistral(base_model)
+     base_model = loadBaseModel(HUB_MODEL_ID)

-     configAndRunTraining(base_model,train_dataset, eval_dataset, tokenizer)
+     configAndRunFineTuning(base_model, train_dataset, eval_dataset, tokenizer)
+ 
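Taken together, run_finnetuning_process is the entry point the Space's new button reaches. Inlined, its steps look roughly like the sketch below; it assumes HG_FACE_TOKEN is set and that the unchanged middle of the function still loads the corpus with loadSpanishDataset, as the pre-training path did:

    import os
    from huggingface_hub import login

    from spanish_medica_llm import (
        HUB_MODEL_ID, loadSpanishTokenizer, loadSpanishDataset,
        getTokenizedDataset, splitDatasetInTestValid,
        loadBaseModel, configAndRunFineTuning,
    )

    login(token=os.environ.get("HG_FACE_TOKEN"))  # needs a valid write token
    os.environ["WANDB_DISABLED"] = "true"

    tokenizer = loadSpanishTokenizer()
    dataset = loadSpanishDataset()
    train_ds, eval_ds, test_ds = splitDatasetInTestValid(
        getTokenizedDataset(dataset, tokenizer)
    )

    # Fine-tune on top of the checkpoint previously pushed to the Hub,
    # not the raw BioMistral base model.
    base_model = loadBaseModel(HUB_MODEL_ID)
    configAndRunFineTuning(base_model, train_ds, eval_ds, tokenizer)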