inoid committed
Commit 478d560
Parent: a3a731c

Add finetuning process configuration to model

Files changed (2):
  1. app.py +12 -33
  2. spanish_medica_llm.py +157 -3
app.py CHANGED
@@ -10,7 +10,7 @@ import sys
import torch


- from spanish_medica_llm import run_training, run_training_process
+ from spanish_medica_llm import run_training, run_training_process, run_finnetuning_process

import gradio as gr

@@ -45,41 +45,18 @@ def train_model(*inputs):
    if "IS_SHARED_UI" in os.environ:
        raise gr.Error("This Space only works in duplicated instances")

-     # args_general = argparse.Namespace(
-     #     image_captions_filename = True,
-     #     train_text_encoder = True,
-     #     #stop_text_encoder_training = stptxt,
-     #     save_n_steps = 0,
-     #     #pretrained_model_name_or_path = model_to_load,
-     #     instance_data_dir="instance_images",
-     #     #class_data_dir=class_data_dir,
-     #     output_dir="output_model",
-     #     instance_prompt="",
-     #     seed=42,
-     #     resolution=512,
-     #     mixed_precision="fp16",
-     #     train_batch_size=1,
-     #     gradient_accumulation_steps=1,
-     #     use_8bit_adam=True,
-     #     learning_rate=2e-6,
-     #     lr_scheduler="polynomial",
-     #     lr_warmup_steps = 0,
-     #     #max_train_steps=Training_Steps,
-     # )
-     # run_training(args_general)
-     # torch.cuda.empty_cache()
-     # #convert("output_model", "model.ckpt")
-     # #shutil.rmtree('instance_images')
-     # #shutil.make_archive("diffusers_model", 'zip', "output_model")
-     # #with zipfile.ZipFile('diffusers_model.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
-     # #    zipdir('output_model/', zipf)
-     # torch.cuda.empty_cache()
-     # return [gr.update(visible=True, value=["diffusers_model.zip"]), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)]
-     run_training_process()
- 
+     run_training_process()

    return f"Train Model Successful!!!"

+ def finnetuning_model(*inputs):
+     if "IS_SHARED_UI" in os.environ:
+         raise gr.Error("This Space only works in duplicated instances")
+ 
+     run_finnetuning_process()
+ 
+     return f"Finnetuning Model Successful!!!"
+ 
def stop_model(*input):
    return f"Model with Gradio!"

@@ -93,6 +70,8 @@ with gr.Blocks() as demo:
    btn_response.click(fn=generate_model, inputs=inp, outputs=out)
    btn_train = gr.Button("Train Model")
    btn_train.click(fn=train_model, inputs=[], outputs=out)
+     btn_finnetuning = gr.Button("Finnetuning Model")
+     btn_finnetuning.click(fn=finnetuning_model, inputs=[], outputs=out)
    btn_evaluate = gr.Button("Evaluate Model")
    btn_evaluate.click(fn=evaluate_model, inputs=[], outputs=out)
    btn_stop = gr.Button("Stop Model")
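The new wiring mirrors the existing buttons: a click with inputs=[] calls the handler with no arguments, and the returned string lands in the shared output textbox. A minimal standalone sketch of that pattern (the handler body is stubbed here; in the Space it guards on IS_SHARED_UI and then calls run_finnetuning_process):

    import gradio as gr

    def finnetuning_model(*inputs):
        # Stub: the Space's real handler raises gr.Error when IS_SHARED_UI
        # is set, then delegates to run_finnetuning_process().
        return "Finnetuning Model Successful!!!"

    with gr.Blocks() as demo:
        out = gr.Textbox(label="status")
        btn_finnetuning = gr.Button("Finnetuning Model")
        # inputs=[] means the callback receives no arguments; its return
        # value is written to the textbox.
        btn_finnetuning.click(fn=finnetuning_model, inputs=[], outputs=out)

    demo.launch()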
spanish_medica_llm.py CHANGED
@@ -331,6 +331,45 @@ MAX_TRAINING_STEPS = 2

TOKEN_NAME = TOKEN_MISTRAL_NAME

+ def get_chat_format(element):
+     """
+     Structures a single dataset sample for chat-style fine-tuning.
+ 
+     The sample's "raw_text" (a clinical case) is wrapped into an ordered list
+     of messages, each annotated with its role in the conversation: a fixed
+     system message, a user message built from the diagnosis prompt template,
+     and an assistant message carrying the expected answer ("topic").
+ 
+     Parameters
+     ----------
+     element : dict
+         A single dataset sample; it must contain the keys "raw_text" and "topic".
+ 
+     Returns
+     -------
+     dict
+         The same sample with "raw_text" replaced by the list of role-annotated
+         messages.
+     """
+ 
+     prompt_template = """A partir del caso clínico que se expone a continuación, tu tarea es la siguiente.
+     Como médico experto, tu tarea es la de diagnosticar al paciente en base al caso clínico. Responde únicamente con el diagnóstico para el paciente de forma concisa.
+     Caso clínico: {caso_clinico}
+     """
+ 
+     system_prompt = "Eres un experto en medicina que realiza diagnósticos en base a casos clínicos."
+ 
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": prompt_template.format(caso_clinico=element["raw_text"])},
+         {"role": "assistant", "content": element["topic"]},
+     ]
+ 
+     element["raw_text"] = messages
+     return element
+ 
def loadSpanishTokenizer():
    """

@@ -379,12 +418,32 @@ def splitDatasetInTestValid(dataset):
    return (dataset['train'], eval_dataset, test_dataset)

def loadSpanishDataset():
+ 
    spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
    spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] not in FILTER_CRITERIA)
    spanishMedicaLllmDataset = spanishMedicaLllmDataset.train_test_split(0.2, seed=203984)
    return spanishMedicaLllmDataset

+ def loadSpanishDatasetFinnetuning():
+ 
+     spanishMedicaLllmDataset = load_dataset(SPANISH_MEDICA_LLM_DATASET, split="train")
+     spanishMedicaLllmDataset = spanishMedicaLllmDataset.filter(lambda example: example["topic_type"] in FILTER_CRITERIA)
+     return spanishMedicaLllmDataset
+ 
##See Jupyter Notebook for change CONTEXT_LENGTH size
+ def applyChatInstructFormat(dataset, filterColumns = ['raw_text', 'topic']):
+     """
+     Apply the instruction chat template to every sample in the dataset.
+     """
+     if dataset is None:
+         return dataset
+     else:
+         dataset = dataset.remove_columns([col for col in dataset.features if col not in filterColumns])
+         return dataset.map(
+             get_chat_format,
+             batched=False,
+             num_proc=4
+         )

def accelerateConfigModel():
    """
@@ -483,6 +542,26 @@ def modelLoraConfigBioMistral(model):
    model = accelerator.prepare_model(model)
    return (model)

+ def getLoraConfiguration():
+     """
+     Build the LoRA adapter configuration used for the fine-tuning stage.
+     """
+     return LoraConfig(
+         r=8,
+         lora_alpha=16,
+         target_modules=[
+             "q_proj",
+             "k_proj",
+             "v_proj",
+             "o_proj",
+             "gate_proj",
+             "up_proj",
+             "down_proj",
+             "lm_head",
+         ],
+         bias="none",
+         lora_dropout=0.05,  # conventional default
+         task_type="CAUSAL_LM",
+     )

# A note on training. You can set the max_steps to be high initially, and examine at what step your
# model's performance starts to degrade. That is where you'll find a sweet spot for how many steps
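With r=8 and lora_alpha=16 the adapter update is scaled by lora_alpha / r = 2, and limiting training to the listed projection layers plus lm_head keeps the trainable fraction of a 7B model small. A sketch of attaching the same configuration outside the trainer, assuming BioMistral/BioMistral-7B as the base checkpoint (SFTTrainer does this internally when given peft_config=getLoraConfiguration()):

    from transformers import AutoModelForCausalLM
    from peft import get_peft_model

    from spanish_medica_llm import getLoraConfiguration

    # Assumed base checkpoint; the module's loadBaseModel wraps the real choice.
    base = AutoModelForCausalLM.from_pretrained("BioMistral/BioMistral-7B")
    peft_model = get_peft_model(base, getLoraConfiguration())
    # Reports trainable vs. total parameters, e.g. well under 1% trainable.
    peft_model.print_trainable_parameters()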
 
 
@@ -541,10 +620,85 @@ def configAndRunTraining(basemodel, dataset, eval_dataset, tokenizer):

    trainer.push_to_hub()

-
-
+ def configAndRunFineTuning(basemodel, dataset, eval_dataset, tokenizer):
+     if basemodel is None or dataset is None or tokenizer is None:
+         return None
+     else:
+         tokenizer.pad_token = tokenizer.eos_token
+ 
+         training_args = TrainingArguments(
+             output_dir=output_dir,
+             push_to_hub=True,
+             hub_private_repo=False,
+             hub_model_id=HUB_MODEL_ID,
+             warmup_steps=5,
+             per_device_train_batch_size=MICRO_BATCH_SIZE,
+             per_device_eval_batch_size=1,
+             #gradient_checkpointing=True,
+             gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
+             num_train_epochs=1,
+             learning_rate=2.5e-5,          # about 10x smaller than the Mistral pre-training learning rate
+             logging_steps=5,
+             optim="paged_adamw_8bit",
+             logging_dir="./logs",          # directory for storing logs
+             save_strategy="steps",         # save a checkpoint every save_steps
+             save_steps=50,                 # save checkpoints every 50 steps
+             evaluation_strategy="steps",   # evaluate every eval_steps
+             eval_steps=50,                 # evaluate every 50 steps
+             do_eval=True,                  # also evaluate at the end of training
+             save_total_limit=2,
+             remove_unused_columns=True,
+             report_to=None,                # set to "wandb" to log to Weights & Biases
+             run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",  # name of the W&B run (optional)
+             fp16=True,   # for a T4 GPU; on more capable hardware (e.g. A100) use fp16=False, bf16=True
+             bf16=False
+         )
+ 
+         trainer = SFTTrainer(
+             model=basemodel,
+             train_dataset=dataset,
+             eval_dataset=eval_dataset,
+             peft_config=getLoraConfiguration(),
+             dataset_text_field="raw_text",
+             max_seq_length=1024,
+             tokenizer=tokenizer,
+             args=training_args,
+             dataset_kwargs={
+                 "add_special_tokens": False,   # we template with special tokens
+                 "append_concat_token": False,  # no need for an additional separator token
+             },
+             packing=True
+         )
+         basemodel.config.use_cache = False  # silence the warnings; re-enable for inference
+         trainer.train()
+ 
+         trainer.push_to_hub()
+ 

def run_training_process():
+     # Log in to Hugging Face
+     login(token = os.environ.get('HG_FACE_TOKEN'))
+     os.environ['WANDB_DISABLED'] = 'true'
+     tokenizer = loadSpanishTokenizer()
+     medicalSpanishDataset = applyChatInstructFormat( loadSpanishDatasetFinnetuning() )
+     medicalSpanishDataset = medicalSpanishDataset.train_test_split(0.2, seed=203984)
+ 
+     # train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid(
+     #     getTokenizedDataset( medicalSpanishDataset, tokenizer)
+     # )
+ 
+     train_dataset, eval_dataset, test_dataset = splitDatasetInTestValid( medicalSpanishDataset )
+ 
+     base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
+     base_model = modelLoraConfigBioMistral(base_model)
+ 
+     configAndRunTraining(base_model, train_dataset, eval_dataset, tokenizer)
+ 
+ def run_finnetuning_process():
    # Log in to Hugging Face
    login(token = os.environ.get('HG_FACE_TOKEN'))
    os.environ['WANDB_DISABLED'] = 'true'
@@ -554,7 +708,7 @@ def run_training_process():
        getTokenizedDataset( medicalSpanishDataset, tokenizer)
    )

-     base_model = loadBaseModel(MISTRAL_BASE_MODEL_ID)
-     base_model = modelLoraConfigBioMistral(base_model)
+     base_model = loadBaseModel(HUB_MODEL_ID)

-     configAndRunTraining(base_model,train_dataset, eval_dataset, tokenizer)
+     configAndRunFineTuning(base_model, train_dataset, eval_dataset, tokenizer)
+ 
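Taken together, run_finnetuning_process is the entry point the Space's new button reaches. Inlined, its steps look roughly like the sketch below; it assumes HG_FACE_TOKEN is set and that the unchanged middle of the function still loads the corpus with loadSpanishDataset, as the pre-training path did:

    import os
    from huggingface_hub import login

    from spanish_medica_llm import (
        HUB_MODEL_ID, loadSpanishTokenizer, loadSpanishDataset,
        getTokenizedDataset, splitDatasetInTestValid,
        loadBaseModel, configAndRunFineTuning,
    )

    login(token=os.environ.get("HG_FACE_TOKEN"))  # needs a valid write token
    os.environ["WANDB_DISABLED"] = "true"

    tokenizer = loadSpanishTokenizer()
    dataset = loadSpanishDataset()
    train_ds, eval_ds, test_ds = splitDatasetInTestValid(
        getTokenizedDataset(dataset, tokenizer)
    )

    # Fine-tune on top of the checkpoint previously pushed to the Hub,
    # not the raw BioMistral base model.
    base_model = loadBaseModel(HUB_MODEL_ID)
    configAndRunFineTuning(base_model, train_ds, eval_ds, tokenizer)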