# Autoeval

In [None]:
import os
# --- Notebook configuration --------------------------------------------------
# Base checkpoint that gets fine-tuned, and the name of the resulting model.
source_model = "unsloth/Llama-3.2-3B-Instruct"
destination_model = "Llama-3.2-3B-appreciation"

# Dataset identifier handed to `load_dataset` further down.
dataset_url = "eltorio/appreciation"

# Number of training epochs.
epoch = 5

# The presence of /kaggle/working decides both whether the result is pushed
# to the hub and where the training output is written.
push_to_hub = os.path.exists('/kaggle/working')
output_directory = '/kaggle/working' if push_to_hub else './'

# Handle used for the Kaggle model upload at the end of the notebook.
kaggle_model = f"eltorio/{destination_model.lower()}/transformers/default"

## Install the required libraries

In [None]:
%%capture
# Installs every dependency of the notebook; the order of the commands matters.
# NOTE(review): none of the packages is version-pinned (except the bitsandbytes
# wheel), so a fresh run may resolve to different versions — consider pinning.
!pip install -U "safetensors>=0.4.5"
!pip install -U tensorflow
# bitsandbytes is installed from a specific pre-release wheel (0.44.2.dev0).
!pip install -U "https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-0.44.2.dev0-py3-none-manylinux_2_24_x86_64.whl"
# transformers is taken from the GitHub main branch rather than from PyPI.
!pip install -U git+https://github.com/huggingface/transformers.git
!pip install huggingface_hub[cli] accelerate datasets peft
# Remove the existing torch stack, then reinstall it (with xformers) from the
# CUDA 12.1 wheel index.
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
# Install unsloth from PyPI first, then replace the package itself with the
# GitHub version while leaving the already-installed dependencies untouched
# (--no-deps).
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install tf-keras

### Log in to Kaggle

In [None]:
import os
import json

# Write ~/.kaggle/kaggle.json from the KAGGLE_JSON secret, unless the
# Kaggle-provided credentials file is already present.
#
# The secret is looked up first in Kaggle's secret store, then in the
# KAGGLE_JSON environment variable. It is expected to hold the *content* of a
# kaggle.json file, i.e. a JSON document.
if not os.path.exists('/kaggle/.kaggle/kaggle.json'):
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        KAGGLE_JSON = user_secrets.get_secret("KAGGLE_JSON")
    except Exception:
        # Not running on Kaggle (or secret not attached): use the environment.
        KAGGLE_JSON = os.getenv("KAGGLE_JSON")

    kaggle_dir = os.path.expanduser("~/.kaggle")
    kaggle_file = os.path.join(kaggle_dir, "kaggle.json")

    credentials = None
    if not KAGGLE_JSON:
        # Do not overwrite an existing credentials file with "null".
        print("KAGGLE_JSON is not set: kaggle.json was not written")
    elif isinstance(KAGGLE_JSON, str):
        # The secret is JSON text: decode it so the file contains a JSON
        # object and not a double-encoded JSON string.
        try:
            credentials = json.loads(KAGGLE_JSON)
        except ValueError:
            print("KAGGLE_JSON is not valid JSON: kaggle.json was not written")
    else:
        credentials = KAGGLE_JSON

    if credentials is not None:
        os.makedirs(kaggle_dir, exist_ok=True)
        with open(kaggle_file, 'w') as file:
            json.dump(credentials, file)
        # The file holds an API key: restrict access to the current user.
        os.chmod(kaggle_file, 0o600)

### Log in to WandB

In [None]:
import getpass
import os

import wandb

# Make the W&B API key available through the WANDB_API_KEY environment
# variable: Kaggle secret first, then an already-set environment variable,
# and as a last resort an interactive prompt.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    WANDB_API_KEY = user_secrets.get_secret("WANDB_API_KEY")
    os.environ["WANDB_API_KEY"] = WANDB_API_KEY
except Exception:
    if os.getenv("WANDB_API_KEY") is None:
        # getpass keeps the key out of the notebook's saved output,
        # unlike input() which echoes what is typed.
        os.environ["WANDB_API_KEY"] = getpass.getpass("Enter your W&B API key: ")

if not wandb.login():
    raise Exception("Can't login to W&B")

print("Logged in to W&B")
# Runs are grouped under a W&B project named after the destination model.
os.environ["WANDB_PROJECT"] = destination_model

### Log in to the Hugging Face Hub

In [None]:
from huggingface_hub import login
import os

# Resolve the Hugging Face token: Kaggle secret first, then the HF_TOKEN
# environment variable. HF_TOKEN is reused later when loading the model.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
    os.environ["HF_TOKEN"] = HF_TOKEN
except Exception:
    HF_TOKEN = os.getenv("HF_TOKEN")
    if not HF_TOKEN:
        raise ValueError("You need to set the HF_TOKEN environment variable.")

# The token itself is deliberately not printed: cell outputs are saved with
# the notebook and would leak the secret.
print("Logging in to the Hugging Face Hub")
login(
    token=HF_TOKEN,
    add_to_git_credential=False,
)

## Training parameters

In [None]:
from unsloth import FastLanguageModel
import torch

# Maximum sequence length, passed to both the model loader and the trainer.
max_seq_length = 2048

# None lets the loader pick the dtype automatically
# (float16 on Tesla T4 / V100, bfloat16 on Ampere and newer).
dtype = None

# Load the weights 4-bit quantized to lower memory usage; may be set to False.
load_in_4bit = True


## Load the source model

In [None]:
# Load the base model and its tokenizer with the settings chosen above.
# `source_model` comes from the configuration cell; another checkpoint such as
# "unsloth/Llama-3.2-1B-Instruct" can be used instead.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=source_model,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=HF_TOKEN,
)

## Add the Peft model

In [None]:
# Wrap the loaded model with LoRA adapters.
# NOTE: this rebinds `model`, so re-running the cell wraps the model again —
# re-run the loading cell first.
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA rank: any value > 0 (8, 16, 32, 64, 128 are the suggested ones).
    r=16,
    # Layers that receive an adapter.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    # Any value is supported, 0 is the optimized setting.
    lora_dropout=0,
    # Any value is supported, "none" is the optimized setting.
    bias="none",
    # True or "unsloth"; "unsloth" lowers VRAM usage for very long contexts.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    # Rank-stabilized LoRA and LoftQ are available but not used here.
    use_rslora=False,
    loftq_config=None,
)

### Read the data

In [None]:
from datasets import load_dataset
# Load the dataset identified by `dataset_url` (set in the configuration cell).
dataset = load_dataset(dataset_url)
# Show the 'train' split as the cell output.
dataset['train']

### Create the messages from the data

The data is in the form of a csv file with the following columns:

```csv

Id,redoublant,matière,trimestre,note 1er trimestre,note 2ème trimestre,note 3ème trimestre,comportement 0-10,participation 0-10,travail 0-10,commentaire

0,0,,1,"Mauvais trimestre, manque de travail",5.0,,,5.0,5.0,5.0,X a beaucoup de difficultés dues à des lacunes mais aussi à un manque de travail qui ne permet pas de les combler. Il faut s'y mettre au prochain trimestre.

```

Each row must be converted into Hugging Face's standard multi-turn conversation format (a list of `system`, `user` and `assistant` messages).

In [None]:
def _format_moyenne(value):
    """Return a grade unchanged when it is a real number, else the placeholder 'N/A'."""
    import math  # local import keeps the cell self-contained

    # Missing grades show up as None or NaN; anything non-numeric is also
    # treated as "not available".
    if isinstance(value, (int, float)) and not math.isnan(value):
        return value
    return 'N/A'


def create_training_turn(row):
    """Build one system/user/assistant conversation from a dataset row.

    Args:
        row: mapping with the keys 'trimestre', 'redoublant',
            'note 1er trimestre', 'note 2ème trimestre', 'note 3ème trimestre',
            'comportement 0-10', 'participation 0-10', 'travail 0-10' and
            'commentaire'.

    Returns:
        ``{"conversation": [system, user, assistant]}`` where each message is a
        ``{"role": ..., "content": ...}`` dict. The user message describes the
        pupil's results for the given term; the assistant message is the
        row's 'commentaire'.

    Raises:
        ValueError: if ``row['trimestre']`` is not 1, 2 or 3.
    """
    trimestre = row['trimestre']
    redoublant = 'redoublant ' if row['redoublant'] == 1 else ''
    # Numeric grades are inserted as they are; missing ones become 'N/A'.
    moyenne_1 = _format_moyenne(row['note 1er trimestre'])
    moyenne_2 = _format_moyenne(row['note 2ème trimestre'])
    moyenne_3 = _format_moyenne(row['note 3ème trimestre'])
    comportement = row['comportement 0-10']
    participation = row['participation 0-10']
    travail = row['travail 0-10']
    system_prompt = "Vous êtes une IA assistant les enseignants d'histoire-géographie en rédigeant à leur place une appréciation personnalisée pour leur élève en fonction de ses performances. Votre appréciation doit être en français formel et impersonnel. Votre appréciation doit être bienveillante, constructive, et aider l'élève à comprendre ses points forts et les axes d'amélioration. Votre appréciation doit comporter de 8 à 250 caractères. Votre appréciation ne doit jamais comporter les valeurs des notes. "

    # Only the term name and the list of averages differ between terms.
    if trimestre == 1:
        trimestre_full = "premier trimestre"
        moyennes = f"{moyenne_1} de moyenne"
    elif trimestre == 2:
        trimestre_full = "deuxième trimestre"
        moyennes = f"{moyenne_2} de moyenne ce trimestre et {moyenne_1} au premier trimestre"
    elif trimestre == 3:
        trimestre_full = "troisième trimestre"
        moyennes = f"{moyenne_3} de moyenne ce trimestre, {moyenne_2} au deuxième trimestre et {moyenne_1} au premier trimestre"
    else:
        # Fail with an explicit message instead of an UnboundLocalError below.
        raise ValueError(f"Unsupported value for 'trimestre': {trimestre!r} (expected 1, 2 or 3)")

    user_input = (
        f"Veuillez rédiger une appréciation en moins de 250 caractères pour le {trimestre_full} pour cet élève {redoublant}qui a eu {moyennes}"
        f", j'ai évalué son comportement à {comportement}/10, sa participation à {participation}/10 et son travail à {travail}/10. Les notes ne doivent pas apparaître dans l'appréciation."
    )

    assistant_response = row['commentaire']

    return {"conversation": [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input},
        {"role": "assistant", "content": assistant_response},
    ]}


### Check the function

In [None]:
# Take one row of the 'train' split and display the conversation built from it.
test_row = dataset['train'][68]
create_training_turn(test_row)

### Create the dataset

In [None]:
# Apply `create_training_turn` to every row; each row of the result carries
# the "conversation" entry the function returns.
multi_turn_dataset = dataset.map(create_training_turn)
# Display one mapped row of the 'train' split.
multi_turn_dataset['train'][68]

## Apply the chat template to the data

In [None]:
from unsloth.chat_templates import get_chat_template

# Attach the "llama-3.1" chat template to the tokenizer; it is used below to
# render each conversation as text.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def formatting_prompts_func(messages):
    """Render every conversation of a batch as one chat-template string.

    Args:
        messages: a batch whose "conversation" entry is a list of
            conversations (lists of role/content messages).

    Returns:
        ``{"text": [...]}`` with one rendered string per conversation. The
        text is not tokenized and no generation prompt is appended.
    """
    rendered = []
    for convo in messages["conversation"]:
        rendered.append(
            tokenizer.apply_chat_template(
                convo,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Add the rendered "text" column; batched=True hands lists of rows to the
# formatting function.
multi_turn_dataset = multi_turn_dataset.map(formatting_prompts_func, batched=True)

### Check the formatted data

In [None]:
# Display the rendered chat text of one training example.
multi_turn_dataset["train"]["text"][278]

### Parameters for training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 when the hardware supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation, evaluation, checkpointing and reporting settings.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    # One full training run of `epoch` epochs (max_steps is not used).
    num_train_epochs=epoch,
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    # NOTE(review): presumably has no effect while logging_strategy is
    # "epoch" — confirm against the TrainingArguments documentation.
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    # Checkpoints are written here; the publish step looks them up later.
    output_dir=output_directory,
    report_to="wandb",
    push_to_hub=push_to_hub,
    hub_model_id=destination_model,
)

# Supervised fine-tuning on the rendered "text" column, evaluated on the
# 'validation' split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=multi_turn_dataset["train"],
    eval_dataset=multi_turn_dataset["validation"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    # Packing is disabled; it can speed up training on short sequences.
    packing=False,
    args=training_args,
)

In [None]:
from unsloth.chat_templates import train_on_responses_only

# Restrict training to the assistant replies: the two markers below are the
# header strings that introduce a user turn and an assistant turn.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

In [None]:
# Decode the input ids of one training example back to text for inspection.
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

In [None]:
# Token id of a single space, used as a visible stand-in for masked positions.
space = tokenizer(" ", add_special_tokens=False).input_ids[0]
# Labels equal to -100 are swapped for the space token so that the remaining
# labels can be decoded and inspected.
tokenizer.decode([
    space if label == -100 else label
    for label in trainer.train_dataset[5]["labels"]
])

### Train the model

In [None]:
# Run the training and keep the value returned by `train()` for later inspection.
trainer_stats = trainer.train()

## Publish to Kaggle

In [None]:
import kagglehub
import os
import re

def get_latest_checkpoint(directory):
    """Return the path of the highest-numbered ``checkpoint-<N>`` sub-directory.

    Args:
        directory: directory that contains the checkpoint folders.

    Returns:
        ``os.path.join(directory, name)`` for the sub-directory named
        ``checkpoint-<N>`` with the largest integer ``N``.

    Raises:
        ValueError: if ``directory`` holds no ``checkpoint-<N>`` sub-directory.
    """
    checkpoint_pattern = re.compile(r'checkpoint-(\d+)')

    # Keep only sub-directories whose whole name is "checkpoint-<digits>".
    # A full match is required so that names such as "checkpoint-20abc" are
    # ignored instead of breaking the integer conversion.
    checkpoint_steps = {}
    for entry in os.listdir(directory):
        match = checkpoint_pattern.fullmatch(entry)
        if match and os.path.isdir(os.path.join(directory, entry)):
            checkpoint_steps[entry] = int(match.group(1))
    print(list(checkpoint_steps))

    if not checkpoint_steps:
        raise ValueError(f"No 'checkpoint-<N>' directory found in {directory!r}")

    # Pick the directory with the highest numeric suffix.
    max_checkpoint = max(checkpoint_steps, key=checkpoint_steps.get)
    print(max_checkpoint)
    return os.path.join(directory, max_checkpoint)


# Locate the most recent checkpoint written by the trainer.
latest_checkpoint = get_latest_checkpoint(output_directory)
print(f'The newest model is : {latest_checkpoint}')

# Authenticate against Kaggle, then upload the checkpoint directory under the
# model handle defined in the configuration cell.
kagglehub.login()
kagglehub.model_upload(handle=kaggle_model, local_model_dir=latest_checkpoint)
