# Autoeval

## Install the required libraries

In [None]:
%%capture

# Step 1: install the released Unsloth package (this pip call is not given --no-deps).
!pip install unsloth

# Also get the latest nightly Unsloth!
# Step 2: remove the released package and install the current GitHub version in its place;
# --no-deps means this second install does not touch any dependencies.

!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

# NOTE(review): the reason tf-keras is needed is not visible in this notebook - confirm before removing.
!pip install tf-keras

### Log in to the Hugging Face Hub

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
import os

# Read the Hugging Face access token from the Kaggle secrets store so that it
# never appears in the notebook itself.
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

# Authenticate this session against the Hub without touching the git credential store.
login(token=HF_TOKEN, add_to_git_credential=False)

# Also publish the token through the environment for code that reads HF_TOKEN from there.
os.environ["HF_TOKEN"] = HF_TOKEN

## Training parameters

In [None]:
from unsloth import FastLanguageModel

import torch

# Maximum sequence length (in tokens) used for loading and training.
# Any value can be chosen; Unsloth handles RoPE scaling internally.
max_seq_length = 2048

# Compute dtype. None lets Unsloth auto-detect it
# (float16 for Tesla T4 / V100, bfloat16 for Ampere and newer GPUs).
dtype = None

# Load the weights with 4-bit quantization to reduce memory usage. Can be set to False.
load_in_4bit = True


## Load the source model

In [None]:
# Load the base model and its tokenizer through Unsloth, using the settings
# defined in the "Training parameters" cell.
# Alternative checkpoint: "unsloth/Llama-3.2-1B-Instruct".
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=HF_TOKEN,
)

## Add the PEFT (LoRA) adapters

In [None]:
# Wrap the base model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA rank: any number > 0 (suggested values: 8, 16, 32, 64, 128).
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    # Any dropout value is supported, but 0 is the optimized setting.
    lora_dropout=0,
    # Any bias mode is supported, but "none" is the optimized setting.
    bias="none",
    # True or "unsloth"; "unsloth" uses about 30% less VRAM and fits 2x larger
    # batch sizes, which helps for very long contexts.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    # Rank-stabilized LoRA is supported but disabled here.
    use_rslora=False,
    # LoftQ is supported as well; not used here.
    loftq_config=None,
)

### Read the data

In [None]:
import pandas as pd

# Load the appreciation dataset from the attached Kaggle input directory.
df_data = pd.read_csv(filepath_or_buffer="/kaggle/input/appreciation/appreciation.csv")

# Preview the first five rows.
df_data.head(5)


### Create the messages from the data

The data is a CSV file with the following columns:

```csv

Id,trimestre,résumé,note 1er trimestre,note 2ème trimestre,note 3ème trimestre,comportement 0-10,participation 0-10,travail 0-10,commentaire

0,1,"Mauvais trimestre, manque de travail",5.0,,,5.0,5.0,5.0,X a beaucoup de difficultés dues à des lacunes mais aussi à un manque de travail qui ne permet pas de les combler. Il faut s'y mettre au prochain trimestre.

```

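We need to convert each row into Hugging Face's standard multi-turn chat format: a list of `{"role": ..., "content": ...}` messages (system, user, assistant).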

In [None]:
def _format_value(value):
    """Return ``value`` unchanged, or the string 'N/A' when it is missing (NaN / None / NA)."""
    return 'N/A' if pd.isna(value) else value


def create_training_turn(row):
    """Convert one row of the appreciation data into a three-message chat conversation.

    Args:
        row: A mapping-like row (e.g. a ``pandas.Series``) providing the keys
            'trimestre', 'note 1er trimestre', 'note 2ème trimestre',
            'note 3ème trimestre', 'comportement 0-10', 'participation 0-10',
            'travail 0-10' and 'commentaire'.

    Returns:
        A list of three ``{"role", "content"}`` dicts: the fixed system prompt,
        a user message listing the student's data, and the assistant message
        holding the row's 'commentaire' value.

    Every numeric field is rendered through ``_format_value`` so that a missing
    value always appears as 'N/A' in the prompt instead of the literal 'nan'.
    The 'commentaire' value is passed through untouched.
    """
    system_prompt = (
        "Vous êtes une IA assistant les enseignants à rédiger des appréciations personnalisées "
        "pour leurs élèves en fonction de leurs performances. Vos commentaires doivent être en français, "
        "bienveillants, constructifs, et aider l'élève à comprendre ses points forts et les axes d'amélioration."
    )

    # The blank lines inside this literal are part of the prompt text; keep them as they are.
    user_input = f"""Voici les données de l'élève :



- Trimestre : {_format_value(row['trimestre'])}

- Note du 1er trimestre : {_format_value(row['note 1er trimestre'])}

- Note du 2ᵉ trimestre : {_format_value(row['note 2ème trimestre'])}

- Note du 3ᵉ trimestre : {_format_value(row['note 3ème trimestre'])}

- Comportement (0-10) : {_format_value(row['comportement 0-10'])}

- Participation (0-10) : {_format_value(row['participation 0-10'])}

- Travail (0-10) : {_format_value(row['travail 0-10'])}



Veuillez rédiger une appréciation pour cet élève."""

    assistant_response = row['commentaire']

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input},
        {"role": "assistant", "content": assistant_response},
    ]


### Check the function

In [None]:
# Spot-check the conversion on a single row, picked by position (index 208).
test_row = df_data.iloc[208]

# Display the resulting system / user / assistant messages.
create_training_turn(row=test_row)

### Create the dataset

In [None]:
from datasets import Dataset

# Build one conversation per row of the DataFrame.
multi_turn_data = df_data.apply(create_training_turn, axis="columns")

# Wrap the conversations in a Hugging Face Dataset with a single "conversations" column.
conversations = list(multi_turn_data)
dataset = Dataset.from_dict({"conversations": conversations})

## Apply the chat template to the data

In [None]:
from unsloth.chat_templates import get_chat_template



# Attach the "llama-3.1" chat template to the tokenizer.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")



def formatting_prompts_func(messages):
    """Render each conversation of a batch as one chat-template string.

    Args:
        messages: A batch mapping whose "conversations" entry is a list of
            conversations (each a list of role/content messages).

    Returns:
        A dict with a single "text" key holding one rendered string per
        conversation, in the same order as the input.
    """
    texts = []
    for convo in messages["conversations"]:
        # tokenize=False returns text rather than token ids; no generation
        # prompt is appended.
        rendered = tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )
        texts.append(rendered)
    return {"text": texts}



# Run the formatter over the dataset in batches; it contributes the "text" column.
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)

### Check the formatted data

In [None]:
# Show the rendered chat text of the first example.
dataset[0]["text"]

### Parameters for training

In [None]:
from trl import SFTTrainer

from transformers import TrainingArguments, DataCollatorForSeq2Seq

from unsloth import is_bfloat16_supported



# Supervised fine-tuning trainer for the LoRA-wrapped model, trained on the
# "text" column of `dataset`.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    # Packing is disabled here; enabling it can make training up to 5x faster for short sequences.
    packing = False,
    args = TrainingArguments(
        # 2 samples per device x 4 accumulation steps = 8 samples per optimizer step per device.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16 (exactly one of the two is True).
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
        push_to_hub = True,
        # `hub_model_id` replaces the deprecated `push_to_hub_model_id` argument.
        hub_model_id = "Llama-3.2-3B-appreciation",
    ),
)

In [None]:
from unsloth.chat_templates import train_on_responses_only

# Re-wrap the trainer with Unsloth's `train_on_responses_only`, passing the
# header strings that introduce the user turn and the assistant turn.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

In [None]:
# Decode the input token ids of training example 5 back to text for inspection.
sample_input_ids = trainer.train_dataset[5]["input_ids"]
tokenizer.decode(sample_input_ids)

In [None]:
# Token id of a single space, used as a stand-in for label positions set to -100.
space = tokenizer(" ", add_special_tokens=False).input_ids[0]

# Decode the labels of training example 5 with every -100 entry replaced by a space.
sample_labels = trainer.train_dataset[5]["labels"]
visible_label_ids = [space if label == -100 else label for label in sample_labels]
tokenizer.decode(visible_label_ids)

In [None]:
# Start training; whatever `trainer.train()` returns is kept in `trainer_stats`.
trainer_stats = trainer.train()