File size: 20,016 Bytes

f4f7ed0

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Autoeval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-12-02T11:56:29.397635Z",
     "iopub.status.busy": "2024-12-02T11:56:29.397111Z",
     "iopub.status.idle": "2024-12-02T11:56:29.411850Z",
     "shell.execute_reply": "2024-12-02T11:56:29.410508Z",
     "shell.execute_reply.started": "2024-12-02T11:56:29.397590Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "source_model = \"unsloth/Llama-3.2-3B-Instruct\"\n",
    "destination_model = \"Llama-3.2-3B-appreciation\"\n",
    "dataset_url = \"eltorio/appreciation\"\n",
    "epoch = 5\n",
    "push_to_hub = True if os.path.exists('/kaggle/working') else False\n",
    "output_directory = '/kaggle/working' if os.path.exists('/kaggle/working') else './'\n",
    "kaggle_model = f\"eltorio/{destination_model.lower()}/transformers/default\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Install the required libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "!pip install -U \"safetensors>=0.4.5\"\n",
    "!pip install -U tensorflow\n",
    "!pip install -U \"https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-0.44.2.dev0-py3-none-manylinux_2_24_x86_64.whl\"\n",
    "!pip install -U git+https://github.com/huggingface/transformers.git\n",
    "!pip install huggingface_hub[cli] accelerate datasets peft\n",
    "!pip install pip3-autoremove\n",
    "!pip-autoremove torch torchvision torchaudio -y\n",
    "!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121\n",
    "!pip install unsloth\n",
    "!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git\n",
    "!pip install tf-keras"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Log in Kaggle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "if not os.path.exists('/kaggle/.kaggle/kaggle.json'):\n",
    "    try:\n",
    "        from kaggle_secrets import UserSecretsClient\n",
    "        user_secrets = UserSecretsClient()\n",
    "        KAGGLE_JSON = user_secrets.get_secret(\"KAGGLE_JSON\")\n",
    "    except:\n",
    "        KAGGLE_JSON = os.getenv(\"KAGGLE_JSON\")\n",
    "\n",
    "    kaggle_dir = os.path.expanduser(\"~/.kaggle\")\n",
    "    kaggle_file = os.path.join(kaggle_dir, \"kaggle.json\")\n",
    "\n",
    "    os.makedirs(kaggle_dir, exist_ok=True)\n",
    "\n",
    "    with open(kaggle_file, 'w') as file:\n",
    "        json.dump(KAGGLE_JSON, file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Login WandB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import wandb\n",
    "try:\n",
    "  from kaggle_secrets import UserSecretsClient\n",
    "  user_secrets = UserSecretsClient()\n",
    "  WANDB_API_KEY = user_secrets.get_secret(\"WANDB_API_KEY\")\n",
    "  os.environ[\"WANDB_API_KEY\"] = WANDB_API_KEY\n",
    "except:\n",
    "  if os.getenv(\"WANDB_API_KEY\") is None:\n",
    "    os.environ[\"WANDB_API_KEY\"] = input(\"Enter your W&B API key: \")\n",
    "\n",
    "if not wandb.login():\n",
    "  raise Exception(\"Can't login to W&B\")\n",
    "else:\n",
    "  print(\"Logged in to W&B\")\n",
    "  os.environ[\"WANDB_PROJECT\"]=destination_model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Log in Hugging hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from huggingface_hub import login\n",
    "import os\n",
    "\n",
    "try:\n",
    "  from kaggle_secrets import UserSecretsClient\n",
    "  user_secrets = UserSecretsClient()\n",
    "  HF_TOKEN = user_secrets.get_secret(\"HF_TOKEN\")\n",
    "  os.environ[\"HF_TOKEN\"] = HF_TOKEN\n",
    "except:\n",
    "  if not os.getenv(\"HF_TOKEN\"):\n",
    "    raise ValueError(\"You need to set the HF_TOKEN environment variable.\")\n",
    "  HF_TOKEN = os.getenv(\"HF_TOKEN\")\n",
    "\n",
    "print(f\"Login with {HF_TOKEN}\")\n",
    "login(\n",
    "  token=HF_TOKEN,\n",
    "  add_to_git_credential=False\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from unsloth import FastLanguageModel\n",
    "import torch\n",
    "\n",
    "max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n",
    "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",
    "load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the source model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name = source_model, # or choose \"unsloth/Llama-3.2-1B-Instruct\"\n",
    "    max_seq_length = max_seq_length,\n",
    "    dtype = dtype,\n",
    "    load_in_4bit = load_in_4bit,\n",
    "    token = HF_TOKEN,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add the Peft model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "model = FastLanguageModel.get_peft_model(\n",
    "    model,\n",
    "    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n",
    "    target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
    "                      \"gate_proj\", \"up_proj\", \"down_proj\",],\n",
    "    lora_alpha = 16,\n",
    "    lora_dropout = 0, # Supports any, but = 0 is optimized\n",
    "    bias = \"none\",    # Supports any, but = \"none\" is optimized\n",
    "    # [NEW] \"unsloth\" uses 30% less VRAM, fits 2x larger batch sizes!\n",
    "    use_gradient_checkpointing = \"unsloth\", # True or \"unsloth\" for very long context\n",
    "    random_state = 3407,\n",
    "    use_rslora = False,  # We support rank stabilized LoRA\n",
    "    loftq_config = None, # And LoftQ\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-12-02T11:56:34.316028Z",
     "iopub.status.busy": "2024-12-02T11:56:34.315647Z",
     "iopub.status.idle": "2024-12-02T11:56:36.257132Z",
     "shell.execute_reply": "2024-12-02T11:56:36.255969Z",
     "shell.execute_reply.started": "2024-12-02T11:56:34.315997Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "dataset = load_dataset(dataset_url)\n",
    "dataset['train']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create the messages from the data\n",
    "\n",
    "The data is in the form of a csv file with the following columns:\n",
    "\n",
    "```csv\n",
    "\n",
    "Id,redoublant,matière,trimestre,note 1er trimestre,note 2ème trimestre,note 3ème trimestre,comportement 0-10,participation 0-10,travail 0-10,commentaire\n",
    "\n",
    "0,0,,1,\"Mauvais trimestre, manque de travail\",5.0,,,5.0,5.0,5.0,X a beaucoup de difficultés dues à des lacunes mais aussi à un manque de travail qui ne permet pas de les combler. Il faut s'y mettre au prochain trimestre.\n",
    "\n",
    "```\n",
    "\n",
    "We need to create HuggingFace's normal multiturn format "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-12-02T11:56:45.923298Z",
     "iopub.status.busy": "2024-12-02T11:56:45.922896Z",
     "iopub.status.idle": "2024-12-02T11:56:45.933706Z",
     "shell.execute_reply": "2024-12-02T11:56:45.932503Z",
     "shell.execute_reply.started": "2024-12-02T11:56:45.923263Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def create_training_turn(row):\n",
    "    trimestre = row['trimestre']\n",
    "    redoublant = 'redoublant ' if row['redoublant'] == 1 else ''\n",
    "    moyenne_1 = row['note 1er trimestre'] if not isinstance(row['note 1er trimestre'],float|int) else 'N/A'\n",
    "    moyenne_2 = row['note 2ème trimestre'] if not isinstance(row['note 2ème trimestre'],float|int) else 'N/A'\n",
    "    moyenne_3 = row['note 3ème trimestre'] if not isinstance(row['note 3ème trimestre'],float|int) else 'N/A'\n",
    "    comportement = row['comportement 0-10']\n",
    "    participation = row['participation 0-10']\n",
    "    travail = row['travail 0-10']\n",
    "    system_prompt = \"Vous êtes une IA assistant les enseignants d'histoire-géographie en rédigeant à leur place une appréciation personnalisée pour leur élève en fonction de ses performances. Votre appréciation doit être en français formel et impersonnel. Votre appréciation doit être bienveillante, constructive, et aider l'élève à comprendre ses points forts et les axes d'amélioration. Votre appréciation doit comporter de 8 à 250 caractères. Votre appréciation ne doit jamais comporter les valeurs des notes. \"\n",
    "\n",
    "    if trimestre == 1:\n",
    "        trimestre_full = \"premier trimestre\"\n",
    "        user_input = f\"Veuillez rédiger une appréciation en moins de 250 caractères pour le {trimestre_full} pour cet élève {redoublant}qui a eu {moyenne_1} de moyenne, j'ai évalué son comportement à {comportement}/10, sa participation à {participation}/10 et son travail à {travail}/10. Les notes ne doivent pas apparaître dans l'appréciation.\"\n",
    "    elif trimestre == 2:\n",
    "        trimestre_full = \"deuxième trimestre\"\n",
    "        user_input = f\"Veuillez rédiger une appréciation en moins de 250 caractères pour le {trimestre_full} pour cet élève {redoublant}qui a eu {moyenne_2} de moyenne ce trimestre et {moyenne_1} au premier trimestre, j'ai évalué son comportement à {comportement}/10, sa participation à {participation}/10 et son travail à {travail}/10. Les notes ne doivent pas apparaître dans l'appréciation.\"\n",
    "    elif trimestre == 3:\n",
    "        trimestre_full = \"troisième trimestre\"\n",
    "        user_input = f\"Veuillez rédiger une appréciation en moins de 250 caractères pour le {trimestre_full} pour cet élève {redoublant}qui a eu {moyenne_3} de moyenne ce trimestre, {moyenne_2} au deuxième trimestre et {moyenne_1} au premier trimestre, j'ai évalué son comportement à {comportement}/10, sa participation à {participation}/10 et son travail à {travail}/10. Les notes ne doivent pas apparaître dans l'appréciation.\"\n",
    "\n",
    "    assistant_response = row['commentaire']\n",
    "\n",
    "    return {\"conversation\":[\n",
    "        {\"role\": \"system\", \"content\":system_prompt},\n",
    "        {\"role\": \"user\", \"content\":user_input},\n",
    "        {\"role\": \"assistant\", \"content\":assistant_response}\n",
    "    ]}\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check the function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-12-02T11:56:50.058458Z",
     "iopub.status.busy": "2024-12-02T11:56:50.058002Z",
     "iopub.status.idle": "2024-12-02T11:56:50.066899Z",
     "shell.execute_reply": "2024-12-02T11:56:50.065730Z",
     "shell.execute_reply.started": "2024-12-02T11:56:50.058406Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "test_row = dataset['train'][68]\n",
    "create_training_turn(test_row)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-12-02T11:56:58.639949Z",
     "iopub.status.busy": "2024-12-02T11:56:58.639529Z",
     "iopub.status.idle": "2024-12-02T11:56:59.178999Z",
     "shell.execute_reply": "2024-12-02T11:56:59.177678Z",
     "shell.execute_reply.started": "2024-12-02T11:56:58.639912Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "multi_turn_dataset = dataset.map(create_training_turn)\n",
    "multi_turn_dataset['train'][68]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenize the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from unsloth.chat_templates import get_chat_template\n",
    "\n",
    "tokenizer = get_chat_template(\n",
    "    tokenizer,\n",
    "    chat_template = \"llama-3.1\",\n",
    ")\n",
    "\n",
    "def formatting_prompts_func(messages):\n",
    "    convos = messages[\"conversation\"]\n",
    "    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]\n",
    "    return { \"text\" : texts, }\n",
    "pass\n",
    "\n",
    "multi_turn_dataset = multi_turn_dataset.map(\n",
    "    formatting_prompts_func,\n",
    "    batched=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check the tokenized data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2024-12-02T11:57:11.739989Z",
     "iopub.status.busy": "2024-12-02T11:57:11.739580Z",
     "iopub.status.idle": "2024-12-02T11:57:12.535408Z",
     "shell.execute_reply": "2024-12-02T11:57:12.533818Z",
     "shell.execute_reply.started": "2024-12-02T11:57:11.739953Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "multi_turn_dataset[\"train\"][\"text\"][278]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parmeters for training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from trl import SFTTrainer\n",
    "from transformers import TrainingArguments, DataCollatorForSeq2Seq\n",
    "from unsloth import is_bfloat16_supported\n",
    "\n",
    "trainer = SFTTrainer(\n",
    "    model = model,\n",
    "    tokenizer = tokenizer,\n",
    "    train_dataset = multi_turn_dataset[\"train\"],\n",
    "    eval_dataset=multi_turn_dataset[\"validation\"],\n",
    "    dataset_text_field = \"text\",\n",
    "\n",
    "    max_seq_length = max_seq_length,\n",
    "    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),\n",
    "    dataset_num_proc = 2,\n",
    "    packing = False, # Can make training 5x faster for short sequences.\n",
    "    args = TrainingArguments(\n",
    "        per_device_train_batch_size = 2,\n",
    "        gradient_accumulation_steps = 4,\n",
    "        warmup_steps = 5,\n",
    "        num_train_epochs = epoch, # Set this for 1 full training run.\n",
    "        eval_strategy=\"epoch\",\n",
    "        save_strategy=\"epoch\",\n",
    "        logging_strategy=\"epoch\",\n",
    "        # max_steps = 60,\n",
    "        learning_rate = 2e-4,\n",
    "        fp16 = not is_bfloat16_supported(),\n",
    "        bf16 = is_bfloat16_supported(),\n",
    "        logging_steps = 1,\n",
    "        optim = \"adamw_8bit\",\n",
    "        weight_decay = 0.01,\n",
    "        lr_scheduler_type = \"linear\",\n",
    "        seed = 3407,\n",
    "        output_dir = output_directory,\n",
    "        report_to = \"wandb\", # Use this for WandB etc\n",
    "        push_to_hub = push_to_hub,\n",
    "        hub_model_id = destination_model\n",
    "    ),\n",
    "\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "from unsloth.chat_templates import train_on_responses_only\n",
    "\n",
    "trainer = train_on_responses_only(\n",
    "    trainer,\n",
    "    instruction_part = \"<|start_header_id|>user<|end_header_id|>\\n\\n\",\n",
    "    response_part = \"<|start_header_id|>assistant<|end_header_id|>\\n\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "tokenizer.decode(trainer.train_dataset[5][\"input_ids\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "space = tokenizer(\" \", add_special_tokens = False).input_ids[0]\n",
    "tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5][\"labels\"]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "trainer_stats = trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Publish to Kaggle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import kagglehub\n",
    "import os\n",
    "import re\n",
    "\n",
    "def get_latest_checkpoint(directory):\n",
    "    # Liste tous les répertoires dans le répertoire donné\n",
    "    subdirs = [d for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))]\n",
    "    # Filtre les répertoires qui correspondent au format \"checkpoint_xxx\"\n",
    "    checkpoint_dirs = [d for d in subdirs if re.match(r'checkpoint-\\d+', d)]\n",
    "    print(checkpoint_dirs)\n",
    "    # Extrait les valeurs numériques et trouve la plus élevée\n",
    "    max_checkpoint = max(checkpoint_dirs, key=lambda x: int(x.split('-')[1]))\n",
    "    print(max_checkpoint)\n",
    "    return os.path.join(directory, max_checkpoint)\n",
    "\n",
    "\n",
    "latest_checkpoint = get_latest_checkpoint(output_directory)\n",
    "print(f'The newest model is : {latest_checkpoint}')\n",
    "\n",
    "kagglehub.login()\n",
    "kagglehub.model_upload(\n",
    "    handle= kaggle_model,\n",
    "    local_model_dir = latest_checkpoint\n",
    ")\n"
   ]
  }
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "none",
   "dataSources": [
    {
     "datasetId": 6161747,
     "sourceId": 10010677,
     "sourceType": "datasetVersion"
    }
   ],
   "dockerImageVersionId": 30787,
   "isGpuEnabled": false,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}