{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 18624, "status": "ok", "timestamp": 1731936696141, "user": { "displayName": "Letícia Silva", "userId": "03927092514722669722" }, "user_tz": 180 }, "id": "kH18jD5cR_Ks", "outputId": "5feea107-90a1-43a9-8a56-3b3f57f9b984" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m122.4/122.4 MB\u001b[0m \u001b[31m6.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.9/310.9 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m14.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
# Hugging Face Hub tokens, read from Colab secrets.
# NOTE(review): `userdata` was only imported much later in the notebook (in the
# fine-tuning section), so on a fresh Restart & Run All this cell raised
# NameError. Import it here, before first use. The import already exists in
# the file, so this adds no new dependency.
from google.colab import userdata

WRITE_TOKEN = userdata.get('hf_write')  # write-scoped token (pushing checkpoints)
READ_TOKEN = userdata.get('hf_read')    # read-scoped token (pulling models/datasets)
"ok", "timestamp": 1731936734685, "user": { "displayName": "Letícia Silva", "userId": "03927092514722669722" }, "user_tz": 180 }, "id": "7CKnwlRfZj4V" }, "outputs": [], "source": [ "model_name = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\"\n", "out_name = \"lleticiasilvaa/TinyLlama-schemaLinking-v0\"\n", "prev_checkpoint = None # \"checkpoint-3000\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9zs7nCmt-pMC" }, "outputs": [], "source": [ "#!huggingface-cli login" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "executionInfo": { "elapsed": 1013, "status": "ok", "timestamp": 1731936748136, "user": { "displayName": "Letícia Silva", "userId": "03927092514722669722" }, "user_tz": 180 }, "id": "PZdnxs8k-Cgl" }, "outputs": [], "source": [ "spider_id= \"NESPED-GEN/spider_selector_schemaReduzido\" #\"NESPED-GEN/spider_variacoes_formato_schema\"" ] }, { "cell_type": "markdown", "metadata": { "id": "xT2iRdCN_MFH" }, "source": [ "### Load Data\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 368, "referenced_widgets": [ "954d0a42584d4d45988b6781dccd2721", "882971ab369a496e9bafe407e9c6806e", "6098cb4f75a942a2bc70c6a4990b96c7", "91551a62023443e6a4c61ae4274fb3bf", "2b83ce00a20b41929354dc4d6a116728", "d8567c4ba3fc44c99c09e304ebd3590c", "47cc3cad3b784da4a129e64d2e1ee24a", "96b2ee95a84d4d20ae46a3cd79502ebd", "0b286b4dc7ae4f5ba2c3fb5dde679d21", "f0e75ea194e446f2aca9c758aea9ee7b", "4930228f747c4f08bf113a59b67bf0d1", "7705fbeca8d047f68dcfdb63e187e227", "628862d69c59474abc993ba2b0a1e4a9", "cb0cd611ada542d194c1c5b37d49c2a9", "958a0f10d33e4d8bbd0f99a8fc5f977b", "bb972db2c6694ce6aaebcec812fb7e1e", "4dff216ac8d84f0f9bddee1f33487460", "7a1c00f596e54deebc2a681b91d11448", "54e1ada8506a4433b80e7b3ea71c6e69", "dba7d8ba65ef4b239ce9f549036fd7bc", "e2006b4423c8493a9b1ac993f59030f0", "64430c8fab104287aba9e4712193b7d9", "7a153beddcbc48abaac1bc873495bb89", 
"bda393faad884464958d809342154ccc", "90256975588c40c8ba05537bd85d0ade", "1f447a033b5042ea9d4d061efcc128a8", "6930ae4a42b445659651804ddb593ae9", "b097342684dc458fac80ebf4ab63c166", "ee0a5ec6ec9c426993f77188e55bb9ae", "ba9b64b057194d7f9244c1f77b2b9adb", "f10fff8306d84e86a6af152480873514", "8d56070740ae4f7da131e2be3c37fb6b", "8b3ad462348645bd80010e054ca928f1", "e5d6190fb02242ca8f0415d8f8a443f4", "5106589ddf594b80a046e02b67caf394", "7951dfed0ebe492ca8385f00e8d99fb1", "6150da9b770b4deaa199292928e7c37b", "7c09d9cb5118429da08fd306e3debe5d", "0a226111af6f4e6fb4beb9e48dec06c8", "054c9e5c38c94cd4beac7d3820ed9d26", "7f48eba4b40a47e8bf1ec8fbbc553be9", "a257398c261c4d89bc7e827b35a1b5a8", "a4113770044d46d6858b0b366c88180c", "8e2705bc99e04d80b03dc79e976be5da", "760d55aa8260454dac44713de2731376", "86e2ab3a72314e9b9cc07375ae97288a", "d916bdba31694485879bd83dbe580dfd", "0d65a517e6944cc490ac2166a8bbc7d4", "a9aa9df1faca4157abb9952d776660c3", "1ffea1b63ed84e6caef1116932fae078", "ff1bab0e996f403d80fd0cb45ab046a6", "1f22779c5d3145fba24390ac446d27f5", "3811df5017ad4b68a091e6f24c1b07f8", "6129d30e24ab45c39424bfa2bf4e741d", "185818ef42464b0d8546b968fce348ef" ] }, "executionInfo": { "elapsed": 7537, "status": "ok", "timestamp": 1731936757971, "user": { "displayName": "Letícia Silva", "userId": "03927092514722669722" }, "user_tz": 180 }, "id": "lLKgY40efdJo", "outputId": "af220bbe-1f0b-4887-83fc-837c9ee92b8e" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or 
datasets.\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "954d0a42584d4d45988b6781dccd2721", "version_major": 2, "version_minor": 0 }, "text/plain": [ "README.md: 0%| | 0.00/885 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7705fbeca8d047f68dcfdb63e187e227", "version_major": 2, "version_minor": 0 }, "text/plain": [ "dev-00000-of-00001.parquet: 0%| | 0.00/369k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7a153beddcbc48abaac1bc873495bb89", "version_major": 2, "version_minor": 0 }, "text/plain": [ "train-00000-of-00001.parquet: 0%| | 0.00/2.70M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e5d6190fb02242ca8f0415d8f8a443f4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating dev split: 0%| | 0/1034 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "760d55aa8260454dac44713de2731376", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating train split: 0%| | 0/8656 [00:00, ? 
def download_checkpoint(adapter_model_id, checkpoint):
    """Download a training checkpoint and its logs from the Hub into ./out.

    Parameters
    ----------
    adapter_model_id : str
        Hub repo id holding the adapter (e.g. 'user/model-name').
    checkpoint : str
        Checkpoint directory name inside the repo (e.g. 'checkpoint-3000').
    """
    fs = HfFileSystem()

    # Checkpoint files live under '<repo>/<checkpoint>/...'; re-download each
    # one under the same '<checkpoint>/...' relative path.
    for file in fs.ls(f'{adapter_model_id}/{checkpoint}', detail=False):
        file_name = file.split(checkpoint)[-1]
        hf_hub_download(repo_id=adapter_model_id,
                        filename=f'{checkpoint}{file_name}',
                        local_dir='out')

    # Log files: only the basename matters. The original code also split these
    # paths on the checkpoint name, which would mangle any log filename that
    # happened to contain it; taking the basename directly is equivalent for
    # well-formed paths and robust otherwise.
    for file in fs.ls(f'{adapter_model_id}/logs', detail=False):
        hf_hub_download(repo_id=adapter_model_id,
                        filename=f'logs/{file.split("/")[-1]}',
                        local_dir='out')
"execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 241, "referenced_widgets": [ "323999f5778e4ef3af07aa56cedcb85a", "ba3d0039a8e649029e41cc7e0f2bef79", "cffb2a4c0cb8499cb90941888842ba91", "131add19192a450383c7347764486d1a", "104ffb9e801b4b2eaf2c053e8a49372e", "6f29f032313d406a95b1cd787f693fda", "080f4c06c082442bb4e3a587da7e1b05", "d61ed434ce0b40bc8f235029349dfcc5", "34fff58ab0324276835b2bf0e973d225", "285a977c497b4a07ac6b4aacb93ded47", "ecf6bf5230484f25a75bbf3094fe46b6", "a92456acb2d346e6abfac85b494344c4", "8d62fe07355e424ba3864db55f30f712", "9a7b61b9e30749ef96c586c21b22670b", "f7dcad3c535e4a9796732fa98d5bde0a", "fa6253270d7345968fadc279d7215762", "f430f999529549b18c2c10fbdcafcfc6", "f41c2f1ad1d74b54a007d97ebe0833b3", "ff59616316c449e8a6cba70cb6223ddb", "4ed02c21e8db40e8b0d0ed3396e6eac1", "9fd2f10530b749a4867db48ce525a703", "bf2b14d22ed344dd8ba30af9f409649c", "1762f48c346f45d1ac742c7b6a05b398", "8ea6376bcf9f478b897f5a731c9eb5fd", "d71108505cfe4be9b39ca28d48f82bb6", "d7ea60ff96534928a8506a3375134ce2", "6c80588ccd864686b3688c07bd0cfcea", "45599d9156824c1eb16087f49b6f7b37", "05d0b56eecf5434db858954b0f50f716", "d4ae1431dc9d4f94be7e711aa5600693", "878fd479b0c04a0ab539b138df28cf05", "f139273eb02a4e49a50affe527683778", "1029b6f6cd76480b9ed9aaa4010f8280", "6ac0619bcd1d49e8b902ab88ba95f8be", "9f1a52b32e884cf9a840c2696f48d67a", "77c7d88051b84e25b6b9fab2d371457e", "2b70f933259e401590334ca53a3e9b89", "0ca1402ee71d4ce28d79ec19a09cbc22", "3c26a18bdf764fd4a08d72c75593c02c", "cc4498f4de0c48eca834364ab7bae1ab", "4090c2ed02b94a7fb6fa68ceaed4c9ed", "b03accbff4f04588978931858acb20fa", "7c040fb145c74111b09cceb227883ec2", "81f331bfdfb74ec1979861f70438d2e1", "d8e80b27ee5a44e0954f55157bdfe101", "c0ff7d36febf4004b10a9cceeed71340", "c243fe49adfd4b028a6fa291ec60c33c", "c6180febc9074976b68d4695c29b5734", "6b41cd90b9104695900d28e80bb711d0", "f7fdc57540a548f5afd2ee3e9928d143", "45bd4e518df84079889b4788604e63c6", "a79e5d6ebaa8483fa8182db53bf3d1ab", 
"58dd3107d8534bf4a1176ec92271b67f", "c9e5f2c7c0294c72bbff41147ab5901c", "77f6133dbd4d416597dd2a56de5e51c2", "b2b6fbab3e55459c9c9d4d0b9117dda2", "615cdee2b4514046973fcdc2560ecf53", "33a7ff3894754148aabe78563dbcf225", "34999d96e9a445209259f8ca81f87ede", "0c72506a49d64b62b3be4c1e32b09406", "495f6c0c17b34bd0a63c85f0468e7338", "14a4142f6a084b87b150f74c54ccf3d5", "6eed3bf7465c46ab90d8f6eddd765611", "4a26283dc2594a6889b6b3f25c573d4c", "3bc2cd6b1694429898a3a4c5f8aad480", "99b8ec4257e1450a8880fb08ebab1267", "5c3bd65a52f7498f94d3f01f3711d85d", "671d9f7c8bcb40dd843b1ee5546a6fdc", "762b5c7976f543ecaa8a0e08e433c4b0", "eb03df617c09483fb52f22ca2482f08e", "abc980f6827643b2b0fc2510181f6e8d", "1126699ee50248baa3507dfe73aae00b", "697e094f2b4a4160a9ce0b72f48d006c", "a592b3b4b2214865ad44fdb857b24333", "c3b0c5bfffcb42f7aad7d4f545f2d016", "7ab5433edfc24819b34f62e037210acb", "9a7bbc5db8564b8c879a8f47e05e3e81" ] }, "executionInfo": { "elapsed": 60619, "status": "ok", "timestamp": 1731936841579, "user": { "displayName": "Letícia Silva", "userId": "03927092514722669722" }, "user_tz": 180 }, "id": "M7DoqQMlM_nW", "outputId": "359e9942-4bbe-401c-d1c7-27662241bf3c" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "323999f5778e4ef3af07aa56cedcb85a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/1.29k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a92456acb2d346e6abfac85b494344c4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.model: 0%| | 0.00/500k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1762f48c346f45d1ac742c7b6a05b398", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer.json: 0%| | 0.00/1.84M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { 
seed = 14

if prev_checkpoint is not None:
    # Resume: pull the previous adapter checkpoint from the Hub.
    # Best-effort — fall back to a fresh start if the download fails, but
    # report why instead of silently swallowing every exception.
    try:
        download_checkpoint(out_name, prev_checkpoint)
    except Exception as exc:
        print(f"Could not download checkpoint '{prev_checkpoint}': {exc}")

# NOTE(review): the original call also passed map_device="auto", which is not
# a valid AutoTokenizer argument (tokenizers are not placed on devices —
# presumably a typo for the model-side `device_map`); it was carried along
# unused, so dropping it does not change behavior.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    token=READ_TOKEN,
    add_eos_token=True,
    use_fast=True,
)

# Special tokens kept for reference; only added on a fresh run (needs a
# subsequent embedding resize — see the commented lines at the bottom).
new_tokens = {'additional_special_tokens': ['[SQL]','[/SQL]', '[QUESTION]','[/QUESTION]']}
# if prev_checkpoint is None:
#     tokenizer.add_special_tokens(new_tokens)

# Prefer bf16 (+ FlashAttention-2) where supported; otherwise fp16 + SDPA.
if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    attn_implementation = 'flash_attention_2'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 4-bit NF4 quantization config for QLoRA fine-tuning.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=compute_dtype,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
    token=READ_TOKEN,
    # attn_implementation=attn_implementation  # requires flash-attn installed
)

# If special tokens were added above, the embeddings must be resized:
# model.resize_token_embeddings(len(tokenizer))
## model.resize_token_embeddings(max(len(tokenizer), model.config.vocab_size))
def replace_alias_with_table(query):
    """Expand table aliases in a SQL query back to their table names.

    Finds every 'FROM <table> AS <alias>' / 'JOIN <table> AS <alias>' pair
    (case-insensitive), rewrites each alias occurrence in the query
    (whole-word, case-sensitive) to its table name, then strips the
    now-redundant 'AS <alias>' fragments.

    NOTE(review): the whole-word replacement also touches matching words
    inside string literals — appears acceptable for this dataset, but
    verify before reusing elsewhere.
    """
    alias_re = re.compile(r'(\bFROM\b|\bJOIN\b)\s+(\w+)\s+AS\s+(\w+)', re.IGNORECASE)

    # alias -> table name, in order of first appearance
    alias_to_table = {}
    for m in alias_re.finditer(query):
        alias_to_table[m.group(3)] = m.group(2)

    # Substitute each alias (as a whole word) with its table name.
    for alias_name, table_name in alias_to_table.items():
        query = re.sub(r'\b' + alias_name + r'\b', table_name, query)

    # Drop the leftover 'AS <alias>' clauses.
    return re.sub(r'\bAS\s+\w+', '', query, flags=re.IGNORECASE)
"execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 5, "status": "ok", "timestamp": 1731590359389, "user": { "displayName": "Helena Barcelos", "userId": "07137541013699470373" }, "user_tz": 180 }, "id": "PIvSnr6Y5gqW", "outputId": "37f9f97d-23ec-4305-a8d3-ee2b3ccfe9ab" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<|system|>\n", "Given a user question and the schema of a database, your task is to generate an SQL query that accurately answers the question based on the provided schema.\n", "<|user|>\n", "# Schema:\n", "```sql\n", "CREATE TABLE Courses (\n", " course_id VARCHAR(100),\n", " course_name VARCHAR(120),\n", " course_description VARCHAR(255),\n", " other_details VARCHAR(255),\n", " PRIMARY KEY (course_id)\n", ");\n", "\n", "CREATE TABLE Student_Course_Attendance (\n", " student_id INTEGER,\n", " course_id INTEGER,\n", " date_of_attendance DATETIME,\n", " PRIMARY KEY (student_id)\n", ");\n", "```\n", "\n", "# Question: How many students are attending English courses?\n", "<|assistant|>\n", "```sql\n", "SELECT count(*)\n", "FROM courses\n", "JOIN student_course_attendance ON courses.course_id = student_course_attendance.course_id\n", "WHERE courses.course_name = \"English\"\n", "```\n", "\n", "\n" ] } ], "source": [ "print(df['text'][df.index[70]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "roZzKNOj5gqW" }, "outputs": [], "source": [ "_df = pd.DataFrame(columns=['text'])\n", "_df['text'] = df.sample(frac=1, random_state=14).reset_index(drop=True)['text']\n", "_df = Dataset.from_pandas(_df)\n", "_df = _df.train_test_split(test_size=0.01, shuffle=True, seed=14)\n", "train_dataset, valid_dataset = _df[\"train\"], _df[\"test\"]" ] }, { "cell_type": "markdown", "metadata": { "id": "b6mjOblXeMup" }, "source": [ "#### Chat Template - Schema Linking" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "executionInfo": { 
def apply_template(row, tokenizer=tokenizer, n_examplea=0):
    """Build the schema-linking training prompt for one dataset row.

    Fills row['text'] with the chat-template rendering of a user turn
    (instruction + full schema + question) and an assistant turn (the gold
    table/column selection as JSON). Returns the mutated row, as expected
    by DataFrame.apply.
    """
    system = "Given a user question and the schema of a database, your task is to generate an JSON with the the names of tables and columns of the schema that the question is referring to."

    user_turn = {
        'role': 'user',
        'content': f"# System:\n{system}\n\n# Schema:\n```sql\n{row['schema_SQLDatabase']}\n```\n\n# Question: {row['question_en']}",
    }
    assistant_turn = {
        'role': 'assistant',
        'content': f"```json\n{row['selector_correct']}\n```",
    }

    row['text'] = tokenizer.apply_chat_template(
        [user_turn, assistant_turn], tokenize=False, add_generation_prompt=False
    )
    return row
System:\n", "Given a user question and the schema of a database, your task is to generate an JSON with the the names of tables and columns of the schema that the question is referring to.\n", "\n", "# Schema:\n", "```sql\n", "CREATE TABLE Addresses (\n", " address_id INTEGER,\n", " line_1 VARCHAR(80),\n", " line_2 VARCHAR(80),\n", " city VARCHAR(50),\n", " zip_postcode CHAR(20),\n", " state_province_county VARCHAR(50),\n", " country VARCHAR(50),\n", " PRIMARY KEY (address_id)\n", ");\n", "\n", "CREATE TABLE People (\n", " person_id INTEGER,\n", " first_name VARCHAR(255),\n", " middle_name VARCHAR(255),\n", " last_name VARCHAR(255),\n", " cell_mobile_number VARCHAR(40),\n", " email_address VARCHAR(40),\n", " login_name VARCHAR(40),\n", " password VARCHAR(40),\n", " PRIMARY KEY (person_id)\n", ");\n", "\n", "CREATE TABLE Students (\n", " student_id INTEGER,\n", " student_details VARCHAR(255),\n", " PRIMARY KEY (student_id),\n", " FOREIGN KEY (student_id) REFERENCES People(person_id)\n", ");\n", "\n", "CREATE TABLE Courses (\n", " course_id VARCHAR(100),\n", " course_name VARCHAR(120),\n", " course_description VARCHAR(255),\n", " other_details VARCHAR(255),\n", " PRIMARY KEY (course_id)\n", ");\n", "\n", "CREATE TABLE People_Addresses (\n", " person_address_id INTEGER,\n", " person_id INTEGER,\n", " address_id INTEGER,\n", " date_from DATETIME,\n", " date_to DATETIME,\n", " PRIMARY KEY (person_address_id),\n", " FOREIGN KEY (address_id) REFERENCES Addresses(address_id),\n", " FOREIGN KEY (person_id) REFERENCES People(person_id)\n", ");\n", "\n", "CREATE TABLE Student_Course_Registrations (\n", " student_id INTEGER,\n", " course_id INTEGER,\n", " registration_date DATETIME,\n", " PRIMARY KEY (student_id),\n", " FOREIGN KEY (course_id) REFERENCES Courses(course_id),\n", " FOREIGN KEY (student_id) REFERENCES Students(student_id)\n", ");\n", "\n", "CREATE TABLE Student_Course_Attendance (\n", " student_id INTEGER,\n", " course_id INTEGER,\n", " date_of_attendance 
DATETIME,\n", " PRIMARY KEY (student_id),\n", " FOREIGN KEY (student_id) REFERENCES Student_Course_Registrations(student_id),\n", " FOREIGN KEY (course_id) REFERENCES Student_Course_Registrations(course_id)\n", ");\n", "\n", "CREATE TABLE Candidates (\n", " candidate_id INTEGER,\n", " candidate_details VARCHAR(255),\n", " PRIMARY KEY (candidate_id),\n", " FOREIGN KEY (candidate_id) REFERENCES People(person_id)\n", ");\n", "\n", "CREATE TABLE Candidate_Assessments (\n", " candidate_id INTEGER,\n", " qualification CHAR(15),\n", " assessment_date DATETIME,\n", " asessment_outcome_code CHAR(15),\n", " PRIMARY KEY (candidate_id),\n", " FOREIGN KEY (candidate_id) REFERENCES Candidates(candidate_id)\n", ");\n", "```\n", "\n", "# Question: How many students are attending English courses?\n", "<|assistant|>\n", "```json\n", "{\n", " 'Courses': ['course_id', 'course_name'],\n", " 'Student_Course_Attendance': ['student_id', 'course_id']\n", "}\n", "```\n", "\n" ] } ], "source": [ "print(df['text'][df.index[70]])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "executionInfo": { "elapsed": 1025, "status": "ok", "timestamp": 1731936939803, "user": { "displayName": "Letícia Silva", "userId": "03927092514722669722" }, "user_tz": 180 }, "id": "puYY-BqFeMuu" }, "outputs": [], "source": [ "_df = pd.DataFrame(columns=['text'])\n", "_df['text'] = df.sample(frac=1, random_state=14).reset_index(drop=True)['text']\n", "_df = Dataset.from_pandas(_df)\n", "_df = _df.train_test_split(test_size=0.01, shuffle=True, seed=14)\n", "train_dataset, valid_dataset = _df[\"train\"], _df[\"test\"]" ] }, { "cell_type": "markdown", "metadata": { "id": "DWpXeuO_KlLS" }, "source": [ "### Finetuning" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "executionInfo": { "elapsed": 441, "status": "ok", "timestamp": 1731936943786, "user": { "displayName": "Letícia Silva", "userId": "03927092514722669722" }, "user_tz": 180 }, "id": "0oVpZDj1AXY9" }, "outputs": [], "source": [ 
def find_all_linear_names(model, new_tokens=False):
    """Collect the leaf names of every quantized linear layer in `model`.

    The returned names are intended for LoraConfig.target_modules. When
    `new_tokens` is True, 'lm_head' is included as well so a resized output
    head is also adapted.
    """
    quantized_linear = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt)

    targets = set()
    for module_path, module in model.named_modules():
        if isinstance(module, quantized_linear):
            # Keep only the last path component, e.g. 'q_proj' from
            # 'model.layers.0.self_attn.q_proj'.
            targets.add(module_path.rsplit(".", 1)[-1])

    if new_tokens:
        targets.add("lm_head")
    return list(targets)
lora_dropout=0.1,\n", " r=64,\n", " # bias=\"none\",\n", " # task_type=\"CAUSAL_LM\",\n", " target_modules=modules,\n", " # modules_to_save=[\"embed_tokens\"], #quando adicionar tokens speciais\n", ")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "executionInfo": { "elapsed": 452, "status": "ok", "timestamp": 1731936955578, "user": { "displayName": "Letícia Silva", "userId": "03927092514722669722" }, "user_tz": 180 }, "id": "buh0o2P2jwbx" }, "outputs": [], "source": [ "torch.cuda.empty_cache()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 754, "referenced_widgets": [ "ad3ac767a2c248808dee99fdbc1d2937", "b5052a4d41034f62bee7339dc4999e1e", "5aef8ebf74e44fa9a02a4ff4829c56cc", "0e6a937064d141408991844313281cc9", "e5cd83f2fd7a457bbb59ef07834ca7bb", "159f0280e4054f489fa743febe30bf9d", "2778aa7ff5d74de9a15e8d4e4b42041d", "2e01bb63b48942df9a776ed3a9d44b7e", "fdc6fc25703046d895a0acebae84383a", "23cfbb0f5ff3488d8e99ecbe9c6059ad", "c6e1891def5d4d2db0d4081cf9293f06", "4a3618a5780d48a293c544a2de033055", "9f0e188f83fe470dab4dab18ff358512", "e3f5bc1354c342a0a13ce96a72f9d568", "eab8c8f550524a17859428c8216c9e9f", "91ae7904841c44e4a61b608a4e960c50", "dfe9b54e2db0411580b8151b7dad22d1", "40696d24b5594853a1ad7f8a2663a581", "76e3edf9e227489a93823bd089b66635", "beb84ff766404983b02e0809098522a1", "aa01b2f327cd4d6699206771a0a49456", "d66a7b38c5a343fd91a707e0830de86f" ] }, "executionInfo": { "elapsed": 6412790, "status": "ok", "timestamp": 1731943371078, "user": { "displayName": "Letícia Silva", "userId": "03927092514722669722" }, "user_tz": 180 }, "id": "9bD7ea0F-GQn", "outputId": "42806561-3e72-4768-f2f7-637b40df9267" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': dataset_text_field, max_seq_length. 
Will not be supported from version '0.13.0'.\n", "\n", "Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.\n", " warnings.warn(message, FutureWarning)\n", "/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py:300: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n", "/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py:328: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`.\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ad3ac767a2c248808dee99fdbc1d2937", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/8569 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4a3618a5780d48a293c544a2de033055", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/87 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.\n", "/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py:632: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n", " return fn(*args, **kwargs)\n" ] }, { "data": { "text/html": [ "\n", "
Step | \n", "Training Loss | \n", "Validation Loss | \n", "
---|---|---|
250 | \n", "0.286900 | \n", "0.149068 | \n", "
500 | \n", "0.101400 | \n", "0.087871 | \n", "
750 | \n", "0.072500 | \n", "0.076253 | \n", "
1000 | \n", "0.063400 | \n", "0.073357 | \n", "
"
],
"text/plain": [
"