{ "cells": [ { "cell_type": "markdown", "id": "9c3e4532", "metadata": { "papermill": { "duration": 0.941841, "end_time": "2023-10-22T00:33:18.570079", "exception": false, "start_time": "2023-10-22T00:33:17.628238", "status": "completed" }, "tags": [] }, "source": [ "# Train models using HuggingFace libraries\n", "\n", "This notebook takes parameters from a params.json file which is automatically\n", "created by Substratus K8s operator.\n", "\n", "The following parameters influence what happens in this notebook:\n", "- `dataset_urls`: A comma separated list of URLs. The URLs should point to\n", " json files that contain your training dataset. If unset a json or jsonl\n", " file should be present under the `/content/data/` directory.\n", "- `prompt_template`: The prompt template to use for training\n", "- `push_to_hub`: if this variable is set a repo id, then the trained\n", " model will get pushed to HuggingFace hub. For example,\n", " set it to \"substratusai/my-model\" to publish to substratusai HF org." ] }, { "cell_type": "code", "execution_count": 1, "id": "86ccd646", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:33:20.446183Z", "iopub.status.busy": "2023-10-22T00:33:20.445407Z", "iopub.status.idle": "2023-10-22T00:33:20.458365Z", "shell.execute_reply": "2023-10-22T00:33:20.457645Z" }, "papermill": { "duration": 0.922166, "end_time": "2023-10-22T00:33:20.459935", "exception": false, "start_time": "2023-10-22T00:33:19.537769", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "{'dataset_urls': 'https://huggingface.co/datasets/weaviate/WithRetrieval-Random-Train-80/resolve/main/WithRetrieval-Random-Train-80.json',\n", " 'inference_prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\\n\\nOnly use the API reference to understand the syntax of the request.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## API reference\\n{apiRef}\\n\\n## Answer\\n```graphql\\n',\n", " 'logging_steps': 50,\n", " 'modules_to_save': 'embed_tokens, lm_head',\n", " 'num_train_epochs': 3,\n", " 'per_device_eval_batch_size': 1,\n", " 'per_device_train_batch_size': 1,\n", " 'prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. 
The GraphQL should be valid for Weaviate.\\n\\nOnly use the API reference to understand the syntax of the request.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## API reference\\n{apiRef}\\n\\n## Answer\\n{output}\\n',\n", " 'push_to_hub': 'substratusai/wgql-WithRetrieval-Random-Train-80',\n", " 'save_steps': 50,\n", " 'target_modules': 'q_proj, up_proj, o_proj, k_proj, down_proj, gate_proj, v_proj',\n", " 'warmup_steps': 100}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import json\n", "from pathlib import Path\n", "\n", "params = {}\n", "params_path = Path(\"/content/params.json\")\n", "if params_path.is_file():\n", " with params_path.open(\"r\", encoding=\"UTF-8\") as params_file:\n", " params = json.load(params_file)\n", "\n", "\n", "params" ] }, { "cell_type": "code", "execution_count": 2, "id": "9fafd16b-d8c9-47bf-9116-c27b1d43a019", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:33:22.506977Z", "iopub.status.busy": "2023-10-22T00:33:22.506580Z", "iopub.status.idle": "2023-10-22T00:33:25.872338Z", "shell.execute_reply": "2023-10-22T00:33:25.871610Z" }, "papermill": { "duration": 4.499567, "end_time": "2023-10-22T00:33:25.873924", "exception": false, "start_time": "2023-10-22T00:33:21.374357", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using the following URLs for the dataset: ['https://huggingface.co/datasets/weaviate/WithRetrieval-Random-Train-80/resolve/main/WithRetrieval-Random-Train-80.json']\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "54ab7cdf53f047abbc1942959916933d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/1 [00:00\n" ] } ], "source": [ "default_prompt = \"\"\"\n", "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", "### Instruction:\n", "{prompt}\n", "### Response:\n", "{completion}\n", "\"\"\"\n", "\n", "prompt = params.get(\"prompt_template\", default_prompt)\n", "\n", "eos_token = tokenizer.convert_ids_to_tokens(model.config.eos_token_id)\n", "if prompt[-len(eos_token):] != eos_token:\n", " prompt = prompt + eos_token\n", "\n", "print(prompt)\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "0abf96e1-3bc1-4ae7-80ac-c2e585e9c7c1", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:37:55.546481Z", "iopub.status.busy": "2023-10-22T00:37:55.545754Z", "iopub.status.idle": "2023-10-22T00:37:56.401487Z", "shell.execute_reply": "2023-10-22T00:37:56.400701Z" }, "papermill": { "duration": 1.720149, "end_time": "2023-10-22T00:37:56.403253", "exception": false, "start_time": "2023-10-22T00:37:54.683104", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sun Oct 22 00:37:55 2023 \r\n", "+-----------------------------------------------------------------------------+\r\n", "| NVIDIA-SMI 525.105.17 Driver Version: 525.105.17 CUDA Version: 12.0 |\r\n", "|-------------------------------+----------------------+----------------------+\r\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", "| | | MIG M. 
|\r\n", "|===============================+======================+======================|\r\n", "| 0 NVIDIA L4 Off | 00000000:00:04.0 Off | 0 |\r\n", "| N/A 60C P0 31W / 72W | 3570MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", "| 1 NVIDIA L4 Off | 00000000:00:05.0 Off | 0 |\r\n", "| N/A 58C P0 30W / 72W | 4096MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "| 2 NVIDIA L4 Off | 00000000:00:06.0 Off | 0 |\r\n", "| N/A 56C P0 30W / 72W | 4096MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", "| 3 NVIDIA L4 Off | 00000000:00:07.0 Off | 0 |\r\n", "| N/A 60C P0 32W / 72W | 3570MiB / 23034MiB | 0% Default |\r\n", "| | | N/A |\r\n", "+-------------------------------+----------------------+----------------------+\r\n", " \r\n", "+-----------------------------------------------------------------------------+\r\n", "| Processes: |\r\n", "| GPU GI CI PID Type Process name GPU Memory |\r\n", "| ID ID Usage |\r\n", "|=============================================================================|\r\n", "+-----------------------------------------------------------------------------+\r\n" ] } ], "source": [ "! nvidia-smi" ] }, { "attachments": {}, "cell_type": "markdown", "id": "4d1e1795-c783-4ddf-999e-f1de19258928", "metadata": { "papermill": { "duration": 1.050693, "end_time": "2023-10-22T00:37:58.385886", "exception": false, "start_time": "2023-10-22T00:37:57.335193", "status": "completed" }, "tags": [] }, "source": [ "Prompt before fine-tuning" ] }, { "cell_type": "code", "execution_count": 7, "id": "f5dd944b-e2bd-4bfd-a5fa-55bc90239926", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:38:00.598168Z", "iopub.status.busy": "2023-10-22T00:38:00.597836Z", "iopub.status.idle": "2023-10-22T00:38:00.619438Z", "shell.execute_reply": "2023-10-22T00:38:00.618759Z" }, "papermill": { "duration": 1.264362, "end_time": "2023-10-22T00:38:00.620931", "exception": false, "start_time": "2023-10-22T00:37:59.356569", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "LlamaTokenizerFast(name_or_path='/content/model/', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", "\t0: AddedToken(\"<unk>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t1: AddedToken(\"<s>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t2: AddedToken(\"</s>\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t32000: AddedToken(\"[PAD]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from typing import Dict\n", "# source: https://github.com/artidoro/qlora\n", "DEFAULT_PAD_TOKEN = params.get(\"pad_token\", \"[PAD]\")\n", "\n", "def smart_tokenizer_and_embedding_resize(\n", "    special_tokens_dict: Dict,\n", "    tokenizer: transformers.PreTrainedTokenizer,\n", "    model: transformers.PreTrainedModel,\n", "):\n", "    
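# QLoRA helper: adds any missing special tokens, then grows both embedding matrices to match the new vocab size.\n", "    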
\"\"\"Resize tokenizer and embedding.\n", "\n", " Note: This is the unoptimized version that may make your embedding size not be divisible by 64.\n", " \"\"\"\n", " num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)\n", " model.resize_token_embeddings(len(tokenizer))\n", " if num_new_tokens > 0:\n", " input_embeddings_data = model.get_input_embeddings().weight.data\n", " output_embeddings_data = model.get_output_embeddings().weight.data\n", "\n", " input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", " output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", "\n", " input_embeddings_data[-num_new_tokens:] = input_embeddings_avg\n", " output_embeddings_data[-num_new_tokens:] = output_embeddings_avg\n", "\n", "if tokenizer._pad_token is None:\n", " smart_tokenizer_and_embedding_resize(\n", " special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),\n", " tokenizer=tokenizer,\n", " model=model,\n", " )\n", "\n", "if isinstance(tokenizer, transformers.LlamaTokenizer):\n", " # LLaMA tokenizer may not have correct special tokens set.\n", " # Check and add them if missing to prevent them from being parsed into different tokens.\n", " # Note that these are present in the vocabulary.\n", " # Note also that `model.config.pad_token_id` is 0 which corresponds to `` token.\n", " print('Adding special tokens.')\n", " tokenizer.add_special_tokens({\n", " \"eos_token\": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),\n", " \"bos_token\": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),\n", " \"unk_token\": tokenizer.convert_ids_to_tokens(\n", " model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id\n", " ),\n", " })\n", "\n", "tokenizer" ] }, { "cell_type": "code", "execution_count": 8, "id": "e78b510d", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:38:04.767476Z", "iopub.status.busy": "2023-10-22T00:38:04.766754Z", "iopub.status.idle": "2023-10-22T00:38:11.742834Z", "shell.execute_reply": "2023-10-22T00:38:11.742183Z" }, "papermill": { "duration": 7.967639, "end_time": "2023-10-22T00:38:11.744550", "exception": false, "start_time": "2023-10-22T00:38:03.776911", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "360f10151af048819a1171718b9a9448", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/3190 [00:00, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules=['q_proj', 'up_proj', 'o_proj', 'k_proj', 'down_proj', 'gate_proj', 'v_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['embed_tokens', 'lm_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "trainable params: 564,281,344 || all params: 7,040,552,960 || trainable%: 8.01473047935144\n" ] } ], "source": [ "from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training\n", "\n", "target_modules = params.get(\"target_modules\")\n", "if target_modules:\n", " target_modules = [mod.strip() for mod in target_modules.split(\",\")]\n", "\n", "modules_to_save = params.get(\"modules_to_save\")\n", "if modules_to_save:\n", " modules_to_save = [mod.strip() for mod in modules_to_save.split(\",\")]\n", "\n", "lora_config2 = LoraConfig(\n", " r=16,\n", " lora_alpha=16,\n", " 
lora_dropout=0.05,\n", "    bias=\"none\",\n", "    task_type=\"CAUSAL_LM\",\n", "    target_modules=target_modules,\n", "    modules_to_save=modules_to_save,\n", ")\n", "print(lora_config2)\n", "\n", "model = prepare_model_for_kbit_training(model)\n", "\n", "# add the LoRA adapter\n", "model = get_peft_model(model, lora_config2)\n", "model.print_trainable_parameters()" ] }, { "cell_type": "code", "execution_count": 10, "id": "70a3e36c-62cf-45aa-8f37-0db0e40857dc", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:39:50.107840Z", "iopub.status.busy": "2023-10-22T00:39:50.106880Z", "iopub.status.idle": "2023-10-22T00:39:50.125663Z", "shell.execute_reply": "2023-10-22T00:39:50.125045Z" }, "papermill": { "duration": 0.989767, "end_time": "2023-10-22T00:39:50.127644", "exception": false, "start_time": "2023-10-22T00:39:49.137877", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "TrainingArguments(\n", "_n_gpu=4,\n", "adafactor=False,\n", "adam_beta1=0.9,\n", "adam_beta2=0.999,\n", "adam_epsilon=1e-08,\n", "auto_find_batch_size=False,\n", "bf16=False,\n", "bf16_full_eval=False,\n", "data_seed=None,\n", "dataloader_drop_last=False,\n", "dataloader_num_workers=0,\n", "dataloader_pin_memory=True,\n", "ddp_backend=None,\n", "ddp_broadcast_buffers=None,\n", "ddp_bucket_cap_mb=None,\n", "ddp_find_unused_parameters=None,\n", "ddp_timeout=1800,\n", "debug=[],\n", "deepspeed=None,\n", "disable_tqdm=False,\n", "dispatch_batches=None,\n", "do_eval=False,\n", "do_predict=False,\n", "do_train=False,\n", "eval_accumulation_steps=None,\n", "eval_delay=0,\n", "eval_steps=None,\n", "evaluation_strategy=no,\n", "fp16=True,\n", "fp16_backend=auto,\n", "fp16_full_eval=False,\n", "fp16_opt_level=O1,\n", "fsdp=[],\n", "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},\n", "fsdp_min_num_params=0,\n", "fsdp_transformer_layer_cls_to_wrap=None,\n", "full_determinism=False,\n", "gradient_accumulation_steps=4,\n", "gradient_checkpointing=False,\n", "greater_is_better=None,\n", "group_by_length=False,\n", "half_precision_backend=auto,\n", "hub_always_push=False,\n", "hub_model_id=None,\n", "hub_private_repo=False,\n", "hub_strategy=every_save,\n", "hub_token=<HUB_TOKEN>,\n", "ignore_data_skip=False,\n", "include_inputs_for_metrics=False,\n", "include_tokens_per_second=False,\n", "jit_mode_eval=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=3e-05,\n", "length_column_name=length,\n", "load_best_model_at_end=False,\n", "local_rank=0,\n", "log_level=passive,\n", "log_level_replica=warning,\n", "log_on_each_node=True,\n", "logging_dir=/content/artifacts/checkpoints/runs/Oct22_00-39-50_wgqlg-withretrieval-random-train-80-v2-modeller-gdth6,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=50,\n", "logging_strategy=steps,\n", "lr_scheduler_type=cosine,\n", "max_grad_norm=1.0,\n", "max_steps=-1,\n", "metric_for_best_model=None,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=3.0,\n", "optim=paged_adamw_32bit,\n", "optim_args=None,\n", "output_dir=/content/artifacts/checkpoints,\n", "overwrite_output_dir=False,\n", "past_index=-1,\n", "per_device_eval_batch_size=1,\n", "per_device_train_batch_size=1,\n", "prediction_loss_only=False,\n", "push_to_hub=False,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n", "ray_scope=last,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", 
"run_name=/content/artifacts/checkpoints,\n", "save_on_each_node=False,\n", "save_safetensors=False,\n", "save_steps=50,\n", "save_strategy=steps,\n", "save_total_limit=None,\n", "seed=42,\n", "sharded_ddp=[],\n", "skip_memory_metrics=True,\n", "tf32=None,\n", "torch_compile=False,\n", "torch_compile_backend=None,\n", "torch_compile_mode=None,\n", "torchdynamo=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_cpu=False,\n", "use_ipex=False,\n", "use_legacy_prediction_loop=False,\n", "use_mps_device=False,\n", "warmup_ratio=0.02,\n", "warmup_steps=100,\n", "weight_decay=0.0,\n", ")" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from utils import parse_training_args\n", "\n", "training_args = parse_training_args(params)\n", "training_args" ] }, { "cell_type": "code", "execution_count": 11, "id": "2ae3e5f9-e28e-457b-b6bf-a62a472241bf", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:39:52.733240Z", "iopub.status.busy": "2023-10-22T00:39:52.732528Z", "iopub.status.idle": "2023-10-22T00:39:52.735862Z", "shell.execute_reply": "2023-10-22T00:39:52.735243Z" }, "papermill": { "duration": 1.548798, "end_time": "2023-10-22T00:39:52.737292", "exception": false, "start_time": "2023-10-22T00:39:51.188494", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# data = data[\"train\"].train_test_split(test_size=0.1)\n", "# data\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "5bc91439-6108-445c-8f85-e6558c9f0677", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:39:54.811779Z", "iopub.status.busy": "2023-10-22T00:39:54.811180Z", "iopub.status.idle": "2023-10-22T00:39:55.100635Z", "shell.execute_reply": "2023-10-22T00:39:55.099818Z" }, "papermill": { "duration": 1.304129, "end_time": "2023-10-22T00:39:55.102252", "exception": false, "start_time": "2023-10-22T00:39:53.798123", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! mkdir -p {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": 13, "id": "b33e407a-9d4f-49f6-a74b-b80db8cc3a8a", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:39:57.094945Z", "iopub.status.busy": "2023-10-22T00:39:57.094146Z", "iopub.status.idle": "2023-10-22T04:52:40.595862Z", "shell.execute_reply": "2023-10-22T04:52:40.595167Z" }, "papermill": { "duration": 15165.144124, "end_time": "2023-10-22T04:52:41.273343", "exception": false, "start_time": "2023-10-22T00:39:56.129219", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [2391/2391 4:12:34, Epoch 2/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining Loss
501.069300
1000.515300
1500.273700
2000.173300
2500.118800
3000.084200
3500.065800
4000.054500
4500.048400
5000.044200
5500.040000
6000.039400
6500.038100
7000.034100
7500.034400
8000.032600
8500.027300
9000.026700
9500.027900
10000.026800
10500.026300
11000.026900
11500.026100
12000.025400
12500.023900
13000.025000
13500.024000
14000.025600
14500.024300
15000.023100
15500.024800
16000.023300
16500.019400
17000.019600
17500.020400
18000.019600
18500.019300
19000.019600
19500.018600
20000.019400
20500.020000
21000.020300
21500.019400
22000.019300
22500.019800
23000.019300
23500.019500

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=2391, training_loss=0.07075354540412868, metrics={'train_runtime': 15162.9574, 'train_samples_per_second': 0.631, 'train_steps_per_second': 0.158, 'total_flos': 3.0420974601928704e+17, 'train_loss': 0.07075354540412868, 'epoch': 3.0})" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer = transformers.Trainer(\n", " model=model,\n", " train_dataset=data[\"train\"],\n", "# eval_dataset=data[\"test\"],\n", " args=training_args,\n", " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n", ")\n", "model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n", "\n", "checkpoint_path = Path(\"/content/artifacts/checkpoints\")\n", "\n", "# Only set resume_from_checkpoint True when directory exists and contains files\n", "resume_from_checkpoint = checkpoint_path.is_dir() and any(checkpoint_path.iterdir())\n", "if resume_from_checkpoint:\n", " print(\"Resuming from checkpoint:\", list(checkpoint_path.rglob(\"\")))\n", "trainer.train(resume_from_checkpoint=resume_from_checkpoint)" ] }, { "cell_type": "code", "execution_count": 14, "id": "172e47a7-400e-4f82-a5e3-38135ecf532f", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:52:43.424539Z", "iopub.status.busy": "2023-10-22T04:52:43.423767Z", "iopub.status.idle": "2023-10-22T04:53:03.150108Z", "shell.execute_reply": "2023-10-22T04:53:03.149387Z" }, "papermill": { "duration": 21.909882, "end_time": "2023-10-22T04:53:04.171757", "exception": false, "start_time": "2023-10-22T04:52:42.261875", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "PeftModelForCausalLM(\n", " (base_model): LoraModel(\n", " (model): LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): ModulesToSaveWrapper(\n", " (original_module): Embedding(32001, 4096)\n", " (modules_to_save): ModuleDict(\n", " (default): Embedding(32001, 4096)\n", " )\n", " )\n", " (layers): ModuleList(\n", " (0-31): 32 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (k_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (v_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): 
ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (o_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(\n", " in_features=4096, out_features=11008, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=11008, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (up_proj): Linear(\n", " in_features=4096, out_features=11008, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=11008, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (down_proj): Linear(\n", " in_features=11008, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=11008, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (act_fn): SiLUActivation()\n", " )\n", " (input_layernorm): LlamaRMSNorm()\n", " (post_attention_layernorm): LlamaRMSNorm()\n", " )\n", " )\n", " (norm): LlamaRMSNorm()\n", " )\n", " (lm_head): ModulesToSaveWrapper(\n", " (original_module): Linear(in_features=4096, out_features=32001, bias=False)\n", " (modules_to_save): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=32001, bias=False)\n", " )\n", " )\n", " )\n", " )\n", ")" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.save_pretrained(trained_model_path_lora)\n", "model" ] }, { "cell_type": "code", "execution_count": 15, "id": "dea4e68e-57a7-48bd-bad9-f03dfe3f8a06", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:53:06.109528Z", "iopub.status.busy": "2023-10-22T04:53:06.108736Z", "iopub.status.idle": "2023-10-22T04:53:06.356699Z", "shell.execute_reply": "2023-10-22T04:53:06.355856Z" }, "papermill": { "duration": 1.205767, "end_time": "2023-10-22T04:53:06.358311", "exception": false, "start_time": "2023-10-22T04:53:05.152544", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 1.2G\r\n", " 512 -rw-r--r-- 1 root 3003 88 Oct 22 04:52 README.md\r\n", "1.0K -rw-r--r-- 1 root 3003 550 Oct 22 04:53 adapter_config.json\r\n", "1.2G -rw-r--r-- 1 root 3003 1.2G Oct 22 04:52 adapter_model.bin\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: 
The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! ls -lash {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": 16, "id": "09db36b7-ead6-4368-9bfb-13ba1ba800a5", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:53:08.298375Z", "iopub.status.busy": "2023-10-22T04:53:08.297543Z", "iopub.status.idle": "2023-10-22T04:54:00.325080Z", "shell.execute_reply": "2023-10-22T04:54:00.324374Z" }, "papermill": { "duration": 54.039738, "end_time": "2023-10-22T04:54:01.415212", "exception": false, "start_time": "2023-10-22T04:53:07.375474", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): Embedding(32001, 4096)\n", " (layers): ModuleList(\n", " (0-31): 32 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", " (up_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", " (down_proj): Linear(in_features=11008, out_features=4096, bias=False)\n", " (act_fn): SiLUActivation()\n", " )\n", " (input_layernorm): LlamaRMSNorm()\n", " (post_attention_layernorm): LlamaRMSNorm()\n", " )\n", " )\n", " (norm): LlamaRMSNorm()\n", " )\n", " (lm_head): Linear(in_features=4096, out_features=32001, bias=False)\n", ")" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = model.merge_and_unload().half()\n", "model" ] }, { "cell_type": "code", "execution_count": 17, "id": "270a9a72-3a12-4d83-aa7d-2d167cb28cb4", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:54:03.491633Z", "iopub.status.busy": "2023-10-22T04:54:03.490944Z", "iopub.status.idle": "2023-10-22T04:54:03.732210Z", "shell.execute_reply": "2023-10-22T04:54:03.731396Z" }, "papermill": { "duration": 1.235829, "end_time": "2023-10-22T04:54:03.733715", "exception": false, "start_time": "2023-10-22T04:54:02.497886", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 0\r\n", "drwxr-xr-x 1 root 3003 0 Oct 22 00:39 checkpoints\r\n", "drwxr-xr-x 1 root 3003 0 Oct 22 00:39 lora\r\n", "drwxr-xr-x 1 root 3003 0 Oct 22 00:33 src\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! 
ls -l {trained_model_path}" ] }, { "cell_type": "code", "execution_count": 18, "id": "260e9d79-6eb8-4516-bf8f-825a25606391", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:54:05.862618Z", "iopub.status.busy": "2023-10-22T04:54:05.861702Z", "iopub.status.idle": "2023-10-22T04:56:43.569718Z", "shell.execute_reply": "2023-10-22T04:56:43.569054Z" }, "papermill": { "duration": 159.765833, "end_time": "2023-10-22T04:56:44.594302", "exception": false, "start_time": "2023-10-22T04:54:04.828469", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "('/content/artifacts/tokenizer_config.json',\n", " '/content/artifacts/special_tokens_map.json',\n", " '/content/artifacts/tokenizer.model',\n", " '/content/artifacts/added_tokens.json',\n", " '/content/artifacts/tokenizer.json')" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.save_pretrained(trained_model_path)\n", "tokenizer.save_pretrained(trained_model_path)" ] }, { "cell_type": "code", "execution_count": 19, "id": "6d90a920-fb22-4291-8466-411ff41e31be", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:56:46.484015Z", "iopub.status.busy": "2023-10-22T04:56:46.483256Z", "iopub.status.idle": "2023-10-22T04:56:46.762038Z", "shell.execute_reply": "2023-10-22T04:56:46.761255Z" }, "papermill": { "duration": 1.264259, "end_time": "2023-10-22T04:56:46.763647", "exception": false, "start_time": "2023-10-22T04:56:45.499388", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 13G\r\n", " 512 -rw-r--r-- 1 root 3003 21 Oct 22 04:56 added_tokens.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 22 00:39 checkpoints\r\n", "1.0K -rw-r--r-- 1 root 3003 648 Oct 22 04:54 config.json\r\n", " 512 -rw-r--r-- 1 root 3003 183 Oct 22 04:54 generation_config.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 22 00:39 lora\r\n", "9.3G -rw-r--r-- 1 root 3003 9.3G Oct 22 04:54 pytorch_model-00001-of-00002.bin\r\n", "3.3G -rw-r--r-- 1 root 3003 3.3G Oct 22 04:56 pytorch_model-00002-of-00002.bin\r\n", " 24K -rw-r--r-- 1 root 3003 24K Oct 22 04:56 pytorch_model.bin.index.json\r\n", "1.0K -rw-r--r-- 1 root 3003 552 Oct 22 04:56 special_tokens_map.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 22 00:33 src\r\n", "1.8M -rw-r--r-- 1 root 3003 1.8M Oct 22 04:56 tokenizer.json\r\n", "489K -rw-r--r-- 1 root 3003 489K Oct 22 04:56 tokenizer.model\r\n", "1.5K -rw-r--r-- 1 root 3003 1.1K Oct 22 04:56 tokenizer_config.json\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! 
ls -lash {trained_model_path}" ] }, { "cell_type": "code", "execution_count": 20, "id": "202a694a", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:56:48.598532Z", "iopub.status.busy": "2023-10-22T04:56:48.597715Z" }, "papermill": { "duration": null, "end_time": null, "exception": false, "start_time": "2023-10-22T04:56:47.688302", "status": "running" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fd6c61a8f74449ab8cd067d50d2265ad", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Upload 2 LFS files: 0%| | 0/2 [00:00