{ "cells": [ { "cell_type": "markdown", "id": "9c3e4532", "metadata": { "papermill": { "duration": 0.941841, "end_time": "2023-10-22T00:33:18.570079", "exception": false, "start_time": "2023-10-22T00:33:17.628238", "status": "completed" }, "tags": [] }, "source": [ "# Train models using HuggingFace libraries\n", "\n", "This notebook takes parameters from a params.json file, which is automatically\n", "created by the Substratus K8s operator.\n", "\n", "The following parameters influence what happens in this notebook:\n", "- `dataset_urls`: A comma-separated list of URLs. The URLs should point to\n", " JSON files that contain your training dataset. If unset, a JSON or JSONL\n", " file should be present under the `/content/data/` directory.\n", "- `prompt_template`: The prompt template to use for training.\n", "- `push_to_hub`: If this variable is set to a repo ID, the trained\n", " model will be pushed to the HuggingFace Hub. For example,\n", " set it to \"substratusai/my-model\" to publish to the substratusai HF org." ] }, { "cell_type": "code", "execution_count": 1, "id": "86ccd646", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:33:20.446183Z", "iopub.status.busy": "2023-10-22T00:33:20.445407Z", "iopub.status.idle": "2023-10-22T00:33:20.458365Z", "shell.execute_reply": "2023-10-22T00:33:20.457645Z" }, "papermill": { "duration": 0.922166, "end_time": "2023-10-22T00:33:20.459935", "exception": false, "start_time": "2023-10-22T00:33:19.537769", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "{'dataset_urls': 'https://huggingface.co/datasets/weaviate/WithRetrieval-Random-Train-80/resolve/main/WithRetrieval-Random-Train-80.json',\n", " 'inference_prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. 
The GraphQL should be valid for Weaviate.\\n\\nOnly use the API reference to understand the syntax of the request.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## API reference\\n{apiRef}\\n\\n## Answer\\n```graphql\\n',\n", " 'logging_steps': 50,\n", " 'modules_to_save': 'embed_tokens, lm_head',\n", " 'num_train_epochs': 3,\n", " 'per_device_eval_batch_size': 1,\n", " 'per_device_train_batch_size': 1,\n", " 'prompt_template': '## Instruction\\nYour task is to write GraphQL for the Natural Language Query provided. Use the provided API reference and Schema to generate the GraphQL. The GraphQL should be valid for Weaviate.\\n\\nOnly use the API reference to understand the syntax of the request.\\n\\n## Natural Language Query\\n{nlcommand}\\n\\n## Schema\\n{schema}\\n\\n## API reference\\n{apiRef}\\n\\n## Answer\\n{output}\\n',\n", " 'push_to_hub': 'substratusai/wgql-WithRetrieval-Random-Train-80',\n", " 'save_steps': 50,\n", " 'target_modules': 'q_proj, up_proj, o_proj, k_proj, down_proj, gate_proj, v_proj',\n", " 'warmup_steps': 100}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import json\n", "from pathlib import Path\n", "\n", "params = {}\n", "params_path = Path(\"/content/params.json\")\n", "if params_path.is_file():\n", " with params_path.open(\"r\", encoding=\"UTF-8\") as params_file:\n", " params = json.load(params_file)\n", "\n", "\n", "params" ] }, { "cell_type": "code", "execution_count": 2, "id": "9fafd16b-d8c9-47bf-9116-c27b1d43a019", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:33:22.506977Z", "iopub.status.busy": "2023-10-22T00:33:22.506580Z", "iopub.status.idle": "2023-10-22T00:33:25.872338Z", "shell.execute_reply": "2023-10-22T00:33:25.871610Z" }, "papermill": { "duration": 4.499567, "end_time": "2023-10-22T00:33:25.873924", "exception": false, "start_time": "2023-10-22T00:33:21.374357", "status": "completed" }, "tags": [] }, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "Using the following URLs for the dataset: ['https://huggingface.co/datasets/weaviate/WithRetrieval-Random-Train-80/resolve/main/WithRetrieval-Random-Train-80.json']\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "54ab7cdf53f047abbc1942959916933d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/1 [00:00\n" ] } ], "source": [ "default_prompt = \"\"\"\n", "Below is an instruction that describes a task. { "execution": { "iopub.execute_input": "2023-10-22T00:38:00.598168Z", "iopub.status.busy": "2023-10-22T00:38:00.597836Z", "iopub.status.idle": "2023-10-22T00:38:00.619438Z", "shell.execute_reply": "2023-10-22T00:38:00.618759Z" }, "papermill": { "duration": 1.264362, "end_time": "2023-10-22T00:38:00.620931", "exception": false, "start_time": "2023-10-22T00:37:59.356569", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "LlamaTokenizerFast(name_or_path='/content/model/', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", "\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t32000: AddedToken(\"[PAD]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from typing import Dict\n", "# source: https://github.com/artidoro/qlora\n", "DEFAULT_PAD_TOKEN = params.get(\"pad_token\", \"[PAD]\")\n", "\n", "def 
smart_tokenizer_and_embedding_resize(\n", " special_tokens_dict: Dict,\n", " tokenizer: transformers.PreTrainedTokenizer,\n", " model: transformers.PreTrainedModel,\n", "):\n", " \"\"\"Resize tokenizer and embedding.\n", "\n", " Note: This is the unoptimized version that may make your embedding size not be divisible by 64.\n", " \"\"\"\n", " num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)\n", " model.resize_token_embeddings(len(tokenizer))\n", " if num_new_tokens > 0:\n", " input_embeddings_data = model.get_input_embeddings().weight.data\n", " output_embeddings_data = model.get_output_embeddings().weight.data\n", "\n", " input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", " output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)\n", "\n", " input_embeddings_data[-num_new_tokens:] = input_embeddings_avg\n", " output_embeddings_data[-num_new_tokens:] = output_embeddings_avg\n", "\n", "if tokenizer._pad_token is None:\n", " smart_tokenizer_and_embedding_resize(\n", " special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),\n", " tokenizer=tokenizer,\n", " model=model,\n", " )\n", "\n", "if isinstance(tokenizer, transformers.LlamaTokenizer):\n", " # LLaMA tokenizer may not have correct special tokens set.\n", " # Check and add them if missing to prevent them from being parsed into different tokens.\n", " # Note that these are present in the vocabulary.\n", " # Note also that `model.config.pad_token_id` is 0 which corresponds to `` token.\n", " print('Adding special tokens.')\n", " tokenizer.add_special_tokens({\n", " \"eos_token\": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),\n", " \"bos_token\": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),\n", " \"unk_token\": tokenizer.convert_ids_to_tokens(\n", " model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id\n", " ),\n", " })\n", "\n", "tokenizer" ] }, { "cell_type": 
"code", "execution_count": 8, "id": "e78b510d", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:38:04.767476Z", "iopub.status.busy": "2023-10-22T00:38:04.766754Z", "iopub.status.idle": "2023-10-22T00:38:11.742834Z", "shell.execute_reply": "2023-10-22T00:38:11.742183Z" }, "papermill": { "duration": 7.967639, "end_time": "2023-10-22T00:38:11.744550", "exception": false, "start_time": "2023-10-22T00:38:03.776911", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "360f10151af048819a1171718b9a9448", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/3190 [00:00, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules=['q_proj', 'up_proj', 'o_proj', 'k_proj', 'down_proj', 'gate_proj', 'v_proj'], lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', modules_to_save=['embed_tokens', 'lm_head'], init_lora_weights=True, layers_to_transform=None, layers_pattern=None)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "trainable params: 564,281,344 || all params: 7,040,552,960 || trainable%: 8.01473047935144\n" ] } ], "source": [ "from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training\n", "\n", "target_modules = params.get(\"target_modules\")\n", "if target_modules:\n", " target_modules = [mod.strip() for mod in target_modules.split(\",\")]\n", "\n", "modules_to_save = params.get(\"modules_to_save\")\n", "if modules_to_save:\n", " modules_to_save = [mod.strip() for mod in modules_to_save.split(\",\")]\n", "\n", "lora_config2 = LoraConfig(\n", " r=16,\n", " lora_alpha=16,\n", " lora_dropout=0.05,\n", " bias=\"none\",\n", " task_type=\"CAUSAL_LM\",\n", " target_modules=target_modules,\n", " modules_to_save = modules_to_save\n", ")\n", "print(lora_config2)\n", "\n", "model = prepare_model_for_kbit_training(model)\n", "\n", "# add LoRA adaptor\n", 
"model = get_peft_model(model, lora_config2)\n", "model.print_trainable_parameters()" ] }, { "cell_type": "code", "execution_count": 10, "id": "70a3e36c-62cf-45aa-8f37-0db0e40857dc", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:39:50.107840Z", "iopub.status.busy": "2023-10-22T00:39:50.106880Z", "iopub.status.idle": "2023-10-22T00:39:50.125663Z", "shell.execute_reply": "2023-10-22T00:39:50.125045Z" }, "papermill": { "duration": 0.989767, "end_time": training_args = parse_training_args(params)
training_args "fsdp=[],\n", "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},\n", "fsdp_min_num_params=0,\n", "fsdp_transformer_layer_cls_to_wrap=None,\n", "full_determinism=False,\n", "gradient_accumulation_steps=4,\n", "gradient_checkpointing=False,\n", "greater_is_better=None,\n", "group_by_length=False,\n", "half_precision_backend=auto,\n", "hub_always_push=False,\n", "hub_model_id=None,\n", "hub_private_repo=False,\n", "hub_strategy=every_save,\n", "hub_token=,\n", "ignore_data_skip=False,\n", "include_inputs_for_metrics=False,\n", "include_tokens_per_second=False,\n", "jit_mode_eval=False,\n", "label_names=None,\n", "label_smoothing_factor=0.0,\n", "learning_rate=3e-05,\n", "length_column_name=length,\n", "load_best_model_at_end=False,\n", "local_rank=0,\n", "log_level=passive,\n", "log_level_replica=warning,\n", "log_on_each_node=True,\n", "logging_dir=/content/artifacts/checkpoints/runs/Oct22_00-39-50_wgqlg-withretrieval-random-train-80-v2-modeller-gdth6,\n", "logging_first_step=False,\n", "logging_nan_inf_filter=True,\n", "logging_steps=50,\n", "logging_strategy=steps,\n", "lr_scheduler_type=cosine,\n", "max_grad_norm=1.0,\n", "max_steps=-1,\n", "metric_for_best_model=None,\n", "mp_parameters=,\n", "no_cuda=False,\n", "num_train_epochs=3.0,\n", "optim=paged_adamw_32bit,\n", "optim_args=None,\n", "output_dir=/content/artifacts/checkpoints,\n", "overwrite_output_dir=False,\n", "past_index=-1,\n", "per_device_eval_batch_size=1,\n", "per_device_train_batch_size=1,\n", "prediction_loss_only=False,\n", "push_to_hub=False,\n", "push_to_hub_model_id=None,\n", "push_to_hub_organization=None,\n", "push_to_hub_token=,\n", "ray_scope=last,\n", "remove_unused_columns=True,\n", "report_to=[],\n", "resume_from_checkpoint=None,\n", "run_name=/content/artifacts/checkpoints,\n", "save_on_each_node=False,\n", "save_safetensors=False,\n", "save_steps=50,\n", "save_strategy=steps,\n", "save_total_limit=None,\n", "seed=42,\n", "sharded_ddp=[],\n", 
"skip_memory_metrics=True,\n", "tf32=None,\n", "torch_compile=False,\n", "torch_compile_backend=None,\n", "torch_compile_mode=None,\n", "torchdynamo=None,\n", "tpu_metrics_debug=False,\n", "tpu_num_cores=None,\n", "use_cpu=False,\n", "use_ipex=False,\n", "use_legacy_prediction_loop=False,\n", "use_mps_device=False,\n", "warmup_ratio=0.02,\n", "warmup_steps=100,\n", "weight_decay=0.0,\n", ")" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from utils import parse_training_args\n", "\n", "training_args = parse_training_args(params)\n", "training_args" ] }, { "cell_type": "code", "execution_count": 11, "id": "2ae3e5f9-e28e-457b-b6bf-a62a472241bf", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:39:52.733240Z", "iopub.status.busy": "2023-10-22T00:39:52.732528Z", "iopub.status.idle": "2023-10-22T00:39:52.735862Z", "shell.execute_reply": "2023-10-22T00:39:52.735243Z" }, "papermill": { "duration": 1.548798, "end_time": "2023-10-22T00:39:52.737292", "exception": false, "start_time": "2023-10-22T00:39:51.188494", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# data = data[\"train\"].train_test_split(test_size=0.1)\n", "# data\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "5bc91439-6108-445c-8f85-e6558c9f0677", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:39:54.811779Z", "iopub.status.busy": "2023-10-22T00:39:54.811180Z", "iopub.status.idle": "2023-10-22T00:39:55.100635Z", "shell.execute_reply": "2023-10-22T00:39:55.099818Z" }, "papermill": { "duration": 1.304129, "end_time": "2023-10-22T00:39:55.102252", "exception": false, "start_time": "2023-10-22T00:39:53.798123", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! mkdir -p {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": 13, "id": "b33e407a-9d4f-49f6-a74b-b80db8cc3a8a", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T00:39:57.094945Z", "iopub.status.busy": "2023-10-22T00:39:57.094146Z", "iopub.status.idle": "2023-10-22T04:52:40.595862Z", "shell.execute_reply": "2023-10-22T04:52:40.595167Z" }, "papermill": { "duration": 15165.144124, "end_time": "2023-10-22T04:52:41.273343", "exception": false, "start_time": "2023-10-22T00:39:56.129219", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a LlamaTokenizerFast tokenizer. 
\n", " \n", " \n", " [2391/2391 4:12:34, Epoch 2/3]\n", "
StepTraining Loss
StepTraining Loss

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "TrainOutput(global_step=2391, training_loss=0.07075354540412868, metrics={'train_runtime': 15162.9574, 'train_samples_per_second': 0.631, 'train_steps_per_second': 0.158, 'total_flos': 3.0420974601928704e+17, 'train_loss': 0.07075354540412868, 'epoch': 3.0})" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer = transformers.Trainer(\n", " model=model,\n", " train_dataset=data[\"train\"],\n", "# eval_dataset=data[\"test\"],\n", " args=training_args,\n", " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n", ")\n", "model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n", "\n", "checkpoint_path = Path(\"/content/artifacts/checkpoints\")\n", "\n", "# Only set resume_from_checkpoint True when directory exists and contains files\n", "resume_from_checkpoint = checkpoint_path.is_dir() and any(checkpoint_path.iterdir())\n", "if resume_from_checkpoint:\n", " print(\"Resuming from checkpoint:\", list(checkpoint_path.rglob(\"\")))\n", "trainer.train(resume_from_checkpoint=resume_from_checkpoint)" ] }, { "cell_type": "code", "execution_count": 14, "id": "172e47a7-400e-4f82-a5e3-38135ecf532f", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:52:43.424539Z", "iopub.status.busy": "2023-10-22T04:52:43.423767Z", "iopub.status.idle": "2023-10-22T04:53:03.150108Z", "shell.execute_reply": "2023-10-22T04:53:03.149387Z" }, "papermill": { "duration": 21.909882, "end_time": "2023-10-22T04:53:04.171757", "exception": false, "start_time": "2023-10-22T04:52:42.261875", "status": "completed" }, "tags": [] }, "outputs": [ { model.save_pretrained(trained_model_path_lora)
model ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (v_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (o_proj): Linear(\n", " in_features=4096, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(\n", " in_features=4096, out_features=11008, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=11008, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (up_proj): Linear(\n", " in_features=4096, out_features=11008, bias=False\n", " (lora_dropout): ModuleDict(\n", 
" (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=11008, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (down_proj): Linear(\n", " in_features=11008, out_features=4096, bias=False\n", " (lora_dropout): ModuleDict(\n", " (default): Dropout(p=0.05, inplace=False)\n", " )\n", " (lora_A): ModuleDict(\n", " (default): Linear(in_features=11008, out_features=16, bias=False)\n", " )\n", " (lora_B): ModuleDict(\n", " (default): Linear(in_features=16, out_features=4096, bias=False)\n", " )\n", " (lora_embedding_A): ParameterDict()\n", " (lora_embedding_B): ParameterDict()\n", " )\n", " (act_fn): SiLUActivation()\n", " )\n", " (input_layernorm): LlamaRMSNorm()\n", " (post_attention_layernorm): LlamaRMSNorm()\n", " )\n", " )\n", " (norm): LlamaRMSNorm()\n", " )\n", " (lm_head): ModulesToSaveWrapper(\n", " (original_module): Linear(in_features=4096, out_features=32001, bias=False)\n", " (modules_to_save): ModuleDict(\n", " (default): Linear(in_features=4096, out_features=32001, bias=False)\n", " )\n", " )\n", " )\n", " )\n", ")" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.save_pretrained(trained_model_path_lora)\n", "model" ] }, { "cell_type": "code", "execution_count": 15, "id": "dea4e68e-57a7-48bd-bad9-f03dfe3f8a06", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:53:06.109528Z", "iopub.status.busy": "2023-10-22T04:53:06.108736Z", "iopub.status.idle": "2023-10-22T04:53:06.356699Z", "shell.execute_reply": "2023-10-22T04:53:06.355856Z" }, "papermill": { "duration": 1.205767, "end_time": "2023-10-22T04:53:06.358311", "exception": false, "start_time": "2023-10-22T04:53:05.152544", "status": "completed" }, "tags": [] }, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "total 1.2G\r\n", " 512 -rw-r--r-- 1 root 3003 88 Oct 22 04:52 README.md\r\n", "1.0K -rw-r--r-- 1 root 3003 550 Oct 22 04:53 adapter_config.json\r\n", "1.2G -rw-r--r-- 1 root 3003 1.2G Oct 22 04:52 adapter_model.bin\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! ls -lash {trained_model_path_lora}" ] }, { "cell_type": "code", "execution_count": 16, "id": "09db36b7-ead6-4368-9bfb-13ba1ba800a5", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:53:08.298375Z", "iopub.status.busy": "2023-10-22T04:53:08.297543Z", "iopub.status.idle": "2023-10-22T04:54:00.325080Z", "shell.execute_reply": "2023-10-22T04:54:00.324374Z" }, "papermill": { "duration": 54.039738, "end_time": "2023-10-22T04:54:01.415212", "exception": false, "start_time": "2023-10-22T04:53:07.375474", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): Embedding(32001, 4096)\n", " (layers): ModuleList(\n", " (0-31): 32 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)\n", " (up_proj): Linear(in_features=4096, out_features=11008, 
bias=False)\n", " (down_proj): Linear(in_features=11008, out_features=4096, bias=False)\n", " (act_fn): SiLUActivation()\n", " )\n", " (input_layernorm): LlamaRMSNorm()\n", " (post_attention_layernorm): LlamaRMSNorm()\n", " )\n", " )\n", " (norm): LlamaRMSNorm()\n", " )\n", " (lm_head): Linear(in_features=4096, out_features=32001, bias=False)\n", ")" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = model.merge_and_unload().half()\n", "model" ] }, { "cell_type": "code", "execution_count": 17, "id": "270a9a72-3a12-4d83-aa7d-2d167cb28cb4", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:54:03.491633Z", "iopub.status.busy": "2023-10-22T04:54:03.490944Z", "iopub.status.idle": "2023-10-22T04:54:03.732210Z", "shell.execute_reply": "2023-10-22T04:54:03.731396Z" }, "papermill": { "duration": 1.235829, "end_time": "2023-10-22T04:54:03.733715", "exception": false, "start_time": "2023-10-22T04:54:02.497886", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 0\r\n", "drwxr-xr-x 1 root 3003 0 Oct 22 00:39 checkpoints\r\n", "drwxr-xr-x 1 root 3003 0 Oct 22 00:39 lora\r\n", "drwxr-xr-x 1 root 3003 0 Oct 22 00:33 src\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! 
ls -l {trained_model_path}" ] }, { "cell_type": "code", "execution_count": 18, "id": "260e9d79-6eb8-4516-bf8f-825a25606391", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:54:05.862618Z", "iopub.status.busy": "2023-10-22T04:54:05.861702Z", "iopub.status.idle": "2023-10-22T04:56:43.569718Z", "shell.execute_reply": "2023-10-22T04:56:43.569054Z" }, "papermill": { "duration": 159.765833, "end_time": "2023-10-22T04:56:44.594302", "exception": false, "start_time": "2023-10-22T04:54:04.828469", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "('/content/artifacts/tokenizer_config.json',\n", " '/content/artifacts/special_tokens_map.json',\n", " '/content/artifacts/tokenizer.model',\n", " '/content/artifacts/added_tokens.json',\n", " '/content/artifacts/tokenizer.json')" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.save_pretrained(trained_model_path)\n", "tokenizer.save_pretrained(trained_model_path)" ] }, { "cell_type": "code", "execution_count": 19, "id": "6d90a920-fb22-4291-8466-411ff41e31be", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:56:46.484015Z", "iopub.status.busy": "2023-10-22T04:56:46.483256Z", "iopub.status.idle": "2023-10-22T04:56:46.762038Z", "shell.execute_reply": "2023-10-22T04:56:46.761255Z" }, "papermill": { "duration": 1.264259, "end_time": "2023-10-22T04:56:46.763647", "exception": false, "start_time": "2023-10-22T04:56:45.499388", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 13G\r\n", " 512 -rw-r--r-- 1 root 3003 21 Oct 22 04:56 added_tokens.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 22 00:39 checkpoints\r\n", "1.0K -rw-r--r-- 1 root 3003 648 Oct 22 04:54 config.json\r\n", " 512 -rw-r--r-- 1 root 3003 183 Oct 22 04:54 generation_config.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 22 00:39 lora\r\n", "9.3G -rw-r--r-- 1 root 3003 9.3G Oct 22 04:54 
pytorch_model-00001-of-00002.bin\r\n", "3.3G -rw-r--r-- 1 root 3003 3.3G Oct 22 04:56 pytorch_model-00002-of-00002.bin\r\n", " 24K -rw-r--r-- 1 root 3003 24K Oct 22 04:56 pytorch_model.bin.index.json\r\n", "1.0K -rw-r--r-- 1 root 3003 552 Oct 22 04:56 special_tokens_map.json\r\n", " 0 drwxr-xr-x 1 root 3003 0 Oct 22 00:33 src\r\n", "1.8M -rw-r--r-- 1 root 3003 1.8M Oct 22 04:56 tokenizer.json\r\n", "489K -rw-r--r-- 1 root 3003 489K Oct 22 04:56 tokenizer.model\r\n", "1.5K -rw-r--r-- 1 root 3003 1.1K Oct 22 04:56 tokenizer_config.json\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "source": [ "! ls -lash {trained_model_path}" ] }, { "cell_type": "code", "execution_count": 20, "id": "202a694a", "metadata": { "execution": { "iopub.execute_input": "2023-10-22T04:56:48.598532Z", "iopub.status.busy": "2023-10-22T04:56:48.597715Z" }, "papermill": { "duration": null, "end_time": null, "exception": false, "start_time": "2023-10-22T04:56:47.688302", "status": "running" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "fd6c61a8f74449ab8cd067d50d2265ad", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Upload 2 LFS files: 0%| | 0/2 [00:00