{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "53a990e3-0d47-4e66-b928-f40d67f06584",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "51fb0d43-c12b-4892-95d2-074bf5de0ce2",
   "metadata": {},
   "source": [
    "## Install additional packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9cf48779-454b-4b1d-b78f-531a1b207276",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Detect the Google Cloud Notebook product via its environment marker file.\n",
    "IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists(\"/opt/deeplearning/metadata/env_version\")\n",
    "\n",
    "# Google Cloud Notebook requires dependencies to be installed with '--user';\n",
    "# everywhere else no extra pip flag is passed.\n",
    "USER_FLAG = \"--user\" if IS_GOOGLE_CLOUD_NOTEBOOK else \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d2a3556a-ebf1-49c7-9d2c-63e30ca45f73",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "!pip -q install {USER_FLAG} --upgrade transformers\n",
    "!pip -q install {USER_FLAG} --upgrade datasets\n",
    "!pip -q install {USER_FLAG} --upgrade tqdm\n",
    "!pip -q install {USER_FLAG} --upgrade cloudml-hypertune"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "fcc3f1f6-36d3-4056-ad29-b69c57bb0bac",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "!pip -q install {USER_FLAG} --upgrade google-cloud-aiplatform"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "2214d165-356d-47f1-a4ee-4f6c50027e96",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Restart the kernel after the installs above, unless IS_TESTING is set.\n",
    "if not os.getenv(\"IS_TESTING\"):\n",
    "    import IPython\n",
    "\n",
    "    IPython.Application.instance().kernel.do_shutdown(True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e8817443-c80e-475b-b54e-dd834c040b12",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "!pip install git+https://github.com/huggingface/transformers.git datasets pandas torch\n",
    "!pip install transformers[torch]\n",
    "!pip install accelerate -U"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21cc7690-95bf-4452-abef-46cd318ccfb5",
   "metadata": {},
   "source": [
    "## Set Project ID"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "30b78533-ff39-4c92-a365-f2e05ddb642f",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Project ID:  ikame-gem-ai-research\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "PROJECT_ID = \"iKame-gem-ai-research\"  # <---CHANGE THIS TO YOUR PROJECT\n",
    "\n",
    "# Outside of testing, take the project ID reported by google.auth.default().\n",
    "if not os.getenv(\"IS_TESTING\"):\n",
    "    import google.auth\n",
    "\n",
    "    _, PROJECT_ID = google.auth.default()\n",
    "    print(\"Project ID: \", PROJECT_ID)\n",
    "\n",
    "# Warn when the project ID is empty, None, or still the placeholder value.\n",
    "if not PROJECT_ID or PROJECT_ID == \"iKame-gem-ai-research\":\n",
    "    print(\n",
    "        f\"Please set your project id before proceeding to next step. Currently it's set as {PROJECT_ID}\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5c4631f5-c8ba-43e9-a623-08cb2cb3a51a",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TIMESTAMP = 20240108040502\n"
     ]
    }
   ],
   "source": [
    "from datetime import datetime\n",
    "\n",
    "\n",
    "def get_timestamp():\n",
    "    \"\"\"Return the current local time formatted as a 14-digit YYYYmmddHHMMSS string.\"\"\"\n",
    "    now = datetime.now()\n",
    "    return now.strftime(\"%Y%m%d%H%M%S\")\n",
    "\n",
    "\n",
    "# Captured once so later cells can share the same run identifier.\n",
    "TIMESTAMP = get_timestamp()\n",
    "print(f\"TIMESTAMP = {TIMESTAMP}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "494d8009-7f9a-45d8-ba7c-3e3205d1c96b",
   "metadata": {},
   "source": [
    "## Create Cloud Storage bucket"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "303136a0-6334-4889-b43b-9f171a934311",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "BUCKET_NAME = \"gs://iKame-gem-ai-research\"  # <---CHANGE THIS TO YOUR BUCKET\n",
    "REGION = \"us-central1\"  # @param {type:\"string\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "014c6208-0b1a-4da8-888b-19c02a112474",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Fall back to a project-derived bucket when none is set or the placeholder is still in place.\n",
    "if not BUCKET_NAME or BUCKET_NAME == \"gs://iKame-gem-ai-research\":\n",
    "    BUCKET_NAME = f\"gs://{PROJECT_ID}-bucket-review\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a52a28fa-591e-487c-bd53-8f770441ba63",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PROJECT_ID = ikame-gem-ai-research\n",
      "BUCKET_NAME = gs://ikame-gem-ai-research-bucket-review\n",
      "REGION = us-central1\n"
     ]
    }
   ],
   "source": [
    "print(f\"PROJECT_ID = {PROJECT_ID}\")\n",
    "print(f\"BUCKET_NAME = {BUCKET_NAME}\")\n",
    "print(f\"REGION = {REGION}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "24c35eb2-7619-4958-a04a-79b62788f257",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# ! gsutil mb -l $REGION $BUCKET_NAME"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6f2ee0a0-3cff-47cb-9379-6f6e75fef9d5",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      3078  2024-01-05T01:42:25Z  gs://ikame-gem-ai-research-bucket-review/batch_examples.csv#1704418945853255  metageneration=1\n",
      "                                 gs://ikame-gem-ai-research-bucket-review/pipeline_root/\n",
      "TOTAL: 1 objects, 3078 bytes (3.01 KiB)\n"
     ]
    }
   ],
   "source": [
    "! gsutil ls -al $BUCKET_NAME #validate access to your Cloud Storage bucket"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "da865a4c-5e29-465e-abf2-e443dae1b573",
   "metadata": {},
   "source": [
    "## Install libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "fedbebaf-516e-4f7d-8a70-c7dc31de02df",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import base64\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "import sys\n",
    "\n",
    "import google.auth\n",
    "from google.cloud import aiplatform\n",
    "from google.cloud.aiplatform import gapic as aip\n",
    "from google.cloud.aiplatform import hyperparameter_tuning as hpt\n",
    "from google.protobuf.json_format import MessageToDict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0cc75279-b7a9-47cc-81a4-f8729c7d57f8",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from IPython.display import HTML, display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8856c9f3-270f-4dca-8a10-6bdee1af8bc0",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import datasets\n",
    "from datasets import Dataset, DatasetDict\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import torch\n",
    "import transformers\n",
    "from datasets import ClassLabel, Sequence, load_dataset\n",
    "from transformers import (AutoModelForSequenceClassification, AutoTokenizer,BertForSequenceClassification,\n",
    "                          EvalPrediction, Trainer, TrainingArguments,PreTrainedModel,BertModel,\n",
    "                          default_data_collator)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "bbecdaa8-3cd3-4e7b-939d-f959da9301d6",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from google.cloud import bigquery\n",
    "from google.cloud import storage\n",
    "\n",
    "client = bigquery.Client()\n",
    "storage_client = storage.Client()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f693060f-c0ed-4ec3-bc66-17898f8ef854",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Notebook runtime: GPU\n",
      "PyTorch version : 2.0.0+cu118\n",
      "Transformers version : 2.16.1\n",
      "Datasets version : 4.37.0.dev0\n"
     ]
    }
   ],
   "source": [
    "# Report the runtime and library versions used for this run.\n",
    "# Each label is paired with the matching module's __version__.\n",
    "print(f\"Notebook runtime: {'GPU' if torch.cuda.is_available() else 'CPU'}\")\n",
    "print(f\"PyTorch version : {torch.__version__}\")\n",
    "print(f\"Transformers version : {transformers.__version__}\")\n",
    "print(f\"Datasets version : {datasets.__version__}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "5637d9f0-d290-4107-974a-bfbda3b316b2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "3d114e96-31c2-4ed9-82d1-f2fab38f0944",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "APP_NAME = \"aift-review-classificatio-multiple-label\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "173dcb77-9908-4af1-86bb-7811c9f580e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `!cd` runs in a temporary subshell, so the kernel's working directory\n",
    "# is not changed by this line; use `%cd` if a persistent directory change is intended.\n",
    "!cd aift-model-review-multiple-label-classification"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f383051-501f-4f8c-8017-c989c5740041",
   "metadata": {},
   "source": [
    "# Training"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db9715cc-0779-47a4-a0ed-82714b6668f6",
   "metadata": {},
   "source": [
    "## Preprocess data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "052ecc7b-c015-49a0-a359-85afbac10bbf",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Checkpoint name shared by the tokenizer and the models loaded further down.\n",
    "model_ckpt = \"distilbert-base-uncased\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_ckpt)\n",
    "\n",
    "\n",
    "def tokenize_and_encode(examples):\n",
    "    \"\"\"Tokenize the ``review`` field of ``examples`` with truncation enabled.\"\"\"\n",
    "    reviews = examples[\"review\"]\n",
    "    return tokenizer(reviews, truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "6f5faf02-ede8-4d48-b94a-1d4619c8e610",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7a2415bdfd4a40fe80afe71e70d97976",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/556 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3b1c36309d4e4e108e79578edc45ed56",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/140 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2b79b69e8457427781c8e6fc8ad54d82",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/556 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e1e4981003d04646944fa0ce8ae0dc73",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/140 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Query the whole training table from BigQuery and load it into a pandas DataFrame.\n",
    "sql = f\"\"\"\n",
    "SELECT * FROM `ikame-gem-ai-research.AIFT.reviews_multi_label_training`\n",
    "\"\"\"\n",
    "data = client.query(sql).to_dataframe()\n",
    "# Missing values become '0' so that every non-review column can be cast to int below.\n",
    "data= data.fillna('0')\n",
    "for i in data.columns:\n",
    "    if i != 'review':\n",
    "        data[i] = data[i].astype(int)\n",
    "\n",
    "# 80/20 shuffled train/test split with a fixed seed.\n",
    "data = Dataset.from_pandas(data).train_test_split(test_size=0.2,shuffle = True, seed=0)\n",
    "# `cols` is captured before `labels` is added, so removing `cols` later keeps `labels`.\n",
    "cols = data[\"train\"].column_names\n",
    "# Gather every non-review column into one list-valued `labels` field per example\n",
    "# (presumably one 0/1 flag per label - confirm against the table schema).\n",
    "data = data.map(lambda x : {\"labels\": [x[c] for c in cols if c != \"review\"]})\n",
    "\n",
    "# Tokenize and encode; the original columns are dropped from the encoded dataset.\n",
    "dataset = data.map(tokenize_and_encode, batched=True, remove_columns=cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "f56a7de9-19a4-4cc8-996d-857c491cf633",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ads', 'bugs', 'positive', 'negative', 'graphic', 'gameplay', 'request']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label names are every feature except the text column and the packed `labels` list.\n",
    "labels = [name for name in data['train'].features if name not in ('review', 'labels')]\n",
    "# Index <-> name lookups, both following the order of `labels`.\n",
    "id2label = dict(enumerate(labels))\n",
    "label2id = {name: idx for idx, name in id2label.items()}\n",
    "labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "ad182dbc-c63d-49c9-b53c-9b63996d3746",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'labels': [0, 1, 0, 0, 0, 1, 0],\n",
       " 'input_ids': [101,\n",
       "  8795,\n",
       "  11100,\n",
       "  2024,\n",
       "  10599,\n",
       "  2030,\n",
       "  11829,\n",
       "  5999,\n",
       "  1010,\n",
       "  2437,\n",
       "  14967,\n",
       "  25198,\n",
       "  1012,\n",
       "  102],\n",
       " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset[\"train\"][0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02c2a7b2-58f1-4eac-ac61-5d54dbdc1184",
   "metadata": {},
   "source": [
    "## Fine-tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9452f6f3-2b4b-4ee7-8c9f-3c42e04e396f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# SequenceClassifierOutput is returned by forward() but was never imported in this\n",
    "# notebook, so the return_dict path raised NameError; import it next to its only use.\n",
    "from transformers.modeling_outputs import SequenceClassifierOutput\n",
    "\n",
    "\n",
    "class BertForMultilabelSequenceClassification(BertForSequenceClassification):\n",
    "    \"\"\"BERT sequence classifier whose loss is ``BCEWithLogitsLoss``.\n",
    "\n",
    "    Only ``forward`` differs from ``BertForSequenceClassification``: the encoder,\n",
    "    dropout and classifier modules are the inherited ones.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, config):\n",
    "        super().__init__(config)\n",
    "\n",
    "    def forward(\n",
    "        self,\n",
    "        input_ids=None,\n",
    "        attention_mask=None,\n",
    "        token_type_ids=None,\n",
    "        position_ids=None,\n",
    "        head_mask=None,\n",
    "        inputs_embeds=None,\n",
    "        labels=None,\n",
    "        output_attentions=None,\n",
    "        output_hidden_states=None,\n",
    "        return_dict=None,\n",
    "    ):\n",
    "        \"\"\"Run the encoder and classification head, optionally computing the loss.\n",
    "\n",
    "        Args:\n",
    "            labels: Optional targets. They are cast to float and viewed as\n",
    "                ``(-1, num_labels)`` before being passed to ``BCEWithLogitsLoss``.\n",
    "            return_dict: Falls back to ``config.use_return_dict`` when ``None``.\n",
    "            All remaining arguments are forwarded unchanged to ``self.bert``.\n",
    "\n",
    "        Returns:\n",
    "            A ``SequenceClassifierOutput`` when ``return_dict`` is truthy; otherwise a\n",
    "            tuple of ``(logits, *outputs[2:])`` with ``loss`` prepended when labels\n",
    "            were supplied.\n",
    "        \"\"\"\n",
    "        return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n",
    "\n",
    "        outputs = self.bert(\n",
    "            input_ids,\n",
    "            attention_mask=attention_mask,\n",
    "            token_type_ids=token_type_ids,\n",
    "            position_ids=position_ids,\n",
    "            head_mask=head_mask,\n",
    "            inputs_embeds=inputs_embeds,\n",
    "            output_attentions=output_attentions,\n",
    "            output_hidden_states=output_hidden_states,\n",
    "            return_dict=return_dict,\n",
    "        )\n",
    "\n",
    "        # Index 1 of the encoder output is fed through dropout and the classifier head.\n",
    "        pooled_output = outputs[1]\n",
    "        pooled_output = self.dropout(pooled_output)\n",
    "        logits = self.classifier(pooled_output)\n",
    "\n",
    "        loss = None\n",
    "        if labels is not None:\n",
    "            loss_fct = torch.nn.BCEWithLogitsLoss()\n",
    "            loss = loss_fct(\n",
    "                logits.view(-1, self.num_labels),\n",
    "                labels.float().view(-1, self.num_labels),\n",
    "            )\n",
    "\n",
    "        if not return_dict:\n",
    "            output = (logits,) + outputs[2:]\n",
    "            return ((loss,) + output) if loss is not None else output\n",
    "\n",
    "        return SequenceClassifierOutput(\n",
    "            loss=loss,\n",
    "            logits=logits,\n",
    "            hidden_states=outputs.hidden_states,\n",
    "            attentions=outputs.attentions,\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "76035010-b10a-4398-8a85-feaa19414ca4",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.\n",
      "Some weights of BertForMultilabelSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['encoder.layer.11.attention.self.key.bias', 'encoder.layer.6.attention.output.LayerNorm.bias', 'encoder.layer.3.attention.output.LayerNorm.bias', 'encoder.layer.11.attention.self.query.weight', 'encoder.layer.6.attention.self.value.bias', 'encoder.layer.4.output.LayerNorm.bias', 'encoder.layer.4.attention.self.key.bias', 'encoder.layer.9.output.LayerNorm.weight', 'encoder.layer.11.attention.self.query.bias', 'encoder.layer.11.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.4.output.LayerNorm.weight', 'classifier.weight', 'encoder.layer.8.output.dense.bias', 'encoder.layer.9.attention.self.key.bias', 'encoder.layer.5.attention.self.key.bias', 'encoder.layer.5.intermediate.dense.bias', 'encoder.layer.3.attention.output.LayerNorm.weight', 'encoder.layer.7.attention.output.dense.bias', 'encoder.layer.1.attention.output.LayerNorm.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.6.attention.output.LayerNorm.weight', 'encoder.layer.11.output.LayerNorm.bias', 'embeddings.token_type_embeddings.weight', 'encoder.layer.3.intermediate.dense.weight', 'encoder.layer.4.attention.self.key.weight', 'encoder.layer.11.attention.output.LayerNorm.weight', 'encoder.layer.6.intermediate.dense.weight', 'encoder.layer.9.attention.self.value.weight', 'embeddings.position_embeddings.weight', 'encoder.layer.10.attention.self.query.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.10.attention.self.key.weight', 'encoder.layer.2.attention.output.dense.bias', 'encoder.layer.3.attention.self.key.weight', 'encoder.layer.7.output.LayerNorm.bias', 'encoder.layer.2.attention.output.dense.weight', 'encoder.layer.5.attention.output.dense.weight', 'encoder.layer.8.attention.output.LayerNorm.bias', 'encoder.layer.2.intermediate.dense.bias', 
'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.4.intermediate.dense.weight', 'encoder.layer.6.output.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.7.intermediate.dense.bias', 'encoder.layer.7.attention.self.value.bias', 'encoder.layer.6.attention.self.query.bias', 'encoder.layer.7.output.LayerNorm.weight', 'encoder.layer.3.attention.self.value.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.2.attention.self.query.weight', 'encoder.layer.8.attention.output.dense.bias', 'encoder.layer.5.output.dense.bias', 'encoder.layer.9.attention.output.dense.bias', 'encoder.layer.9.attention.self.value.bias', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.3.attention.output.dense.weight', 'encoder.layer.6.attention.self.key.bias', 'encoder.layer.1.attention.self.query.bias', 'encoder.layer.11.attention.self.value.weight', 'encoder.layer.10.intermediate.dense.weight', 'encoder.layer.5.attention.self.key.weight', 'encoder.layer.7.intermediate.dense.weight', 'encoder.layer.2.attention.self.key.bias', 'encoder.layer.7.output.dense.weight', 'encoder.layer.1.attention.output.LayerNorm.weight', 'encoder.layer.10.output.LayerNorm.bias', 'encoder.layer.5.output.LayerNorm.weight', 'encoder.layer.7.attention.output.dense.weight', 'encoder.layer.10.attention.output.LayerNorm.weight', 'encoder.layer.6.attention.output.dense.weight', 'encoder.layer.9.attention.self.query.weight', 'encoder.layer.10.attention.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.10.attention.output.dense.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.5.output.dense.weight', 'encoder.layer.5.attention.self.query.weight', 'classifier.bias', 'encoder.layer.5.intermediate.dense.weight', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.attention.output.dense.bias', 'encoder.layer.3.attention.self.query.weight', 
'encoder.layer.8.output.LayerNorm.bias', 'encoder.layer.3.output.dense.weight', 'encoder.layer.10.attention.self.value.weight', 'encoder.layer.6.output.dense.weight', 'encoder.layer.8.intermediate.dense.bias', 'encoder.layer.0.output.dense.bias', 'encoder.layer.4.attention.self.value.bias', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.4.attention.output.dense.bias', 'pooler.dense.bias', 'encoder.layer.10.attention.self.value.bias', 'encoder.layer.6.attention.self.key.weight', 'encoder.layer.10.attention.self.query.weight', 'encoder.layer.7.attention.output.LayerNorm.weight', 'encoder.layer.11.attention.self.value.bias', 'encoder.layer.10.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.9.attention.output.LayerNorm.bias', 'encoder.layer.11.attention.output.dense.weight', 'encoder.layer.7.attention.self.value.weight', 'encoder.layer.1.attention.self.value.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.9.attention.self.query.bias', 'embeddings.LayerNorm.weight', 'encoder.layer.5.attention.output.LayerNorm.bias', 'encoder.layer.1.output.dense.bias', 'encoder.layer.11.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.6.attention.self.value.weight', 'embeddings.LayerNorm.bias', 'encoder.layer.2.attention.self.value.bias', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.11.attention.self.key.weight', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.3.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.11.attention.output.LayerNorm.bias', 'encoder.layer.9.output.dense.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.4.output.dense.bias', 'encoder.layer.5.attention.self.value.weight', 'encoder.layer.9.output.dense.bias', 'encoder.layer.11.attention.output.dense.bias', 'encoder.layer.8.output.LayerNorm.weight', 
'encoder.layer.0.attention.self.value.weight', 'encoder.layer.10.output.dense.weight', 'encoder.layer.9.output.LayerNorm.bias', 'encoder.layer.8.attention.self.query.weight', 'encoder.layer.9.intermediate.dense.weight', 'encoder.layer.10.output.LayerNorm.weight', 'encoder.layer.8.attention.self.value.bias', 'encoder.layer.1.attention.self.query.weight', 'encoder.layer.2.attention.output.LayerNorm.bias', 'encoder.layer.3.output.dense.bias', 'encoder.layer.4.attention.output.dense.weight', 'encoder.layer.5.output.LayerNorm.bias', 'encoder.layer.2.attention.self.key.weight', 'encoder.layer.5.attention.output.dense.bias', 'encoder.layer.11.output.dense.weight', 'encoder.layer.3.attention.self.query.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.6.attention.output.dense.bias', 'encoder.layer.7.output.dense.bias', 'encoder.layer.2.attention.output.LayerNorm.weight', 'encoder.layer.6.output.LayerNorm.bias', 'encoder.layer.10.output.dense.bias', 'pooler.dense.weight', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.3.output.LayerNorm.bias', 'encoder.layer.3.attention.self.value.weight', 'encoder.layer.5.attention.output.LayerNorm.weight', 'encoder.layer.6.attention.self.query.weight', 'encoder.layer.8.attention.self.query.bias', 'encoder.layer.2.attention.self.query.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.4.attention.output.LayerNorm.bias', 'encoder.layer.8.attention.output.LayerNorm.weight', 'encoder.layer.9.attention.output.dense.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.1.attention.self.key.weight', 'encoder.layer.3.attention.self.key.bias', 'encoder.layer.4.attention.self.query.weight', 'encoder.layer.7.attention.self.key.bias', 'encoder.layer.8.attention.self.key.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.1.attention.output.dense.weight', 'encoder.layer.4.intermediate.dense.bias', 'encoder.layer.8.attention.self.key.bias', 'encoder.layer.7.attention.self.query.bias', 
'encoder.layer.1.attention.self.key.bias', 'encoder.layer.4.output.dense.weight', 'encoder.layer.4.attention.self.query.bias', 'encoder.layer.3.attention.output.dense.bias', 'encoder.layer.4.attention.self.value.weight', 'encoder.layer.4.attention.output.LayerNorm.weight', 'encoder.layer.9.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.7.attention.self.key.weight', 'encoder.layer.5.attention.self.query.bias', 'encoder.layer.8.intermediate.dense.weight', 'encoder.layer.8.attention.self.value.weight', 'encoder.layer.8.attention.output.dense.weight', 'encoder.layer.7.attention.output.LayerNorm.bias', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.attention.self.value.bias', 'encoder.layer.2.attention.self.value.weight', 'encoder.layer.8.output.dense.weight', 'encoder.layer.11.output.LayerNorm.weight', 'encoder.layer.9.attention.self.key.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.6.intermediate.dense.bias', 'encoder.layer.6.output.LayerNorm.weight', 'encoder.layer.7.attention.self.query.weight', 'encoder.layer.5.attention.self.value.bias', 'encoder.layer.10.attention.output.dense.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "# Size the classification head from the label list instead of a hard-coded 7, and\n",
    "# fall back to CPU when CUDA is unavailable so the cell does not fail on CPU runtimes.\n",
    "# NOTE(review): model_ckpt is a DistilBERT checkpoint while this class is BERT-based;\n",
    "# the recorded run warns that the BERT weights are newly initialized - confirm intent.\n",
    "num_labels = len(labels)\n",
    "model = BertForMultilabelSequenceClassification.from_pretrained(\n",
    "    model_ckpt, num_labels=num_labels\n",
    ").to('cuda' if torch.cuda.is_available() else 'cpu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "74af900d-0688-4f7b-b8f2-56f36f467a06",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True):\n",
    "    \"\"\"Return the fraction of elements where the thresholded prediction equals the target.\n",
    "\n",
    "    Both inputs are NumPy arrays. When ``sigmoid`` is true the predictions are passed\n",
    "    through a sigmoid before being compared against ``thresh``.\n",
    "    \"\"\"\n",
    "    scores = torch.from_numpy(y_pred)\n",
    "    targets = torch.from_numpy(y_true).bool()\n",
    "    if sigmoid:\n",
    "        scores = scores.sigmoid()\n",
    "    hits = (scores > thresh) == targets\n",
    "    return hits.float().mean().item()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "db202a97-61e1-4e43-bb93-20179c2c0aa2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def compute_metrics(eval_pred):\n",
    "    \"\"\"Unpack ``(predictions, labels)`` and report ``accuracy_thresh`` in a metrics dict.\"\"\"\n",
    "    raw_predictions, targets = eval_pred\n",
    "    score = accuracy_thresh(raw_predictions, targets)\n",
    "    return {'accuracy_thresh': score}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "e0ab370a-fc4d-460b-9dab-dbde755dc3f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "class MultilabelTrainer(Trainer):\n",
    "    \"\"\"Trainer whose loss is ``BCEWithLogitsLoss`` over all labels.\"\"\"\n",
    "\n",
    "    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):\n",
    "        \"\"\"Compute the multi-label loss for one batch.\n",
    "\n",
    "        Args:\n",
    "            model: Model to run; called with every entry of ``inputs`` except ``labels``.\n",
    "            inputs: Batch dict; ``labels`` is popped from it before the forward pass.\n",
    "            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.\n",
    "            num_items_in_batch: Accepted and ignored. Newer ``transformers`` releases\n",
    "                pass this argument to ``compute_loss``; without it the override\n",
    "                raises ``TypeError`` on those versions.\n",
    "        \"\"\"\n",
    "        labels = inputs.pop(\"labels\")\n",
    "        outputs = model(**inputs)\n",
    "        logits = outputs.logits\n",
    "        num_labels = self.model.config.num_labels\n",
    "        loss_fct = torch.nn.BCEWithLogitsLoss()\n",
    "        loss = loss_fct(logits.view(-1, num_labels),\n",
    "                        labels.float().view(-1, num_labels))\n",
    "        return (loss, outputs) if return_outputs else loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "340ade6d-1eb1-47ec-b8e6-56371083e361",
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 8\n",
    "\n",
    "args = TrainingArguments(\n",
    "    output_dir=\"aift-model-review-multiple-label-classification\",\n",
    "    evaluation_strategy = \"epoch\",\n",
    "    learning_rate=2e-5,\n",
    "    per_device_train_batch_size=batch_size,\n",
    "    per_device_eval_batch_size=batch_size,\n",
    "    num_train_epochs=10,\n",
    "    weight_decay=0.01,\n",
    "    use_cpu = False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "39d8e955-9ca8-463c-899a-bd3b1d5f2c0e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "# Rebinds `model`: the BertForMultilabelSequenceClassification instance created earlier\n",
    "# is replaced by the architecture AutoModel selects for `model_ckpt`.\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to('cuda')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "3cb96e02-f0f7-4a0a-9fe6-f88fe89826f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = MultilabelTrainer(\n",
    "    model,\n",
    "    args,\n",
    "    train_dataset=dataset[\"train\"],\n",
    "    eval_dataset=dataset[\"test\"],\n",
    "    compute_metrics=compute_metrics,\n",
    "    tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "da79a882-f1f1-41a5-b4dd-98b070012c4c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='36' max='18' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [18/18 00:06]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 0.7062913179397583,\n",
       " 'eval_accuracy_thresh': 0.4561224579811096,\n",
       " 'eval_runtime': 0.2818,\n",
       " 'eval_samples_per_second': 496.847,\n",
       " 'eval_steps_per_second': 63.88}"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.evaluate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "eeefe348-a66f-4e14-9844-da6f3f3ebd80",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='700' max='700' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [700/700 00:47, Epoch 10/10]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Accuracy Thresh</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>No log</td>\n",
       "      <td>0.415191</td>\n",
       "      <td>0.868367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>No log</td>\n",
       "      <td>0.302631</td>\n",
       "      <td>0.901020</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>No log</td>\n",
       "      <td>0.240627</td>\n",
       "      <td>0.928571</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>No log</td>\n",
       "      <td>0.217601</td>\n",
       "      <td>0.931633</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>No log</td>\n",
       "      <td>0.203845</td>\n",
       "      <td>0.924490</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>No log</td>\n",
       "      <td>0.192444</td>\n",
       "      <td>0.929592</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>No log</td>\n",
       "      <td>0.190031</td>\n",
       "      <td>0.926531</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>0.265200</td>\n",
       "      <td>0.186760</td>\n",
       "      <td>0.928571</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>0.265200</td>\n",
       "      <td>0.180436</td>\n",
       "      <td>0.936735</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>0.265200</td>\n",
       "      <td>0.179821</td>\n",
       "      <td>0.934694</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Checkpoint destination directory aift-model-review-multiple-label-classification/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=700, training_loss=0.22303315843854632, metrics={'train_runtime': 47.1667, 'train_samples_per_second': 117.88, 'train_steps_per_second': 14.841, 'total_flos': 55632988457664.0, 'train_loss': 0.22303315843854632, 'epoch': 10.0})"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "d9c2e1e1-c20e-48e5-8f6b-e4e3222899a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mkdir: cannot create directory ‘./models’: File exists\n"
     ]
    }
   ],
   "source": [
    "saved_model_local_path = \"./models\"\n",
    "# !mkdir ./aift-model-review-multiple-label-classification/models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "c6632c17-49e2-4823-abae-a286fa06f8c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.save_model(saved_model_local_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "4af413bf-9c9d-46aa-b75b-f729c8aae546",
   "metadata": {},
   "outputs": [],
   "source": [
    "history = trainer.evaluate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "6ee5c718-6b27-4ed8-993b-dd41468cf16a",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 0.1798214465379715,\n",
       " 'eval_accuracy_thresh': 0.9346938729286194,\n",
       " 'eval_runtime': 0.2965,\n",
       " 'eval_samples_per_second': 472.249,\n",
       " 'eval_steps_per_second': 60.718,\n",
       " 'epoch': 10.0}"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "history"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "948a6110-48c3-42f5-8950-d4dc3cfc21a5",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c835ed1d2ac74d3995f59f351a5933bd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3efca8d4-a40f-40e0-b628-e9f1718b519d",
   "metadata": {},
   "source": [
    "## Predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "08a4759d-ab64-4112-ae27-4f1c4998e269",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(text, threshold):\n",
    "    \"\"\"Print and return the labels whose sigmoid probability reaches `threshold`.\n",
    "\n",
    "    Relies on the notebook-level `tokenizer`, `trainer` and `id2label`.\n",
    "\n",
    "    Args:\n",
    "        text: Input string to classify.\n",
    "        threshold: Minimum per-label probability for a label to be predicted.\n",
    "\n",
    "    Returns:\n",
    "        List of predicted label names.\n",
    "    \"\"\"\n",
    "    model = trainer.model\n",
    "    # Inference mode: make sure dropout is disabled before scoring.\n",
    "    model.eval()\n",
    "\n",
    "    encoding = tokenizer(text, return_tensors=\"pt\")\n",
    "    encoding = {k: v.to(model.device) for k, v in encoding.items()}\n",
    "\n",
    "    # No gradients are needed for prediction, so skip building the autograd graph.\n",
    "    with torch.no_grad():\n",
    "        logits = model(**encoding).logits\n",
    "\n",
    "    # apply sigmoid + threshold; squeeze(0) drops only the batch dimension\n",
    "    probs = torch.sigmoid(logits.squeeze(0)).cpu()\n",
    "    print(probs)\n",
    "    # turn predicted id's into actual label names\n",
    "    predicted_labels = [\n",
    "        id2label[idx] for idx, prob in enumerate(probs.tolist()) if prob >= threshold\n",
    "    ]\n",
    "    print(predicted_labels)\n",
    "    return predicted_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "136f3624-d752-4e62-ae67-c52c8c7413b0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0. 0. 0. 0. 0. 0. 0.]\n",
      "tensor([0.9740, 0.0251, 0.1409, 0.7609, 0.0359, 0.0374, 0.0321],\n",
      "       grad_fn=<SigmoidBackward0>)\n",
      "['ads', 'negative']\n"
     ]
    }
   ],
   "source": [
    "text = \"a lot of ads\"\n",
    "predict(text,0.4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "4bdd8052-5c6f-4148-a5cd-bbd5e42aa640",
   "metadata": {},
   "outputs": [],
   "source": [
    "label_text = id2label\n",
    "model_name_or_path=model_ckpt\n",
    "saved_model_path = saved_model_local_path\n",
    "\n",
    "\n",
    "def predict_(input_text, saved_model_path, threshold):\n",
    "    \"\"\"Classify `input_text` with the model stored at `saved_model_path`.\n",
    "\n",
    "    The tokenizer is loaded from the notebook-level `model_name_or_path` and the\n",
    "    model from `saved_model_path` on every call. Labels whose sigmoid probability\n",
    "    reaches `threshold` are printed and returned.\n",
    "\n",
    "    Args:\n",
    "        input_text: Input string to classify.\n",
    "        saved_model_path: Location passed to `from_pretrained` to load the model.\n",
    "        threshold: Minimum per-label probability for a label to be predicted.\n",
    "\n",
    "    Returns:\n",
    "        List of predicted label names.\n",
    "    \"\"\"\n",
    "    # initialize tokenizer\n",
    "    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)\n",
    "\n",
    "    # preprocess and encode input text\n",
    "    predict_input = tokenizer(\n",
    "        input_text,\n",
    "        padding=\"max_length\",\n",
    "        max_length=128,\n",
    "        truncation=True,\n",
    "        return_tensors=\"pt\",\n",
    "    )\n",
    "\n",
    "    # load trained model\n",
    "    loaded_model = AutoModelForSequenceClassification.from_pretrained(saved_model_path)\n",
    "    loaded_model.eval()\n",
    "\n",
    "    # get predictions\n",
    "    # Pass the full encoding rather than only input_ids: the input is padded to\n",
    "    # max_length, and without attention_mask the padding tokens would be\n",
    "    # attended to and distort the scores.\n",
    "    with torch.no_grad():\n",
    "        logits = loaded_model(**predict_input).logits\n",
    "\n",
    "    # apply sigmoid + threshold; squeeze(0) drops only the batch dimension\n",
    "    probs = torch.sigmoid(logits.squeeze(0)).cpu()\n",
    "    print(probs)\n",
    "    # turn predicted id's into actual label names\n",
    "    predicted_labels = [\n",
    "        id2label[idx] for idx, prob in enumerate(probs.tolist()) if prob >= threshold\n",
    "    ]\n",
    "    print(predicted_labels)\n",
    "    return predicted_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "48e96b48-db19-4c25-89f1-eb640c955614",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0. 0. 0. 0. 0. 0. 0.]\n",
      "tensor([0.5107, 0.1010, 0.5961, 0.2481, 0.2118, 0.1907, 0.1010],\n",
      "       grad_fn=<SigmoidBackward0>)\n",
      "['ads', 'positive']\n"
     ]
    }
   ],
   "source": [
    "text='ew a lot of ads'\n",
    "predict_(text, saved_model_path,0.4)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b8505cd-bc32-46e9-9387-a102830e62ef",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Custom training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "bba84d7d-5971-4e44-a977-268bc2b97e77",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI = (\n",
    "    \"us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-7:latest\"\n",
    ")\n",
    "\n",
    "PYTHON_PACKAGE_APPLICATION_DIR = \"python_package\"\n",
    "\n",
    "# Local path of the built sdist. It has to point into the same directory in which\n",
    "# setup.py is written and `python3 setup.py sdist` is run, otherwise a stale or\n",
    "# missing archive gets uploaded.\n",
    "source_package_file_name = f\"aift-model-review-multiple-label-classification/{PYTHON_PACKAGE_APPLICATION_DIR}/dist/trainer-0.1.tar.gz\"\n",
    "# Destination of the package in Cloud Storage.\n",
    "python_package_gcs_uri = (\n",
    "    f\"{BUCKET_NAME}/pytorch-on-gcp/{APP_NAME}/train/python_package/trainer-0.1.tar.gz\"\n",
    ")\n",
    "# Module inside the package that the training job executes.\n",
    "python_module_name = \"trainer.task\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "3610d07c-909a-470a-b3f7-2e68f3b8292e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# !mkdir ./python_package"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "id": "ecdc6201-d714-4cbe-9c1f-415857730700",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting ./aift-model-review-multiple-label-classification/python_package/setup.py\n"
     ]
    }
   ],
   "source": [
    "%%writefile ./aift-model-review-multiple-label-classification/{PYTHON_PACKAGE_APPLICATION_DIR}/setup.py\n",
    "\n",
    "# Packaging script for the `trainer` package used by the custom training job.\n",
    "from setuptools import find_packages\n",
    "from setuptools import setup\n",
    "\n",
    "# Dependencies declared through install_requires.\n",
    "REQUIRED_PACKAGES = [\n",
    "    'transformers',\n",
    "    'datasets',\n",
    "    'tqdm',\n",
    "    'cloudml-hypertune'\n",
    "]\n",
    "\n",
    "setup(\n",
    "    name='trainer',\n",
    "    version='0.1',\n",
    "    install_requires=REQUIRED_PACKAGES,\n",
    "    packages=find_packages(),\n",
    "    include_package_data=True,\n",
    "    description='Vertex AI | Training | PyTorch | Text Classification | Python Package'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "d001cdca-a207-4f23-b6e5-33106c252004",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "running sdist\n",
      "running egg_info\n",
      "creating trainer.egg-info\n",
      "writing trainer.egg-info/PKG-INFO\n",
      "writing dependency_links to trainer.egg-info/dependency_links.txt\n",
      "writing requirements to trainer.egg-info/requires.txt\n",
      "writing top-level names to trainer.egg-info/top_level.txt\n",
      "writing manifest file 'trainer.egg-info/SOURCES.txt'\n",
      "reading manifest file 'trainer.egg-info/SOURCES.txt'\n",
      "writing manifest file 'trainer.egg-info/SOURCES.txt'\n",
      "running check\n",
      "creating trainer-0.1\n",
      "creating trainer-0.1/trainer\n",
      "creating trainer-0.1/trainer.egg-info\n",
      "copying files to trainer-0.1...\n",
      "copying README.md -> trainer-0.1\n",
      "copying setup.py -> trainer-0.1\n",
      "copying trainer/__init__.py -> trainer-0.1/trainer\n",
      "copying trainer/experiment.py -> trainer-0.1/trainer\n",
      "copying trainer/metadata.py -> trainer-0.1/trainer\n",
      "copying trainer/model.py -> trainer-0.1/trainer\n",
      "copying trainer/task.py -> trainer-0.1/trainer\n",
      "copying trainer/utils.py -> trainer-0.1/trainer\n",
      "copying trainer.egg-info/PKG-INFO -> trainer-0.1/trainer.egg-info\n",
      "copying trainer.egg-info/SOURCES.txt -> trainer-0.1/trainer.egg-info\n",
      "copying trainer.egg-info/dependency_links.txt -> trainer-0.1/trainer.egg-info\n",
      "copying trainer.egg-info/requires.txt -> trainer-0.1/trainer.egg-info\n",
      "copying trainer.egg-info/top_level.txt -> trainer-0.1/trainer.egg-info\n",
      "Writing trainer-0.1/setup.cfg\n",
      "creating dist\n",
      "Creating tar archive\n",
      "removing 'trainer-0.1' (and everything under it)\n"
     ]
    }
   ],
   "source": [
    "!cd aift-model-review-multiple-label-classification/{PYTHON_PACKAGE_APPLICATION_DIR} && python3 setup.py sdist --formats=gztar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "7a296aa0-ead6-456f-a93a-657fed393bd2",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Copying file://python_package/dist/trainer-0.1.tar.gz [Content-Type=application/x-tar]...\n",
      "/ [1 files][  916.0 B/  916.0 B]                                                \n",
      "Operation completed over 1 objects/916.0 B.                                      \n"
     ]
    }
   ],
   "source": [
    "!gsutil cp {source_package_file_name} {python_package_gcs_uri}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "087fcdaa-0d99-4104-8e61-74455d4bf734",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       916  2024-01-08T07:48:19Z  gs://ikame-gem-ai-research-bucket-review/pytorch-on-gcp/aift-review-classificatio-multiple-label/train/python_package/trainer-0.1.tar.gz\n",
      "TOTAL: 1 objects, 916 bytes (916 B)\n"
     ]
    }
   ],
   "source": [
    "!gsutil ls -l {python_package_gcs_uri}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "4dce414a-063a-4952-8197-75586909e098",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# !cd {PYTHON_PACKAGE_APPLICATION_DIR} && python -m trainer.task"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "a7698349-f5f4-4032-a9b2-1fc659f4b022",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "aiplatform.init(project=PROJECT_ID, staging_bucket=BUCKET_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "112e1b67-5bb0-444a-94c6-a2f010e24fe9",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "APP_NAME=aift-review-classificatio-multiple-label\n",
      "PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI=us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-7:latest\n",
      "python_package_gcs_uri=gs://ikame-gem-ai-research-bucket-review/pytorch-on-gcp/aift-review-classificatio-multiple-label/train/python_package/trainer-0.1.tar.gz\n",
      "python_module_name=trainer.task\n"
     ]
    }
   ],
   "source": [
    "print(f\"APP_NAME={APP_NAME}\")\n",
    "print(\n",
    "    f\"PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI={PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI}\"\n",
    ")\n",
    "print(f\"python_package_gcs_uri={python_package_gcs_uri}\")\n",
    "print(f\"python_module_name={python_module_name}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "c0fa20a0-0831-49ab-9fce-423016e98db6",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "JOB_NAME=aift-review-classificatio-multiple-label-pytorch-pkg-ar-20240108075109\n"
     ]
    }
   ],
   "source": [
    "JOB_NAME = f\"{APP_NAME}-pytorch-pkg-ar-{get_timestamp()}\"\n",
    "print(f\"JOB_NAME={JOB_NAME}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "86922169-8509-48ff-acc9-c06bc9a4ecd1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Define the training job from the uploaded package, its entry module and the container image.\n",
    "job = aiplatform.CustomPythonPackageTrainingJob(\n",
    "    display_name=JOB_NAME,\n",
    "    container_uri=PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI,\n",
    "    python_package_gcs_uri=python_package_gcs_uri,\n",
    "    python_module_name=python_module_name,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "a7909b64-fedb-4da8-bc61-80b4806117d3",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training Output directory:\n",
      "gs://ikame-gem-ai-research-bucket-review/aiplatform-custom-training-2024-01-08-07:51:20.301 \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:google.cloud.aiplatform.training_jobs:Training Output directory:\n",
      "gs://ikame-gem-ai-research-bucket-review/aiplatform-custom-training-2024-01-08-07:51:20.301 \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "View Training:\n",
      "https://console.cloud.google.com/ai/platform/locations/us-central1/training/2282426366479564800?project=763889829809\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:google.cloud.aiplatform.training_jobs:View Training:\n",
      "https://console.cloud.google.com/ai/platform/locations/us-central1/training/2282426366479564800?project=763889829809\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
      "PipelineState.PIPELINE_STATE_RUNNING\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
      "PipelineState.PIPELINE_STATE_RUNNING\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "View backing custom job:\n",
      "https://console.cloud.google.com/ai/platform/locations/us-central1/training/7832101356516147200?project=763889829809\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:google.cloud.aiplatform.training_jobs:View backing custom job:\n",
      "https://console.cloud.google.com/ai/platform/locations/us-central1/training/7832101356516147200?project=763889829809\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
      "PipelineState.PIPELINE_STATE_RUNNING\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
      "PipelineState.PIPELINE_STATE_RUNNING\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
      "PipelineState.PIPELINE_STATE_RUNNING\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
      "PipelineState.PIPELINE_STATE_RUNNING\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
      "PipelineState.PIPELINE_STATE_RUNNING\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
      "PipelineState.PIPELINE_STATE_RUNNING\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
      "PipelineState.PIPELINE_STATE_RUNNING\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
      "PipelineState.PIPELINE_STATE_RUNNING\n"
     ]
    }
   ],
   "source": [
    "# Arguments passed to the training job through `args`.\n",
    "training_args = [\n",
    "    \"--num-epochs\",\n",
    "    \"2\",\n",
    "    \"--model-name\",\n",
    "    \"finetuned-bert-classifier\",\n",
    "]\n",
    "\n",
    "# Run on a single n1-standard-8 replica with one V100 accelerator.\n",
    "model = job.run(\n",
    "    args=training_args,\n",
    "    replica_count=1,\n",
    "    machine_type=\"n1-standard-8\",\n",
    "    accelerator_type=\"NVIDIA_TESLA_V100\",\n",
    "    accelerator_count=1,\n",
    "    sync=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e681913-680e-4664-9c6a-083f350915bc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "environment": {
   "kernel": "python3",
   "name": ".m114",
   "type": "gcloud",
   "uri": "gcr.io/deeplearning-platform-release/:m114"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}