Cielciel/aift-model-review-multiple-label-classification
- .ipynb_checkpoints/Aift-review-multiple-label-classification-workflow-checkpoint.ipynb +6 -0
- Aift-review-multiple-label-classification-workflow.ipynb +1613 -0
- README.md +68 -0
- config.json +42 -0
- custom_container/Dockerfile +21 -0
- custom_container/README.md +66 -0
- custom_container/scripts/train-cloud.sh +80 -0
- model.safetensors +3 -0
- python_package/README.md +58 -0
- python_package/dist/trainer-0.1.tar.gz +3 -0
- python_package/scripts/train-cloud.sh +70 -0
- python_package/setup.py +24 -0
- python_package/trainer.egg-info/PKG-INFO +8 -0
- python_package/trainer.egg-info/SOURCES.txt +13 -0
- python_package/trainer.egg-info/dependency_links.txt +1 -0
- python_package/trainer.egg-info/requires.txt +4 -0
- python_package/trainer.egg-info/top_level.txt +1 -0
- python_package/trainer/__init__.py +0 -0
- python_package/trainer/experiment.py +137 -0
- python_package/trainer/metadata.py +31 -0
- python_package/trainer/model.py +31 -0
- python_package/trainer/task.py +104 -0
- python_package/trainer/utils.py +99 -0
- runs/Jan08_04-05-34_aift-review-classification-multiple-label/events.out.tfevents.1704686768.aift-review-classification-multiple-label +3 -0
- runs/Jan08_04-07-17_aift-review-classification-multiple-label/events.out.tfevents.1704686842.aift-review-classification-multiple-label +3 -0
- runs/Jan08_04-07-17_aift-review-classification-multiple-label/events.out.tfevents.1704687081.aift-review-classification-multiple-label +3 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +55 -0
- training_args.bin +3 -0
- vocab.txt +0 -0
.ipynb_checkpoints/Aift-review-multiple-label-classification-workflow-checkpoint.ipynb
ADDED
@@ -0,0 +1,6 @@
{
 "cells": [],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}
Aift-review-multiple-label-classification-workflow.ipynb
ADDED
@@ -0,0 +1,1613 @@
# Setup

## Install additional packages

In [1]:
import os

# The Google Cloud Notebook product has specific requirements
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_GOOGLE_CLOUD_NOTEBOOK:
    USER_FLAG = "--user"

In [2]:
%%capture
!pip -q install {USER_FLAG} --upgrade transformers
!pip -q install {USER_FLAG} --upgrade datasets
!pip -q install {USER_FLAG} --upgrade tqdm
!pip -q install {USER_FLAG} --upgrade cloudml-hypertune

In [3]:
%%capture
!pip -q install {USER_FLAG} --upgrade google-cloud-aiplatform

In [4]:
# Automatically restart kernel after installs
import os

if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

In [1]:
%%capture
!pip install git+https://github.com/huggingface/transformers.git datasets pandas torch
!pip install transformers[torch]
!pip install accelerate -U
## Set Project ID

In [2]:
PROJECT_ID = "iKame-gem-ai-research" # <---CHANGE THIS TO YOUR PROJECT

import os

# Get your Google Cloud project ID using google.auth
if not os.getenv("IS_TESTING"):
    import google.auth

    _, PROJECT_ID = google.auth.default()
    print("Project ID: ", PROJECT_ID)

# validate PROJECT_ID
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "iKame-gem-ai-research":
    print(
        f"Please set your project id before proceeding to next step. Currently it's set as {PROJECT_ID}"
    )

Output:
Project ID: ikame-gem-ai-research

In [3]:
from datetime import datetime


def get_timestamp():
    return datetime.now().strftime("%Y%m%d%H%M%S")


TIMESTAMP = get_timestamp()
print(f"TIMESTAMP = {TIMESTAMP}")

Output:
TIMESTAMP = 20240108040502
## Create Cloud Storage bucket

In [4]:
BUCKET_NAME = "gs://iKame-gem-ai-research" # <---CHANGE THIS TO YOUR BUCKET
REGION = "us-central1" # @param {type:"string"}

In [5]:
if BUCKET_NAME == "" or BUCKET_NAME is None or BUCKET_NAME == "gs://iKame-gem-ai-research":
    BUCKET_NAME = f"gs://{PROJECT_ID}-bucket-review"

In [6]:
print(f"PROJECT_ID = {PROJECT_ID}")
print(f"BUCKET_NAME = {BUCKET_NAME}")
print(f"REGION = {REGION}")

Output:
PROJECT_ID = ikame-gem-ai-research
BUCKET_NAME = gs://ikame-gem-ai-research-bucket-review
REGION = us-central1

In [7]:
# ! gsutil mb -l $REGION $BUCKET_NAME

In [8]:
! gsutil ls -al $BUCKET_NAME # validate access to your Cloud Storage bucket

Output:
      3078  2024-01-05T01:42:25Z  gs://ikame-gem-ai-research-bucket-review/batch_examples.csv#1704418945853255  metageneration=1
            gs://ikame-gem-ai-research-bucket-review/pipeline_root/
TOTAL: 1 objects, 3078 bytes (3.01 KiB)
## Install libraries

In [9]:
import base64
import json
import os
import random
import sys

import google.auth
from google.cloud import aiplatform
from google.cloud.aiplatform import gapic as aip
from google.cloud.aiplatform import hyperparameter_tuning as hpt
from google.protobuf.json_format import MessageToDict

In [10]:
from IPython.display import HTML, display

In [11]:
import datasets
from datasets import Dataset, DatasetDict
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import ClassLabel, Sequence, load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, BertForSequenceClassification,
                          EvalPrediction, Trainer, TrainingArguments, PreTrainedModel, BertModel,
                          default_data_collator)

In [12]:
from google.cloud import bigquery
from google.cloud import storage

client = bigquery.Client()
storage_client = storage.Client()

In [13]:
print(f"Notebook runtime: {'GPU' if torch.cuda.is_available() else 'CPU'}")
print(f"PyTorch version : {torch.__version__}")
print(f"Transformers version : {transformers.__version__}")
print(f"Datasets version : {datasets.__version__}")

Output:
Notebook runtime: GPU
PyTorch version : 2.0.0+cu118
Transformers version : 4.37.0.dev0
Datasets version : 2.16.1

In [15]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [14]:
APP_NAME = "aift-review-classificatio-multiple-label"

In [ ]:
!cd aift-model-review-multiple-label-classification
# Training

## Preprocess data

In [16]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize_and_encode(examples):
    return tokenizer(examples["review"], truncation=True)

In [17]:
sql = f"""
SELECT * FROM `ikame-gem-ai-research.AIFT.reviews_multi_label_training`
"""
data = client.query(sql).to_dataframe()
data = data.fillna('0')
for i in data.columns:
    if i != 'review':
        data[i] = data[i].astype(int)

data = Dataset.from_pandas(data).train_test_split(test_size=0.2, shuffle=True, seed=0)
cols = data["train"].column_names
data = data.map(lambda x: {"labels": [x[c] for c in cols if c != "review"]})

# Tokenize and encode
dataset = data.map(tokenize_and_encode, batched=True, remove_columns=cols)

Output (progress widgets):
Map:   0%|          | 0/556 [00:00<?, ? examples/s]
Map:   0%|          | 0/140 [00:00<?, ? examples/s]
Map:   0%|          | 0/556 [00:00<?, ? examples/s]
Map:   0%|          | 0/140 [00:00<?, ? examples/s]

In [18]:
labels = [label for label in data['train'].features.keys() if label not in ['review', 'labels']]
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}
labels

Output:
['ads', 'bugs', 'positive', 'negative', 'graphic', 'gameplay', 'request']

In [19]:
dataset["train"][0]

Output:
{'labels': [0, 1, 0, 0, 0, 1, 0],
 'input_ids': [101, 8795, 11100, 2024, 10599, 2030, 11829, 5999, 1010, 2437, 14967, 25198, 1012, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
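Note on the preprocessing above: each row carries a multi-hot `labels` vector whose positions follow the order of the label columns. A minimal, self-contained sketch (toy review text, not the BigQuery data) of how such a vector decodes back to label names:

```python
# Toy illustration of the multi-hot encoding produced above (made-up review text).
labels = ['ads', 'bugs', 'positive', 'negative', 'graphic', 'gameplay', 'request']
id2label = {idx: label for idx, label in enumerate(labels)}

row = {"review": "keeps crashing, which ruins the gameplay",  # placeholder text
       "labels": [0, 1, 0, 0, 0, 1, 0]}                       # same shape as dataset["train"][0]

# positions set to 1 map back to label names through id2label
active = [id2label[i] for i, flag in enumerate(row["labels"]) if flag == 1]
print(active)  # ['bugs', 'gameplay']
```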
## Fine-tuning

In [20]:
from transformers.modeling_outputs import SequenceClassifierOutput  # import needed by the return_dict branch below

class BertForMultilabelSequenceClassification(BertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds,
                            output_attentions=output_attentions,
                            output_hidden_states=output_hidden_states,
                            return_dict=return_dict)

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels),
                            labels.float().view(-1, self.num_labels))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss,
                                        logits=logits,
                                        hidden_states=outputs.hidden_states,
                                        attentions=outputs.attentions)

In [21]:
num_labels = 7
model = BertForMultilabelSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to('cuda')

Output (warnings):
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForMultilabelSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['encoder.layer.11.attention.self.key.bias', 'encoder.layer.6.attention.output.LayerNorm.bias', ... (long list of embedding, encoder, pooler and classifier weights omitted)]
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
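The subclass above only swaps the loss: `BCEWithLogitsLoss` treats the 7 labels as independent sigmoid outputs instead of one softmax. Note that a DistilBERT checkpoint is being loaded into a BERT-shaped class, which is why the warning reports a long list of freshly initialized encoder weights. For comparison, a minimal sketch (assuming a reasonably recent transformers release) that gets the same multi-label loss without a custom class, and without the architecture mismatch, by setting `problem_type`:

```python
# Sketch only: transformers' built-in multi-label support
# (problem_type="multi_label_classification" makes the model use BCEWithLogitsLoss).
from transformers import AutoModelForSequenceClassification

model_ckpt = "distilbert-base-uncased"
labels = ['ads', 'bugs', 'positive', 'negative', 'graphic', 'gameplay', 'request']

model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(labels),
    problem_type="multi_label_classification",
    id2label={i: l for i, l in enumerate(labels)},
    label2id={l: i for i, l in enumerate(labels)},
)
```

Because this keeps the DistilBERT architecture, only the classification head is newly initialized.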
In [22]:
def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True):
    y_pred = torch.from_numpy(y_pred)
    y_true = torch.from_numpy(y_true)
    if sigmoid:
        y_pred = y_pred.sigmoid()
    return ((y_pred > thresh) == y_true.bool()).float().mean().item()

In [23]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return {'accuracy_thresh': accuracy_thresh(predictions, labels)}

In [24]:
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss
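To make the metric concrete: `accuracy_thresh` sigmoids the logits, thresholds them, and reports the element-wise agreement across all label slots (not exact-match per review). A small self-contained check with toy logits, reusing the function exactly as defined above:

```python
import numpy as np
import torch

def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True):
    # copied from the cell above
    y_pred = torch.from_numpy(y_pred)
    y_true = torch.from_numpy(y_true)
    if sigmoid:
        y_pred = y_pred.sigmoid()
    return ((y_pred > thresh) == y_true.bool()).float().mean().item()

logits = np.array([[ 2.0, -3.0,  0.1],    # sigmoid ~ [0.88, 0.05, 0.52]
                   [-1.0,  1.5, -2.0]])   # sigmoid ~ [0.27, 0.82, 0.12]
truth  = np.array([[1.0, 0.0, 0.0],
                   [0.0, 1.0, 0.0]])
print(accuracy_thresh(logits, truth))     # 5 of 6 slots agree -> ~0.833
```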
In [32]:
batch_size = 8

args = TrainingArguments(
    output_dir="aift-model-review-multiple-label-classification",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    use_cpu=False
)

In [33]:
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to('cuda')

Output (warnings):
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

In [34]:
trainer = MultilabelTrainer(
    model,
    args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)
In [35]:
trainer.evaluate()

Output:
[18/18 00:06]
{'eval_loss': 0.7062913179397583,
 'eval_accuracy_thresh': 0.4561224579811096,
 'eval_runtime': 0.2818,
 'eval_samples_per_second': 496.847,
 'eval_steps_per_second': 63.88}

In [36]:
trainer.train()

Output:
[700/700 00:47, Epoch 10/10]

Epoch | Training Loss | Validation Loss | Accuracy Thresh
    1 | No log        | 0.415191        | 0.868367
    2 | No log        | 0.302631        | 0.901020
    3 | No log        | 0.240627        | 0.928571
    4 | No log        | 0.217601        | 0.931633
    5 | No log        | 0.203845        | 0.924490
    6 | No log        | 0.192444        | 0.929592
    7 | No log        | 0.190031        | 0.926531
    8 | 0.265200      | 0.186760        | 0.928571
    9 | 0.265200      | 0.180436        | 0.936735
   10 | 0.265200      | 0.179821        | 0.934694

Checkpoint destination directory aift-model-review-multiple-label-classification/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.

TrainOutput(global_step=700, training_loss=0.22303315843854632, metrics={'train_runtime': 47.1667, 'train_samples_per_second': 117.88, 'train_steps_per_second': 14.841, 'total_flos': 55632988457664.0, 'train_loss': 0.22303315843854632, 'epoch': 10.0})
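The per-epoch numbers in the table above are also kept on the trainer itself; a short sketch (assuming the `trainer` object from the cells above) that pulls the evaluation history back out of `trainer.state.log_history`:

```python
# Assumes `trainer` from the cells above; log_history holds one dict per logged step/epoch.
eval_rows = [e for e in trainer.state.log_history if "eval_loss" in e]
for row in eval_rows:
    print(f"epoch {row['epoch']}: "
          f"loss={row['eval_loss']:.4f}  "
          f"accuracy_thresh={row['eval_accuracy_thresh']:.4f}")
```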
In [104]:
saved_model_local_path = "./models"
# !mkdir ./aift-model-review-multiple-label-classification/models

Output:
mkdir: cannot create directory ‘./models’: File exists

In [39]:
trainer.save_model(saved_model_local_path)

In [69]:
history = trainer.evaluate()

In [70]:
history

Output:
{'eval_loss': 0.1798214465379715,
 'eval_accuracy_thresh': 0.9346938729286194,
 'eval_runtime': 0.2965,
 'eval_samples_per_second': 472.249,
 'eval_steps_per_second': 60.718,
 'epoch': 10.0}

In [110]:
from huggingface_hub import notebook_login

notebook_login()

Output:
(Hugging Face Hub login widget)
## Predict

In [41]:
def predict(text, threshold):
    encoding = tokenizer(text, return_tensors="pt")
    encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}

    outputs = trainer.model(**encoding)
    logits = outputs.logits
    # apply sigmoid + threshold
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    predictions = np.zeros(probs.shape)
    print(predictions)
    print(probs)
    predictions[np.where(probs >= threshold)] = 1
    # turn predicted id's into actual label names
    predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
    print(predicted_labels)

In [57]:
text = "a lot of ads"
predict(text, 0.4)

Output:
[0. 0. 0. 0. 0. 0. 0.]
tensor([0.9740, 0.0251, 0.1409, 0.7609, 0.0359, 0.0374, 0.0321],
       grad_fn=<SigmoidBackward0>)
['ads', 'negative']

In [60]:
label_text = id2label
model_name_or_path = model_ckpt
saved_model_path = saved_model_local_path


def predict_(input_text, saved_model_path, threshold):
    # initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

    # preprocess and encode input text
    tokenizer_args = (input_text,)
    predict_input = tokenizer(
        *tokenizer_args,
        padding="max_length",
        max_length=128,
        truncation=True,
        return_tensors="pt",
    )

    # load trained model
    loaded_model = AutoModelForSequenceClassification.from_pretrained(saved_model_path)

    # get predictions
    output = loaded_model(predict_input["input_ids"])

    # apply sigmoid + threshold to the logits
    logits = output.logits
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    predictions = np.zeros(probs.shape)
    print(predictions)
    print(probs)
    predictions[np.where(probs >= threshold)] = 1
    # turn predicted id's into actual label names
    predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
    print(predicted_labels)

In [62]:
text = 'ew a lot of ads'
predict_(text, saved_model_path, 0.4)

Output:
[0. 0. 0. 0. 0. 0. 0.]
tensor([0.5107, 0.1010, 0.5961, 0.2481, 0.2118, 0.1907, 0.1010],
       grad_fn=<SigmoidBackward0>)
['ads', 'positive']
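Both predict helpers above score one string at a time and print intermediate arrays. A minimal batched variant, sketched under the assumption that the `trainer`, `tokenizer` and `id2label` objects defined earlier are in scope, that returns label lists instead of printing:

```python
import torch

def predict_batch(texts, threshold=0.4):
    # assumes trainer, tokenizer and id2label from the cells above
    enc = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    enc = {k: v.to(trainer.model.device) for k, v in enc.items()}
    with torch.no_grad():
        logits = trainer.model(**enc).logits
    probs = torch.sigmoid(logits).cpu().numpy()
    return [[id2label[i] for i, p in enumerate(row) if p >= threshold] for row in probs]

# e.g. predict_batch(["a lot of ads", "great graphics but it keeps crashing"])
```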
# Custom training

In [99]:
PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI = (
    "us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-7:latest"
)

PYTHON_PACKAGE_APPLICATION_DIR = "python_package"

source_package_file_name = f"pipeline/aift-model-review-multiple-label-classification/{PYTHON_PACKAGE_APPLICATION_DIR}/dist/trainer-0.1.tar.gz"
python_package_gcs_uri = (
    f"{BUCKET_NAME}/pytorch-on-gcp/{APP_NAME}/train/python_package/trainer-0.1.tar.gz"
)
python_module_name = "trainer.task"

In [100]:
# !mkdir ./python_package

In [108]:
%%writefile ./aift-model-review-multiple-label-classification/{PYTHON_PACKAGE_APPLICATION_DIR}/setup.py

from setuptools import find_packages
from setuptools import setup
import setuptools

from distutils.command.build import build as _build
import subprocess


REQUIRED_PACKAGES = [
    'transformers',
    'datasets',
    'tqdm',
    'cloudml-hypertune'
]

setup(
    name='trainer',
    version='0.1',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='Vertex AI | Training | PyTorch | Text Classification | Python Package'
)

Output:
Overwriting ./aift-model-review-multiple-label-classification/python_package/setup.py

In [109]:
!cd aift-model-review-multiple-label-classification/{PYTHON_PACKAGE_APPLICATION_DIR} && python3 setup.py sdist --formats=gztar

Output:
running sdist
running egg_info
creating trainer.egg-info
writing trainer.egg-info/PKG-INFO
writing dependency_links to trainer.egg-info/dependency_links.txt
writing requirements to trainer.egg-info/requires.txt
writing top-level names to trainer.egg-info/top_level.txt
writing manifest file 'trainer.egg-info/SOURCES.txt'
reading manifest file 'trainer.egg-info/SOURCES.txt'
writing manifest file 'trainer.egg-info/SOURCES.txt'
running check
creating trainer-0.1
creating trainer-0.1/trainer
creating trainer-0.1/trainer.egg-info
copying files to trainer-0.1...
copying README.md -> trainer-0.1
copying setup.py -> trainer-0.1
copying trainer/__init__.py -> trainer-0.1/trainer
copying trainer/experiment.py -> trainer-0.1/trainer
copying trainer/metadata.py -> trainer-0.1/trainer
copying trainer/model.py -> trainer-0.1/trainer
copying trainer/task.py -> trainer-0.1/trainer
copying trainer/utils.py -> trainer-0.1/trainer
copying trainer.egg-info/PKG-INFO -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/SOURCES.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/dependency_links.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/requires.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/top_level.txt -> trainer-0.1/trainer.egg-info
Writing trainer-0.1/setup.cfg
creating dist
Creating tar archive
removing 'trainer-0.1' (and everything under it)

In [82]:
!gsutil cp {source_package_file_name} {python_package_gcs_uri}

Output:
Copying file://python_package/dist/trainer-0.1.tar.gz [Content-Type=application/x-tar]...
/ [1 files][  916.0 B/  916.0 B]
Operation completed over 1 objects/916.0 B.
|
1314 |
+
]
|
1315 |
+
},
|
1316 |
+
{
|
1317 |
+
"cell_type": "code",
|
1318 |
+
"execution_count": 83,
|
1319 |
+
"id": "087fcdaa-0d99-4104-8e61-74455d4bf734",
|
1320 |
+
"metadata": {
|
1321 |
+
"tags": []
|
1322 |
+
},
|
1323 |
+
"outputs": [
|
1324 |
+
{
|
1325 |
+
"name": "stdout",
|
1326 |
+
"output_type": "stream",
|
1327 |
+
"text": [
|
1328 |
+
" 916 2024-01-08T07:48:19Z gs://ikame-gem-ai-research-bucket-review/pytorch-on-gcp/aift-review-classificatio-multiple-label/train/python_package/trainer-0.1.tar.gz\n",
|
1329 |
+
"TOTAL: 1 objects, 916 bytes (916 B)\n"
|
1330 |
+
]
|
1331 |
+
}
|
1332 |
+
],
|
1333 |
+
"source": [
|
1334 |
+
"!gsutil ls -l {python_package_gcs_uri}"
|
1335 |
+
]
|
1336 |
+
},
|
1337 |
+
{
|
1338 |
+
"cell_type": "code",
|
1339 |
+
"execution_count": 85,
|
1340 |
+
"id": "4dce414a-063a-4952-8197-75586909e098",
|
1341 |
+
"metadata": {
|
1342 |
+
"tags": []
|
1343 |
+
},
|
1344 |
+
"outputs": [],
|
1345 |
+
"source": [
|
1346 |
+
"# !cd {PYTHON_PACKAGE_APPLICATION_DIR} && python -m trainer.task"
|
1347 |
+
]
|
1348 |
+
},
|
1349 |
+
{
|
1350 |
+
"cell_type": "code",
|
1351 |
+
"execution_count": 86,
|
1352 |
+
"id": "a7698349-f5f4-4032-a9b2-1fc659f4b022",
|
1353 |
+
"metadata": {
|
1354 |
+
"tags": []
|
1355 |
+
},
|
1356 |
+
"outputs": [],
|
1357 |
+
"source": [
|
1358 |
+
"aiplatform.init(project=PROJECT_ID, staging_bucket=BUCKET_NAME)"
|
1359 |
+
]
|
1360 |
+
},
|
1361 |
+
{
|
1362 |
+
"cell_type": "code",
|
1363 |
+
"execution_count": 87,
|
1364 |
+
"id": "112e1b67-5bb0-444a-94c6-a2f010e24fe9",
|
1365 |
+
"metadata": {
|
1366 |
+
"tags": []
|
1367 |
+
},
|
1368 |
+
"outputs": [
|
1369 |
+
{
|
1370 |
+
"name": "stdout",
|
1371 |
+
"output_type": "stream",
|
1372 |
+
"text": [
|
1373 |
+
"APP_NAME=aift-review-classificatio-multiple-label\n",
|
1374 |
+
"PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI=us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-7:latest\n",
|
1375 |
+
"python_package_gcs_uri=gs://ikame-gem-ai-research-bucket-review/pytorch-on-gcp/aift-review-classificatio-multiple-label/train/python_package/trainer-0.1.tar.gz\n",
|
1376 |
+
"python_module_name=trainer.task\n"
|
1377 |
+
]
|
1378 |
+
}
|
1379 |
+
],
|
1380 |
+
"source": [
|
1381 |
+
"print(f\"APP_NAME={APP_NAME}\")\n",
|
1382 |
+
"print(\n",
|
1383 |
+
" f\"PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI={PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI}\"\n",
|
1384 |
+
")\n",
|
1385 |
+
"print(f\"python_package_gcs_uri={python_package_gcs_uri}\")\n",
|
1386 |
+
"print(f\"python_module_name={python_module_name}\")"
|
1387 |
+
]
|
1388 |
+
},
|
1389 |
+
{
|
1390 |
+
"cell_type": "code",
|
1391 |
+
"execution_count": 88,
|
1392 |
+
"id": "c0fa20a0-0831-49ab-9fce-423016e98db6",
|
1393 |
+
"metadata": {
|
1394 |
+
"tags": []
|
1395 |
+
},
|
1396 |
+
"outputs": [
|
1397 |
+
{
|
1398 |
+
"name": "stdout",
|
1399 |
+
"output_type": "stream",
|
1400 |
+
"text": [
|
1401 |
+
"JOB_NAME=aift-review-classificatio-multiple-label-pytorch-pkg-ar-20240108075109\n"
|
1402 |
+
]
|
1403 |
+
}
|
1404 |
+
],
|
1405 |
+
"source": [
|
1406 |
+
"JOB_NAME = f\"{APP_NAME}-pytorch-pkg-ar-{get_timestamp()}\"\n",
|
1407 |
+
"print(f\"JOB_NAME={JOB_NAME}\")"
|
1408 |
+
]
|
1409 |
+
},
|
1410 |
+
{
|
1411 |
+
"cell_type": "code",
|
1412 |
+
"execution_count": 89,
|
1413 |
+
"id": "86922169-8509-48ff-acc9-c06bc9a4ecd1",
|
1414 |
+
"metadata": {
|
1415 |
+
"tags": []
|
1416 |
+
},
|
1417 |
+
"outputs": [],
|
1418 |
+
"source": [
|
1419 |
+
"job = aiplatform.CustomPythonPackageTrainingJob(\n",
|
1420 |
+
" display_name=f\"{JOB_NAME}\",\n",
|
1421 |
+
" python_package_gcs_uri=python_package_gcs_uri,\n",
|
1422 |
+
" python_module_name=python_module_name,\n",
|
1423 |
+
" container_uri=PRE_BUILT_TRAINING_CONTAINER_IMAGE_URI,\n",
|
1424 |
+
")"
|
1425 |
+
]
|
1426 |
+
},
|
1427 |
+
{
|
1428 |
+
"cell_type": "code",
|
1429 |
+
"execution_count": 90,
|
1430 |
+
"id": "a7909b64-fedb-4da8-bc61-80b4806117d3",
|
1431 |
+
"metadata": {
|
1432 |
+
"tags": []
|
1433 |
+
},
|
1434 |
+
"outputs": [
|
1435 |
+
{
|
1436 |
+
"name": "stdout",
|
1437 |
+
"output_type": "stream",
|
1438 |
+
"text": [
|
1439 |
+
"Training Output directory:\n",
|
1440 |
+
"gs://ikame-gem-ai-research-bucket-review/aiplatform-custom-training-2024-01-08-07:51:20.301 \n"
|
1441 |
+
]
|
1442 |
+
},
|
1443 |
+
{
|
1444 |
+
"name": "stderr",
|
1445 |
+
"output_type": "stream",
|
1446 |
+
"text": [
|
1447 |
+
"INFO:google.cloud.aiplatform.training_jobs:Training Output directory:\n",
|
1448 |
+
"gs://ikame-gem-ai-research-bucket-review/aiplatform-custom-training-2024-01-08-07:51:20.301 \n"
|
1449 |
+
]
|
1450 |
+
},
|
1451 |
+
{
|
1452 |
+
"name": "stdout",
|
1453 |
+
"output_type": "stream",
|
1454 |
+
"text": [
|
1455 |
+
"View Training:\n",
|
1456 |
+
"https://console.cloud.google.com/ai/platform/locations/us-central1/training/2282426366479564800?project=763889829809\n"
|
1457 |
+
]
|
1458 |
+
},
|
1459 |
+
{
|
1460 |
+
"name": "stderr",
|
1461 |
+
"output_type": "stream",
|
1462 |
+
"text": [
|
1463 |
+
"INFO:google.cloud.aiplatform.training_jobs:View Training:\n",
|
1464 |
+
"https://console.cloud.google.com/ai/platform/locations/us-central1/training/2282426366479564800?project=763889829809\n"
|
1465 |
+
]
|
1466 |
+
},
|
1467 |
+
{
|
1468 |
+
"name": "stdout",
|
1469 |
+
"output_type": "stream",
|
1470 |
+
"text": [
|
1471 |
+
"CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
|
1472 |
+
"PipelineState.PIPELINE_STATE_RUNNING\n"
|
1473 |
+
]
|
1474 |
+
},
|
1475 |
+
{
|
1476 |
+
"name": "stderr",
|
1477 |
+
"output_type": "stream",
|
1478 |
+
"text": [
|
1479 |
+
"INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
|
1480 |
+
"PipelineState.PIPELINE_STATE_RUNNING\n"
|
1481 |
+
]
|
1482 |
+
},
|
1483 |
+
{
|
1484 |
+
"name": "stdout",
|
1485 |
+
"output_type": "stream",
|
1486 |
+
"text": [
|
1487 |
+
"View backing custom job:\n",
|
1488 |
+
"https://console.cloud.google.com/ai/platform/locations/us-central1/training/7832101356516147200?project=763889829809\n"
|
1489 |
+
]
|
1490 |
+
},
|
1491 |
+
{
|
1492 |
+
"name": "stderr",
|
1493 |
+
"output_type": "stream",
|
1494 |
+
"text": [
|
1495 |
+
"INFO:google.cloud.aiplatform.training_jobs:View backing custom job:\n",
|
1496 |
+
"https://console.cloud.google.com/ai/platform/locations/us-central1/training/7832101356516147200?project=763889829809\n"
|
1497 |
+
]
|
1498 |
+
},
|
1499 |
+
{
|
1500 |
+
"name": "stdout",
|
1501 |
+
"output_type": "stream",
|
1502 |
+
"text": [
|
1503 |
+
"CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
|
1504 |
+
"PipelineState.PIPELINE_STATE_RUNNING\n"
|
1505 |
+
]
|
1506 |
+
},
|
1507 |
+
{
|
1508 |
+
"name": "stderr",
|
1509 |
+
"output_type": "stream",
|
1510 |
+
"text": [
|
1511 |
+
"INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
|
1512 |
+
"PipelineState.PIPELINE_STATE_RUNNING\n"
|
1513 |
+
]
|
1514 |
+
},
|
1515 |
+
{
|
1516 |
+
"name": "stdout",
|
1517 |
+
"output_type": "stream",
|
1518 |
+
"text": [
|
1519 |
+
"CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
|
1520 |
+
"PipelineState.PIPELINE_STATE_RUNNING\n"
|
1521 |
+
]
|
1522 |
+
},
|
1523 |
+
{
|
1524 |
+
"name": "stderr",
|
1525 |
+
"output_type": "stream",
|
1526 |
+
"text": [
|
1527 |
+
"INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
|
1528 |
+
"PipelineState.PIPELINE_STATE_RUNNING\n"
|
1529 |
+
]
|
1530 |
+
},
|
1531 |
+
{
|
1532 |
+
"name": "stdout",
|
1533 |
+
"output_type": "stream",
|
1534 |
+
"text": [
|
1535 |
+
"CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
|
1536 |
+
"PipelineState.PIPELINE_STATE_RUNNING\n"
|
1537 |
+
]
|
1538 |
+
},
|
1539 |
+
{
|
1540 |
+
"name": "stderr",
|
1541 |
+
"output_type": "stream",
|
1542 |
+
"text": [
|
1543 |
+
"INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
|
1544 |
+
"PipelineState.PIPELINE_STATE_RUNNING\n"
|
1545 |
+
]
|
1546 |
+
},
|
1547 |
+
{
|
1548 |
+
"name": "stdout",
|
1549 |
+
"output_type": "stream",
|
1550 |
+
"text": [
|
1551 |
+
"CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
|
1552 |
+
"PipelineState.PIPELINE_STATE_RUNNING\n"
|
1553 |
+
]
|
1554 |
+
},
|
1555 |
+
{
|
1556 |
+
"name": "stderr",
|
1557 |
+
"output_type": "stream",
|
1558 |
+
"text": [
|
1559 |
+
"INFO:google.cloud.aiplatform.training_jobs:CustomPythonPackageTrainingJob projects/763889829809/locations/us-central1/trainingPipelines/2282426366479564800 current state:\n",
|
1560 |
+
"PipelineState.PIPELINE_STATE_RUNNING\n"
|
1561 |
+
]
|
1562 |
+
}
|
1563 |
+
],
|
1564 |
+
"source": [
|
1565 |
+
"training_args = [\"--num-epochs\", \"2\", \"--model-name\", \"finetuned-bert-classifier\"]\n",
|
1566 |
+
"\n",
|
1567 |
+
"model = job.run(\n",
|
1568 |
+
" replica_count=1,\n",
|
1569 |
+
" machine_type=\"n1-standard-8\",\n",
|
1570 |
+
" accelerator_type=\"NVIDIA_TESLA_V100\",\n",
|
1571 |
+
" accelerator_count=1,\n",
|
1572 |
+
" args=training_args,\n",
|
1573 |
+
" sync=False,\n",
|
1574 |
+
")"
|
1575 |
+
]
|
1576 |
+
},
|
1577 |
+
{
|
1578 |
+
"cell_type": "code",
|
1579 |
+
"execution_count": null,
|
1580 |
+
"id": "1e681913-680e-4664-9c6a-083f350915bc",
|
1581 |
+
"metadata": {},
|
1582 |
+
"outputs": [],
|
1583 |
+
"source": []
|
1584 |
+
}
|
1585 |
+
],
|
1586 |
+
"metadata": {
|
1587 |
+
"environment": {
|
1588 |
+
"kernel": "python3",
|
1589 |
+
"name": ".m114",
|
1590 |
+
"type": "gcloud",
|
1591 |
+
"uri": "gcr.io/deeplearning-platform-release/:m114"
|
1592 |
+
},
|
1593 |
+
"kernelspec": {
|
1594 |
+
"display_name": "Python 3 (ipykernel)",
|
1595 |
+
"language": "python",
|
1596 |
+
"name": "python3"
|
1597 |
+
},
|
1598 |
+
"language_info": {
|
1599 |
+
"codemirror_mode": {
|
1600 |
+
"name": "ipython",
|
1601 |
+
"version": 3
|
1602 |
+
},
|
1603 |
+
"file_extension": ".py",
|
1604 |
+
"mimetype": "text/x-python",
|
1605 |
+
"name": "python",
|
1606 |
+
"nbconvert_exporter": "python",
|
1607 |
+
"pygments_lexer": "ipython3",
|
1608 |
+
"version": "3.10.13"
|
1609 |
+
}
|
1610 |
+
},
|
1611 |
+
"nbformat": 4,
|
1612 |
+
"nbformat_minor": 5
|
1613 |
+
}
|
README.md
ADDED
@@ -0,0 +1,68 @@
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
base_model: distilbert-base-uncased
|
4 |
+
tags:
|
5 |
+
- generated_from_trainer
|
6 |
+
model-index:
|
7 |
+
- name: aift-model-review-multiple-label-classification
|
8 |
+
results: []
|
9 |
+
---
|
10 |
+
|
11 |
+
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
12 |
+
should probably proofread and complete it, then remove this comment. -->
|
13 |
+
|
14 |
+
# aift-model-review-multiple-label-classification
|
15 |
+
|
16 |
+
This model is a fine-tuned version of [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) on an unspecified dataset.
|
17 |
+
It achieves the following results on the evaluation set:
|
18 |
+
- Loss: 0.1798
|
19 |
+
- Accuracy Thresh: 0.9347
|
20 |
+
|
21 |
+
## Model description
|
22 |
+
|
23 |
+
More information needed
|
24 |
+
|
25 |
+
## Intended uses & limitations
|
26 |
+
|
27 |
+
More information needed
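
As a starting point, a minimal multi-label inference sketch is shown below. It assumes the checkpoint in this repository and reuses the 0.4 sigmoid threshold from the training notebook; the generic `LABEL_0`…`LABEL_6` names in `config.json` stand in for the actual review labels (e.g. `ads`, `positive`) used during training.

```
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Assumed model id; a local path to this checkpoint works as well.
model_id = "Cielciel/aift-model-review-multiple-label-classification"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)

inputs = tokenizer("ew a lot of ads", return_tensors="pt", truncation=True)
with torch.no_grad():
    logits = model(**inputs).logits.squeeze(0)

# Multi-label: one sigmoid per label, keep every label above the threshold.
probs = torch.sigmoid(logits)
threshold = 0.4  # value used in the training notebook
predicted = [model.config.id2label[i] for i, p in enumerate(probs) if p >= threshold]
print(predicted)
```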
|
28 |
+
|
29 |
+
## Training and evaluation data
|
30 |
+
|
31 |
+
More information needed
|
32 |
+
|
33 |
+
## Training procedure
|
34 |
+
|
35 |
+
### Training hyperparameters
|
36 |
+
|
37 |
+
The following hyperparameters were used during training:
|
38 |
+
- learning_rate: 2e-05
|
39 |
+
- train_batch_size: 8
|
40 |
+
- eval_batch_size: 8
|
41 |
+
- seed: 42
|
42 |
+
- distributed_type: tpu
|
43 |
+
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
44 |
+
- lr_scheduler_type: linear
|
45 |
+
- num_epochs: 10
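
These settings map onto `transformers.TrainingArguments` roughly as in the sketch below (a reconstruction, not the exact training script; `output_dir` is a placeholder and the TPU setup behind `distributed_type: tpu` is not shown).

```
from transformers import TrainingArguments

# Reconstruction of the listed hyperparameters.
training_args = TrainingArguments(
    output_dir="aift-model-review-multiple-label-classification",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    seed=42,
    lr_scheduler_type="linear",
    num_train_epochs=10,
    evaluation_strategy="epoch",
)
```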
|
46 |
+
|
47 |
+
### Training results
|
48 |
+
|
49 |
+
| Training Loss | Epoch | Step | Validation Loss | Accuracy Thresh |
|
50 |
+
|:-------------:|:-----:|:----:|:---------------:|:---------------:|
|
51 |
+
| No log | 1.0 | 70 | 0.4152 | 0.8684 |
|
52 |
+
| No log | 2.0 | 140 | 0.3026 | 0.9010 |
|
53 |
+
| No log | 3.0 | 210 | 0.2406 | 0.9286 |
|
54 |
+
| No log | 4.0 | 280 | 0.2176 | 0.9316 |
|
55 |
+
| No log | 5.0 | 350 | 0.2038 | 0.9245 |
|
56 |
+
| No log | 6.0 | 420 | 0.1924 | 0.9296 |
|
57 |
+
| No log | 7.0 | 490 | 0.1900 | 0.9265 |
|
58 |
+
| 0.2652 | 8.0 | 560 | 0.1868 | 0.9286 |
|
59 |
+
| 0.2652 | 9.0 | 630 | 0.1804 | 0.9367 |
|
60 |
+
| 0.2652 | 10.0 | 700 | 0.1798 | 0.9347 |
|
61 |
+
|
62 |
+
|
63 |
+
### Framework versions
|
64 |
+
|
65 |
+
- Transformers 4.37.0.dev0
|
66 |
+
- Pytorch 2.0.0+cu118
|
67 |
+
- Datasets 2.16.1
|
68 |
+
- Tokenizers 0.15.0
|
config.json
ADDED
@@ -0,0 +1,42 @@
1 |
+
{
|
2 |
+
"_name_or_path": "distilbert-base-uncased",
|
3 |
+
"activation": "gelu",
|
4 |
+
"architectures": [
|
5 |
+
"DistilBertForSequenceClassification"
|
6 |
+
],
|
7 |
+
"attention_dropout": 0.1,
|
8 |
+
"dim": 768,
|
9 |
+
"dropout": 0.1,
|
10 |
+
"hidden_dim": 3072,
|
11 |
+
"id2label": {
|
12 |
+
"0": "LABEL_0",
|
13 |
+
"1": "LABEL_1",
|
14 |
+
"2": "LABEL_2",
|
15 |
+
"3": "LABEL_3",
|
16 |
+
"4": "LABEL_4",
|
17 |
+
"5": "LABEL_5",
|
18 |
+
"6": "LABEL_6"
|
19 |
+
},
|
20 |
+
"initializer_range": 0.02,
|
21 |
+
"label2id": {
|
22 |
+
"LABEL_0": 0,
|
23 |
+
"LABEL_1": 1,
|
24 |
+
"LABEL_2": 2,
|
25 |
+
"LABEL_3": 3,
|
26 |
+
"LABEL_4": 4,
|
27 |
+
"LABEL_5": 5,
|
28 |
+
"LABEL_6": 6
|
29 |
+
},
|
30 |
+
"max_position_embeddings": 512,
|
31 |
+
"model_type": "distilbert",
|
32 |
+
"n_heads": 12,
|
33 |
+
"n_layers": 6,
|
34 |
+
"pad_token_id": 0,
|
35 |
+
"qa_dropout": 0.1,
|
36 |
+
"seq_classif_dropout": 0.2,
|
37 |
+
"sinusoidal_pos_embds": false,
|
38 |
+
"tie_weights_": true,
|
39 |
+
"torch_dtype": "float32",
|
40 |
+
"transformers_version": "4.37.0.dev0",
|
41 |
+
"vocab_size": 30522
|
42 |
+
}
|
custom_container/Dockerfile
ADDED
@@ -0,0 +1,21 @@
1 |
+
|
2 |
+
# Use pytorch GPU base image
|
3 |
+
# FROM gcr.io/cloud-aiplatform/training/pytorch-gpu.1-7
|
4 |
+
FROM us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-10:latest
|
5 |
+
|
6 |
+
# set working directory
|
7 |
+
WORKDIR /app
|
8 |
+
|
9 |
+
# Install required packages
|
10 |
+
RUN pip install google-cloud-storage transformers datasets tqdm cloudml-hypertune
|
11 |
+
|
12 |
+
# Copies the trainer code to the docker image.
|
13 |
+
COPY ./trainer/__init__.py /app/trainer/__init__.py
|
14 |
+
COPY ./trainer/experiment.py /app/trainer/experiment.py
|
15 |
+
COPY ./trainer/utils.py /app/trainer/utils.py
|
16 |
+
COPY ./trainer/metadata.py /app/trainer/metadata.py
|
17 |
+
COPY ./trainer/model.py /app/trainer/model.py
|
18 |
+
COPY ./trainer/task.py /app/trainer/task.py
|
19 |
+
|
20 |
+
# Set up the entry point to invoke the trainer.
|
21 |
+
ENTRYPOINT ["python", "-m", "trainer.task"]
|
custom_container/README.md
ADDED
@@ -0,0 +1,66 @@
1 |
+
# PyTorch Custom Containers GPU Template
|
2 |
+
|
3 |
+
## Overview
|
4 |
+
|
5 |
+
This directory provides code to fine-tune a transformer model ([BERT-base](https://huggingface.co/bert-base-cased)) from the Hugging Face Transformers library for a sentiment analysis task. [BERT](https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html) (Bidirectional Encoder Representations from Transformers) is a transformer model pre-trained on a large corpus of unlabeled text in a self-supervised fashion. In this sample, we use the [IMDB sentiment classification dataset](https://huggingface.co/datasets/imdb) for the task. We show how to package a PyTorch training application, submit it to Vertex AI using pre-built PyTorch containers, and handle Python dependencies with [Vertex Training custom containers](https://cloud.google.com/vertex-ai/docs/training/create-custom-container?hl=hr).
|
6 |
+
|
7 |
+
## Prerequisites
|
8 |
+
|
9 |
+
* Setup your project by following the instructions from [documentation](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)
|
10 |
+
* [Setup docker with Cloud Container Registry](https://cloud.google.com/container-registry/docs/pushing-and-pulling)
|
11 |
+
* Change the directory to this sample and run the commands below
|
12 |
+
|
13 |
+
`Note:` These instructions are used for local testing. When you submit a training job, no code will be executed on your local machine.
|
14 |
+
|
15 |
+
|
16 |
+
## Directory Structure
|
17 |
+
|
18 |
+
* `trainer` directory: all Python modules to train the model.
|
19 |
+
* `scripts` directory: command-line scripts to train the model on Vertex AI.
|
20 |
+
* `setup.py`: specifies the Python dependencies required for the training job. Vertex Training uses pip to install the package on the training instances allocated for the job.
|
21 |
+
|
22 |
+
### Trainer Modules
|
23 |
+
| File Name | Purpose |
|
24 |
+
| :-------- | :------ |
|
25 |
+
| [metadata.py](trainer/metadata.py) | Defines: metadata for classification task such as predefined model dataset name, target labels. |
|
26 |
+
| [utils.py](trainer/utils.py) | Includes: utility functions such as data input functions to read data, save model to GCS bucket. |
|
27 |
+
| [model.py](trainer/model.py) | Includes: function to create model with a sequence classification head from a pretrained model. |
|
28 |
+
| [experiment.py](trainer/experiment.py) | Runs the model training and evaluation experiment, and exports the final model. |
|
29 |
+
| [task.py](trainer/task.py) | Includes: 1) Initialize and parse task arguments (hyper parameters), and 2) Entry point to the trainer. |
|
30 |
+
|
31 |
+
### Scripts
|
32 |
+
|
33 |
+
* [train-cloud.sh](scripts/train-cloud.sh) This script builds your Docker image locally, pushes the image to Container Registry and submits a custom container training job to Vertex AI.
|
34 |
+
|
35 |
+
Please read the [documentation](https://cloud.google.com/vertex-ai/docs/training/containers-overview?hl=hr) on Vertex Training with Custom Containers for more details.
|
36 |
+
|
37 |
+
## How to run
|
38 |
+
|
39 |
+
Once the prerequisites are satisfied, you may:
|
40 |
+
|
41 |
+
1. For local testing, run (refer to the [notebook](../pytorch-text-classification-vertex-ai-train-tune-deploy.ipynb) for instructions):
|
42 |
+
```
|
43 |
+
CUSTOM_TRAIN_IMAGE_URI='gcr.io/{PROJECT_ID}/pytorch_gpu_train_{APP_NAME}'
|
44 |
+
cd ./custom_container/ && docker build -f Dockerfile -t $CUSTOM_TRAIN_IMAGE_URI ../python_package
|
45 |
+
docker run --gpus all -it --rm $CUSTOM_TRAIN_IMAGE_URI
|
46 |
+
```
|
47 |
+
2. For cloud testing, run:
|
48 |
+
```
|
49 |
+
source ./scripts/train-cloud.sh
|
50 |
+
```
|
51 |
+
|
52 |
+
## Run on GPU
|
53 |
+
The provided trainer code runs on a GPU if one is available, including data loading and model creation.
|
54 |
+
|
55 |
+
To run the trainer code on a different GPU configuration or a newer PyTorch pre-built container image, make the following changes to the training script:
|
56 |
+
* Update the PyTorch image URI to one of [PyTorch pre-built containers](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers#available_container_images)
|
57 |
+
* Update the [`worker-pool-spec`](https://cloud.google.com/vertex-ai/docs/training/configure-compute?hl=hr) in the gcloud command to include a GPU (a Python SDK alternative is sketched below)
|
58 |
+
|
59 |
+
Then, run the script to submit a Custom Job to Vertex AI Training:
|
60 |
+
```
|
61 |
+
source ./scripts/train-cloud.sh
|
62 |
+
```
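
If you prefer to configure the accelerator from Python rather than editing the shell script, a minimal sketch using the Vertex AI SDK is shown below. The project, bucket, and image URI are placeholders; the worker pool mirrors the one defined in `scripts/train-cloud.sh`.

```
from google.cloud import aiplatform

# Placeholders: adjust the project, region, bucket and the image pushed by the script.
aiplatform.init(project="your-project-id", location="us-central1",
                staging_bucket="gs://your-bucket-name")

job = aiplatform.CustomContainerTrainingJob(
    display_name="finetuned-bert-classifier-pytorch-cstm-cntr",
    container_uri="gcr.io/your-project-id/pytorch_gpu_train_finetuned-bert-classifier",
)

# One replica with a single V100, as in the gcloud worker-pool-spec.
job.run(
    replica_count=1,
    machine_type="n1-standard-8",
    accelerator_type="NVIDIA_TESLA_V100",  # change to the GPU you want
    accelerator_count=1,
    args=["--model-name", "finetuned-bert-classifier",
          "--job-dir", "gs://your-bucket-name/finetuned-bert-classifier"],
)
```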
|
63 |
+
|
64 |
+
### Versions
|
65 |
+
This script uses the pre-built PyTorch containers for PyTorch 1.7.
|
66 |
+
* `us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-7:latest`
|
custom_container/scripts/train-cloud.sh
ADDED
@@ -0,0 +1,80 @@
1 |
+
#!/bin/bash
|
2 |
+
# Copyright 2019 Google LLC
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
# ==============================================================================
|
16 |
+
# This script performs cloud training for a PyTorch model.
|
17 |
+
|
18 |
+
echo "Submitting PyTorch model training job to Vertex AI"
|
19 |
+
|
20 |
+
# PROJECT_ID: Change to your project id
|
21 |
+
PROJECT_ID=$(gcloud config list --format 'value(core.project)')
|
22 |
+
|
23 |
+
# BUCKET_NAME: Change to your bucket name.
|
24 |
+
BUCKET_NAME="[your-bucket-name]" # <-- CHANGE TO YOUR BUCKET NAME
|
25 |
+
|
26 |
+
# validate bucket name
|
27 |
+
if [ "${BUCKET_NAME}" = "[your-bucket-name]" ]
|
28 |
+
then
|
29 |
+
echo "[ERROR] INVALID VALUE: Please update the variable BUCKET_NAME with valid Cloud Storage bucket name. Exiting the script..."
|
30 |
+
exit 1
|
31 |
+
fi
|
32 |
+
|
33 |
+
# JOB_NAME: the name of your job running on AI Platform.
|
34 |
+
JOB_PREFIX="finetuned-bert-classifier-pytorch-cstm-cntr"
|
35 |
+
JOB_NAME=${JOB_PREFIX}-$(date +%Y%m%d%H%M%S)-custom-job
|
36 |
+
|
37 |
+
# REGION: select a region from https://cloud.google.com/vertex-ai/docs/general/locations#available_regions
|
38 |
+
# or use the default '`us-central1`'. The region is where the job will be run.
|
39 |
+
REGION="us-central1"
|
40 |
+
|
41 |
+
# JOB_DIR: Where to store prepared package and upload output model.
|
42 |
+
JOB_DIR=gs://${BUCKET_NAME}/${JOB_PREFIX}/models/${JOB_NAME}
|
43 |
+
|
44 |
+
# IMAGE_REPO_NAME: set a local repo name to distinguish our image
|
45 |
+
IMAGE_REPO_NAME=pytorch_gpu_train_finetuned-bert-classifier
|
46 |
+
|
47 |
+
# IMAGE_URI: the complete URI location for Cloud Container Registry
|
48 |
+
CUSTOM_TRAIN_IMAGE_URI=gcr.io/${PROJECT_ID}/${IMAGE_REPO_NAME}
|
49 |
+
|
50 |
+
# Build the docker image
|
51 |
+
docker build --no-cache -f Dockerfile -t $CUSTOM_TRAIN_IMAGE_URI ../python_package
|
52 |
+
|
53 |
+
# Deploy the docker image to Cloud Container Registry
|
54 |
+
docker push ${CUSTOM_TRAIN_IMAGE_URI}
|
55 |
+
|
56 |
+
# worker pool spec
|
57 |
+
worker_pool_spec="\
|
58 |
+
replica-count=1,\
|
59 |
+
machine-type=n1-standard-8,\
|
60 |
+
accelerator-type=NVIDIA_TESLA_V100,\
|
61 |
+
accelerator-count=1,\
|
62 |
+
container-image-uri=${CUSTOM_TRAIN_IMAGE_URI}"
|
63 |
+
|
64 |
+
# Submit Custom Job to Vertex AI
|
65 |
+
gcloud beta ai custom-jobs create \
|
66 |
+
--display-name=${JOB_NAME} \
|
67 |
+
--region ${REGION} \
|
68 |
+
--worker-pool-spec="${worker_pool_spec}" \
|
69 |
+
--args="--model-name","finetuned-bert-classifier","--job-dir",$JOB_DIR
|
70 |
+
|
71 |
+
echo "After the job is completed successfully, model files will be saved at $JOB_DIR/"
|
72 |
+
|
73 |
+
# uncomment following lines to monitor the job progress by streaming logs
|
74 |
+
|
75 |
+
# Stream the logs from the job
|
76 |
+
# gcloud ai custom-jobs stream-logs $(gcloud ai custom-jobs list --region=$REGION --filter="displayName:"$JOB_NAME --format="get(name)")
|
77 |
+
|
78 |
+
# # Verify the model was exported
|
79 |
+
# echo "Verify the model was exported:"
|
80 |
+
# gsutil ls ${JOB_DIR}/
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1e0601c465c2b89e2c72137736ee8a82835a3651c432dbf1e6017523f91d3b7f
|
3 |
+
size 267847948
|
python_package/README.md
ADDED
@@ -0,0 +1,58 @@
1 |
+
# PyTorch - Python Package Training
|
2 |
+
|
3 |
+
## Overview
|
4 |
+
|
5 |
+
This directory provides code to fine-tune a transformer model ([BERT-base](https://huggingface.co/bert-base-cased)) from the Hugging Face Transformers library for a sentiment analysis task. [BERT](https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html) (Bidirectional Encoder Representations from Transformers) is a transformer model pre-trained on a large corpus of unlabeled text in a self-supervised fashion. In this sample, we use the [IMDB sentiment classification dataset](https://huggingface.co/datasets/imdb) for the task. We show how to package a PyTorch training application, submit it to Vertex AI using pre-built PyTorch containers, and handle Python dependencies through a Python build script (`setup.py`).
|
6 |
+
|
7 |
+
## Prerequisites
|
8 |
+
* Setup your project by following the instructions from [documentation](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)
|
9 |
+
* Change directories to this sample.
|
10 |
+
|
11 |
+
## Directory Structure
|
12 |
+
|
13 |
+
* `trainer` directory: all Python modules to train the model.
|
14 |
+
* `scripts` directory: command-line scripts to train the model on Vertex AI.
|
15 |
+
* `setup.py`: specifies the Python dependencies required for the training job. Vertex Training uses pip to install the package on the training instances allocated for the job.
|
16 |
+
|
17 |
+
### Trainer Modules
|
18 |
+
| File Name | Purpose |
|
19 |
+
| :-------- | :------ |
|
20 |
+
| [metadata.py](trainer/metadata.py) | Defines: metadata for classification task such as predefined model dataset name, target labels. |
|
21 |
+
| [utils.py](trainer/utils.py) | Includes: utility functions such as data input functions to read data, save model to GCS bucket. |
|
22 |
+
| [model.py](trainer/model.py) | Includes: function to create model with a sequence classification head from a pretrained model. |
|
23 |
+
| [experiment.py](trainer/experiment.py) | Runs the model training and evaluation experiment, and exports the final model. |
|
24 |
+
| [task.py](trainer/task.py) | Includes: 1) Initialize and parse task arguments (hyper parameters), and 2) Entry point to the trainer. |
|
25 |
+
|
26 |
+
### Scripts
|
27 |
+
|
28 |
+
* [train-cloud.sh](scripts/train-cloud.sh) This script submits a training job to Vertex AI
|
29 |
+
|
30 |
+
## How to run
|
31 |
+
For local testing, run:
|
32 |
+
```
|
33 |
+
!cd python_package && python -m trainer.task
|
34 |
+
```
|
35 |
+
|
36 |
+
For cloud training, once the prerequisites are satisfied, update the
|
37 |
+
`BUCKET_NAME` environment variable in `scripts/train-cloud.sh`. You may then
|
38 |
+
run the following script to submit a Vertex AI training job:
|
39 |
+
```
|
40 |
+
source ./python_package/scripts/train-cloud.sh
|
41 |
+
```
|
42 |
+
|
43 |
+
## Run on GPU
|
44 |
+
The provided trainer code runs on a GPU if one is available, including data loading and model creation.
|
45 |
+
|
46 |
+
To run the trainer code on a different GPU configuration or a newer PyTorch pre-built container image, make the following changes to the training script:
|
47 |
+
* Update the PyTorch image URI to one of [PyTorch pre-built containers](https://cloud.google.com/vertex-ai/docs/training/pre-built-containers#available_container_images)
|
48 |
+
* Update the [`worker-pool-spec`](https://cloud.google.com/vertex-ai/docs/training/configure-compute?hl=hr) in the gcloud command to include a GPU (a Python SDK alternative is sketched below)
|
49 |
+
|
50 |
+
Then, run the script to submit a Custom Job to Vertex AI Training:
|
51 |
+
```
|
52 |
+
source ./scripts/train-cloud.sh
|
53 |
+
```
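
The same job can also be submitted from Python with the Vertex AI SDK, as done in the accompanying workflow notebook. In the sketch below, the project, bucket, package URI, and accelerator are placeholders to adjust.

```
from google.cloud import aiplatform

aiplatform.init(project="your-project-id", location="us-central1",
                staging_bucket="gs://your-bucket-name")

# Points at the sdist built from this directory and the pre-built PyTorch image.
job = aiplatform.CustomPythonPackageTrainingJob(
    display_name="finetuned-bert-classifier-pytorch-pkg",
    python_package_gcs_uri="gs://your-bucket-name/python_package/trainer-0.1.tar.gz",
    python_module_name="trainer.task",
    container_uri="us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-7:latest",
)

job.run(
    replica_count=1,
    machine_type="n1-standard-8",
    accelerator_type="NVIDIA_TESLA_V100",  # change to the GPU you want
    accelerator_count=1,
    args=["--num-epochs", "2", "--model-name", "finetuned-bert-classifier"],
)
```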
|
54 |
+
|
55 |
+
### Versions
|
56 |
+
This script uses the pre-built PyTorch containers for PyTorch 1.7.
|
57 |
+
* `us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-7:latest`
|
58 |
+
|
python_package/dist/trainer-0.1.tar.gz
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de00247994c728d30322eec83cc5a3976137bc7394e4213c2775c517af5163e6
|
3 |
+
size 6337
|
python_package/scripts/train-cloud.sh
ADDED
@@ -0,0 +1,70 @@
1 |
+
#!/bin/bash
|
2 |
+
# Copyright 2019 Google LLC
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
# ==============================================================================
|
16 |
+
# This script performs cloud training for a PyTorch model.
|
17 |
+
|
18 |
+
echo "Submitting Custom Job to Vertex AI to train PyTorch model"
|
19 |
+
|
20 |
+
# BUCKET_NAME: Change to your bucket name
|
21 |
+
BUCKET_NAME="[your-bucket-name]" # <-- CHANGE TO YOUR BUCKET NAME
|
22 |
+
|
23 |
+
# validate bucket name
|
24 |
+
if [ "${BUCKET_NAME}" = "[your-bucket-name]" ]
|
25 |
+
then
|
26 |
+
echo "[ERROR] INVALID VALUE: Please update the variable BUCKET_NAME with valid Cloud Storage bucket name. Exiting the script..."
|
27 |
+
exit 1
|
28 |
+
fi
|
29 |
+
|
30 |
+
# The PyTorch image provided by Vertex AI Training.
|
31 |
+
IMAGE_URI="us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-7:latest"
|
32 |
+
|
33 |
+
# JOB_NAME: the name of your job running on Vertex AI.
|
34 |
+
JOB_PREFIX="finetuned-bert-classifier-pytorch-pkg-ar"
|
35 |
+
JOB_NAME=${JOB_PREFIX}-$(date +%Y%m%d%H%M%S)-custom-job
|
36 |
+
|
37 |
+
# REGION: select a region from https://cloud.google.com/vertex-ai/docs/general/locations#available_regions
|
38 |
+
# or use the default '`us-central1`'. The region is where the job will be run.
|
39 |
+
REGION="us-central1"
|
40 |
+
|
41 |
+
# JOB_DIR: Where to store prepared package and upload output model.
|
42 |
+
JOB_DIR=gs://${BUCKET_NAME}/${JOB_PREFIX}/model/${JOB_NAME}
|
43 |
+
|
44 |
+
# worker pool spec
|
45 |
+
worker_pool_spec="\
|
46 |
+
replica-count=1,\
|
47 |
+
machine-type=n1-standard-8,\
|
48 |
+
accelerator-type=NVIDIA_TESLA_V100,\
|
49 |
+
accelerator-count=1,\
|
50 |
+
executor-image-uri=${IMAGE_URI},\
|
51 |
+
python-module=trainer.task,\
|
52 |
+
local-package-path=../python_package/"
|
53 |
+
|
54 |
+
# Submit Custom Job to Vertex AI
|
55 |
+
gcloud beta ai custom-jobs create \
|
56 |
+
--display-name=${JOB_NAME} \
|
57 |
+
--region ${REGION} \
|
58 |
+
--worker-pool-spec="${worker_pool_spec}" \
|
59 |
+
--args="--model-name","finetuned-bert-classifier","--job-dir",$JOB_DIR
|
60 |
+
|
61 |
+
echo "After the job is completed successfully, model files will be saved at $JOB_DIR/"
|
62 |
+
|
63 |
+
# uncomment following lines to monitor the job progress by streaming logs
|
64 |
+
|
65 |
+
# Stream the logs from the job
|
66 |
+
# gcloud ai custom-jobs stream-logs $(gcloud ai custom-jobs list --region=$REGION --filter="displayName:"$JOB_NAME --format="get(name)")
|
67 |
+
|
68 |
+
# # Verify the model was exported
|
69 |
+
# echo "Verify the model was exported:"
|
70 |
+
# gsutil ls ${JOB_DIR}/
|
python_package/setup.py
ADDED
@@ -0,0 +1,24 @@
1 |
+
|
2 |
+
from setuptools import find_packages
|
3 |
+
from setuptools import setup
|
4 |
+
import setuptools
|
5 |
+
|
6 |
+
from distutils.command.build import build as _build
|
7 |
+
import subprocess
|
8 |
+
|
9 |
+
|
10 |
+
REQUIRED_PACKAGES = [
|
11 |
+
'transformers',
|
12 |
+
'datasets',
|
13 |
+
'tqdm',
|
14 |
+
'cloudml-hypertune'
|
15 |
+
]
|
16 |
+
|
17 |
+
setup(
|
18 |
+
name='trainer',
|
19 |
+
version='0.1',
|
20 |
+
install_requires=REQUIRED_PACKAGES,
|
21 |
+
packages=find_packages(),
|
22 |
+
include_package_data=True,
|
23 |
+
description='Vertex AI | Training | PyTorch | Text Classification | Python Package'
|
24 |
+
)
|
python_package/trainer.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,8 @@
1 |
+
Metadata-Version: 2.1
|
2 |
+
Name: trainer
|
3 |
+
Version: 0.1
|
4 |
+
Summary: Vertex AI | Training | PyTorch | Text Classification | Python Package
|
5 |
+
Requires-Dist: transformers
|
6 |
+
Requires-Dist: datasets
|
7 |
+
Requires-Dist: tqdm
|
8 |
+
Requires-Dist: cloudml-hypertune
|
python_package/trainer.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,13 @@
1 |
+
README.md
|
2 |
+
setup.py
|
3 |
+
trainer/__init__.py
|
4 |
+
trainer/experiment.py
|
5 |
+
trainer/metadata.py
|
6 |
+
trainer/model.py
|
7 |
+
trainer/task.py
|
8 |
+
trainer/utils.py
|
9 |
+
trainer.egg-info/PKG-INFO
|
10 |
+
trainer.egg-info/SOURCES.txt
|
11 |
+
trainer.egg-info/dependency_links.txt
|
12 |
+
trainer.egg-info/requires.txt
|
13 |
+
trainer.egg-info/top_level.txt
|
python_package/trainer.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
1 |
+
|
python_package/trainer.egg-info/requires.txt
ADDED
@@ -0,0 +1,4 @@
1 |
+
transformers
|
2 |
+
datasets
|
3 |
+
tqdm
|
4 |
+
cloudml-hypertune
|
python_package/trainer.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
1 |
+
trainer
|
python_package/trainer/__init__.py
ADDED
File without changes
|
python_package/trainer/experiment.py
ADDED
@@ -0,0 +1,137 @@
1 |
+
# Copyright 2019 Google LLC
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import os
|
16 |
+
import numpy as np
|
17 |
+
import hypertune
|
18 |
+
|
19 |
+
from transformers import (
|
20 |
+
AutoTokenizer,
|
21 |
+
EvalPrediction,
|
22 |
+
Trainer,
|
23 |
+
TrainingArguments,
|
24 |
+
default_data_collator,
|
25 |
+
TrainerCallback
|
26 |
+
)
|
27 |
+
|
28 |
+
from trainer import model, metadata, utils
|
29 |
+
|
30 |
+
|
31 |
+
class HPTuneCallback(TrainerCallback):
|
32 |
+
"""
|
33 |
+
A custom callback class that reports a metric to hypertuner
|
34 |
+
at the end of each epoch.
|
35 |
+
"""
|
36 |
+
|
37 |
+
def __init__(self, metric_tag, metric_value):
|
38 |
+
super(HPTuneCallback, self).__init__()
|
39 |
+
self.metric_tag = metric_tag
|
40 |
+
self.metric_value = metric_value
|
41 |
+
self.hpt = hypertune.HyperTune()
|
42 |
+
|
43 |
+
def on_evaluate(self, args, state, control, **kwargs):
|
44 |
+
print(f"HP metric {self.metric_tag}={kwargs['metrics'][self.metric_value]}")
|
45 |
+
self.hpt.report_hyperparameter_tuning_metric(
|
46 |
+
hyperparameter_metric_tag=self.metric_tag,
|
47 |
+
metric_value=kwargs['metrics'][self.metric_value],
|
48 |
+
global_step=state.epoch)
|
49 |
+
|
50 |
+
|
51 |
+
def compute_metrics(p: EvalPrediction):
|
52 |
+
preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
|
53 |
+
preds = np.argmax(preds, axis=1)
|
54 |
+
return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}
|
55 |
+
|
56 |
+
|
57 |
+
def train(args, model, train_dataset, test_dataset):
|
58 |
+
"""Create the training loop to load pretrained model and tokenizer and
|
59 |
+
start the training process
|
60 |
+
|
61 |
+
Args:
|
62 |
+
args: read arguments from the runner to set training hyperparameters
|
63 |
+
model: The neural network that you are training
|
64 |
+
train_dataset: The training dataset
|
65 |
+
test_dataset: The test dataset for evaluation
|
66 |
+
"""
|
67 |
+
|
68 |
+
# initialize the tokenizer
|
69 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
70 |
+
metadata.PRETRAINED_MODEL_NAME,
|
71 |
+
use_fast=True,
|
72 |
+
)
|
73 |
+
|
74 |
+
# set training arguments
|
75 |
+
training_args = TrainingArguments(
|
76 |
+
evaluation_strategy="epoch",
|
77 |
+
learning_rate=args.learning_rate,
|
78 |
+
per_device_train_batch_size=args.batch_size,
|
79 |
+
per_device_eval_batch_size=args.batch_size,
|
80 |
+
num_train_epochs=args.num_epochs,
|
81 |
+
weight_decay=args.weight_decay,
|
82 |
+
output_dir=os.path.join("/tmp", args.model_name)
|
83 |
+
)
|
84 |
+
|
85 |
+
# initialize our Trainer
|
86 |
+
trainer = Trainer(
|
87 |
+
model,
|
88 |
+
training_args,
|
89 |
+
train_dataset=train_dataset,
|
90 |
+
eval_dataset=test_dataset,
|
91 |
+
data_collator=default_data_collator,
|
92 |
+
tokenizer=tokenizer,
|
93 |
+
compute_metrics=compute_metrics
|
94 |
+
)
|
95 |
+
|
96 |
+
# add hyperparameter tuning callback to report metrics when enabled
|
97 |
+
if args.hp_tune == "y":
|
98 |
+
trainer.add_callback(HPTuneCallback("accuracy", "eval_accuracy"))
|
99 |
+
|
100 |
+
# training
|
101 |
+
trainer.train()
|
102 |
+
|
103 |
+
return trainer
|
104 |
+
|
105 |
+
|
106 |
+
def run(args):
|
107 |
+
"""Load the data, train, evaluate, and export the model for serving and
|
108 |
+
evaluating.
|
109 |
+
|
110 |
+
Args:
|
111 |
+
args: experiment parameters.
|
112 |
+
"""
|
113 |
+
# Open our dataset
|
114 |
+
train_dataset, test_dataset = utils.load_data(args)
|
115 |
+
|
116 |
+
label_list = train_dataset.unique("label")
|
117 |
+
num_labels = len(label_list)
|
118 |
+
|
119 |
+
# Create the model, loss function, and optimizer
|
120 |
+
text_classifier = model.create(num_labels=num_labels)
|
121 |
+
|
122 |
+
# Train / Test the model
|
123 |
+
trainer = train(args, text_classifier, train_dataset, test_dataset)
|
124 |
+
|
125 |
+
metrics = trainer.evaluate(eval_dataset=test_dataset)
|
126 |
+
trainer.save_metrics("all", metrics)
|
127 |
+
|
128 |
+
# Export the trained model
|
129 |
+
trainer.save_model(os.path.join("/tmp", args.model_name))
|
130 |
+
|
131 |
+
# Save the model to GCS
|
132 |
+
if args.job_dir:
|
133 |
+
utils.save_model(args)
|
134 |
+
else:
|
135 |
+
print(f"Saved model files at {os.path.join('/tmp', args.model_name)}")
|
136 |
+
print(f"To save model files in GCS bucket, please specify job_dir starting with gs://")
|
137 |
+
|
python_package/trainer/metadata.py
ADDED
@@ -0,0 +1,31 @@
1 |
+
#!/usr/bin/env python
|
2 |
+
# Copyright 2019 Google LLC
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
|
16 |
+
# Task type can be either 'classification', 'regression', or 'custom'.
|
17 |
+
# This is based on the target feature in the dataset.
|
18 |
+
TASK_TYPE = 'classification'
|
19 |
+
|
20 |
+
# Dataset name
|
21 |
+
DATASET_NAME = 'imdb'
|
22 |
+
|
23 |
+
# pre-trained model name
|
24 |
+
PRETRAINED_MODEL_NAME = 'bert-base-cased'
|
25 |
+
|
26 |
+
# List of the class values (labels) in a classification dataset.
|
27 |
+
TARGET_LABELS = {1:1, 0:0, -1:0}
|
28 |
+
|
29 |
+
|
30 |
+
# maximum sequence length
|
31 |
+
MAX_SEQ_LENGTH = 128
|
python_package/trainer/model.py
ADDED
@@ -0,0 +1,31 @@
1 |
+
# Copyright 2019 Google LLC
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
from transformers import AutoModelForSequenceClassification
|
16 |
+
from trainer import metadata
|
17 |
+
|
18 |
+
def create(num_labels):
|
19 |
+
"""create the model by loading a pretrained model or define your
|
20 |
+
own
|
21 |
+
|
22 |
+
Args:
|
23 |
+
num_labels: number of target labels
|
24 |
+
"""
|
25 |
+
# Create the model, loss function, and optimizer
|
26 |
+
model = AutoModelForSequenceClassification.from_pretrained(
|
27 |
+
metadata.PRETRAINED_MODEL_NAME,
|
28 |
+
num_labels=num_labels
|
29 |
+
)
|
30 |
+
|
31 |
+
return model
|
python_package/trainer/task.py
ADDED
@@ -0,0 +1,104 @@
1 |
+
# Copyright 2019 Google LLC
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import argparse
|
16 |
+
import os
|
17 |
+
|
18 |
+
from trainer import experiment
|
19 |
+
|
20 |
+
|
21 |
+
def get_args():
|
22 |
+
"""Define the task arguments with the default values.
|
23 |
+
|
24 |
+
Returns:
|
25 |
+
experiment parameters
|
26 |
+
"""
|
27 |
+
args_parser = argparse.ArgumentParser()
|
28 |
+
|
29 |
+
|
30 |
+
# Experiment arguments
|
31 |
+
args_parser.add_argument(
|
32 |
+
'--batch-size',
|
33 |
+
help='Batch size for each training and evaluation step.',
|
34 |
+
type=int,
|
35 |
+
default=16)
|
36 |
+
args_parser.add_argument(
|
37 |
+
'--num-epochs',
|
38 |
+
help="""\
|
39 |
+
Maximum number of training data epochs on which to train.
|
40 |
+
If both --train-size and --num-epochs are specified,
|
41 |
+
--train-steps will be: (train-size/train-batch-size) * num-epochs.\
|
42 |
+
""",
|
43 |
+
default=1,
|
44 |
+
type=int,
|
45 |
+
)
|
46 |
+
args_parser.add_argument(
|
47 |
+
'--seed',
|
48 |
+
help='Random seed (default: 42)',
|
49 |
+
type=int,
|
50 |
+
default=42,
|
51 |
+
)
|
52 |
+
|
53 |
+
# Estimator arguments
|
54 |
+
args_parser.add_argument(
|
55 |
+
'--learning-rate',
|
56 |
+
help='Learning rate value for the optimizers.',
|
57 |
+
default=2e-5,
|
58 |
+
type=float)
|
59 |
+
args_parser.add_argument(
|
60 |
+
'--weight-decay',
|
61 |
+
help="""
|
62 |
+
The factor by which the learning rate should decay by the end of the
|
63 |
+
training.
|
64 |
+
|
65 |
+
decayed_learning_rate =
|
66 |
+
learning_rate * decay_rate ^ (global_step / decay_steps)
|
67 |
+
|
68 |
+
If set to 0 (default), then no decay will occur.
|
69 |
+
If set to 0.5, then the learning rate should reach 0.5 of its original
|
70 |
+
value at the end of the training.
|
71 |
+
Note that decay_steps is set to train_steps.
|
72 |
+
""",
|
73 |
+
default=0.01,
|
74 |
+
type=float)
|
75 |
+
|
76 |
+
# Enable hyperparameter tuning
|
77 |
+
args_parser.add_argument(
|
78 |
+
'--hp-tune',
|
79 |
+
default="n",
|
80 |
+
help='Enable hyperparameter tuning. Valid values are: "y" - enable, "n" - disable')
|
81 |
+
|
82 |
+
# Saved model arguments
|
83 |
+
args_parser.add_argument(
|
84 |
+
'--job-dir',
|
85 |
+
default=os.getenv('AIP_MODEL_DIR'),
|
86 |
+
help='GCS location to export models')
|
87 |
+
args_parser.add_argument(
|
88 |
+
'--model-name',
|
89 |
+
default="finetuned-bert-classifier",
|
90 |
+
help='The name of your saved model')
|
91 |
+
|
92 |
+
return args_parser.parse_args()
|
93 |
+
|
94 |
+
|
95 |
+
def main():
|
96 |
+
"""Setup / Start the experiment
|
97 |
+
"""
|
98 |
+
args = get_args()
|
99 |
+
print(args)
|
100 |
+
experiment.run(args)
|
101 |
+
|
102 |
+
|
103 |
+
if __name__ == '__main__':
|
104 |
+
main()
|
python_package/trainer/utils.py
ADDED
@@ -0,0 +1,99 @@
1 |
+
# Copyright 2019 Google LLC
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import os
|
16 |
+
import datetime
|
17 |
+
|
18 |
+
from google.cloud import storage
|
19 |
+
|
20 |
+
from transformers import AutoTokenizer
|
21 |
+
from datasets import load_dataset, load_metric, ReadInstruction
|
22 |
+
from trainer import metadata
|
23 |
+
|
24 |
+
|
25 |
+
def preprocess_function(examples):
|
26 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
27 |
+
metadata.PRETRAINED_MODEL_NAME,
|
28 |
+
use_fast=True,
|
29 |
+
)
|
30 |
+
|
31 |
+
# Tokenize the texts
|
32 |
+
tokenizer_args = (
|
33 |
+
(examples['text'],)
|
34 |
+
)
|
35 |
+
result = tokenizer(*tokenizer_args,
|
36 |
+
padding='max_length',
|
37 |
+
max_length=metadata.MAX_SEQ_LENGTH,
|
38 |
+
truncation=True)
|
39 |
+
|
40 |
+
# TEMP: We can extract this automatically but Unique method of the dataset
|
41 |
+
# is not reporting the label -1 which shows up in the pre-processing
|
42 |
+
# Hence the additional -1 term in the dictionary
|
43 |
+
label_to_id = metadata.TARGET_LABELS
|
44 |
+
|
45 |
+
# Map labels to IDs (not necessary for GLUE tasks)
|
46 |
+
if label_to_id is not None and "label" in examples:
|
47 |
+
result["label"] = [label_to_id[l] for l in examples["label"]]
|
48 |
+
|
49 |
+
return result
|
50 |
+
|
51 |
+
|
52 |
+
def load_data(args):
|
53 |
+
"""Loads the data into two different data loaders. (Train, Test)
|
54 |
+
|
55 |
+
Args:
|
56 |
+
args: arguments passed to the python script
|
57 |
+
"""
|
58 |
+
# Dataset loading repeated here to make this cell idempotent
|
59 |
+
# Since we are over-writing datasets variable
|
60 |
+
dataset = load_dataset(metadata.DATASET_NAME)
|
61 |
+
|
62 |
+
dataset = dataset.map(preprocess_function,
|
63 |
+
batched=True,
|
64 |
+
load_from_cache_file=True)
|
65 |
+
|
66 |
+
train_dataset, test_dataset = dataset["train"], dataset["test"]
|
67 |
+
|
68 |
+
return train_dataset, test_dataset
|
69 |
+
|
70 |
+
|
71 |
+
def save_model(args):
|
72 |
+
"""Saves the model to Google Cloud Storage or local file system
|
73 |
+
|
74 |
+
Args:
|
75 |
+
args: contains name for saved model.
|
76 |
+
"""
|
77 |
+
scheme = 'gs://'
|
78 |
+
if args.job_dir.startswith(scheme):
|
79 |
+
job_dir = args.job_dir.split("/")
|
80 |
+
bucket_name = job_dir[2]
|
81 |
+
object_prefix = "/".join(job_dir[3:]).rstrip("/")
|
82 |
+
|
83 |
+
if object_prefix:
|
84 |
+
model_path = '{}/{}'.format(object_prefix, args.model_name)
|
85 |
+
else:
|
86 |
+
model_path = '{}'.format(args.model_name)
|
87 |
+
|
88 |
+
bucket = storage.Client().bucket(bucket_name)
|
89 |
+
local_path = os.path.join("/tmp", args.model_name)
|
90 |
+
files = [f for f in os.listdir(local_path) if os.path.isfile(os.path.join(local_path, f))]
|
91 |
+
for file in files:
|
92 |
+
local_file = os.path.join(local_path, file)
|
93 |
+
blob = bucket.blob("/".join([model_path, file]))
|
94 |
+
blob.upload_from_filename(local_file)
|
95 |
+
print(f"Saved model files in gs://{bucket_name}/{model_path}")
|
96 |
+
else:
|
97 |
+
print(f"Saved model files at {os.path.join('/tmp', args.model_name)}")
|
98 |
+
print(f"To save model files in GCS bucket, please specify job_dir starting with gs://")
|
99 |
+
|
runs/Jan08_04-05-34_aift-review-classification-multiple-label/events.out.tfevents.1704686768.aift-review-classification-multiple-label
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:11bbe9bc3fd1a688e8527b9b8f2d874ffb7e68110df30c66f62ac9ad81315a29
|
3 |
+
size 8687
|
runs/Jan08_04-07-17_aift-review-classification-multiple-label/events.out.tfevents.1704686842.aift-review-classification-multiple-label
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4fc40e3f3954e05c2bc3f1492277fb35266ad0c86df183b923fb5d4b8e3340da
|
3 |
+
size 8687
|
runs/Jan08_04-07-17_aift-review-classification-multiple-label/events.out.tfevents.1704687081.aift-review-classification-multiple-label
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1f63a8177014d20b0ecfd65c4aa43af47460832c3c1acfa40cf70b5ba7475951
|
3 |
+
size 700
|
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,55 @@
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"100": {
|
12 |
+
"content": "[UNK]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"101": {
|
20 |
+
"content": "[CLS]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"102": {
|
28 |
+
"content": "[SEP]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"103": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"clean_up_tokenization_spaces": true,
|
45 |
+
"cls_token": "[CLS]",
|
46 |
+
"do_lower_case": true,
|
47 |
+
"mask_token": "[MASK]",
|
48 |
+
"model_max_length": 512,
|
49 |
+
"pad_token": "[PAD]",
|
50 |
+
"sep_token": "[SEP]",
|
51 |
+
"strip_accents": null,
|
52 |
+
"tokenize_chinese_chars": true,
|
53 |
+
"tokenizer_class": "DistilBertTokenizer",
|
54 |
+
"unk_token": "[UNK]"
|
55 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f6f3045d2e9afe29c006bc0a52f01855c1a37d7525699533b28f06d6396c303f
|
3 |
+
size 4347
|
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|