File size: 6,034 Bytes

af50acb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from transformers import BertTokenizer\n",
    "from pprint import pprint\n",
    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
      " 'input_ids': [101,\n",
      "               1109,\n",
      "               20164,\n",
      "               10932,\n",
      "               2271,\n",
      "               7954,\n",
      "               10176,\n",
      "               1110,\n",
      "               2385,\n",
      "               1107,\n",
      "               7926,\n",
      "               8588,\n",
      "               102],\n",
      " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}\n"
     ]
    }
   ],
   "source": [
    "pprint(tokenizer(\"The HuggingFace Course is quite intuitive\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('./artifacts/tokenizer_config.json',\n",
       " './artifacts/special_tokens_map.json',\n",
       " './artifacts/vocab.txt',\n",
       " './artifacts/added_tokens.json')"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.save_pretrained(save_directory=\"./artifacts/\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Breaking it down"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequence = \"The HuggingFace Course is quite intuitive\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['The', 'Hu', '##gging', '##F', '##ace', 'Course', 'is', 'quite', 'in', '##tu', '##itive']\n"
     ]
    }
   ],
   "source": [
    "tokens = tokenizer.tokenize(sequence)\n",
    "print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1109, 20164, 10932, 2271, 7954, 10176, 1110, 2385, 1107, 7926, 8588]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "token_ids = tokenizer.convert_tokens_to_ids(tokens)\n",
    "token_ids"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Try tokenization using tokenize method and the __call__ method of the tokenizer object and confirm the outputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[101, 146, 787, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119, 102]\n",
      "[CLS] I ’ ve been waiting for a HuggingFace course my whole life. [SEP]\n",
      "\n",
      "[146, 787, 1396, 1151, 2613, 1111, 170, 20164, 10932, 2271, 7954, 1736, 1139, 2006, 1297, 119]\n",
      "I ’ ve been waiting for a HuggingFace course my whole life.\n",
      "====================================================================================================\n",
      "[101, 146, 4819, 1142, 1177, 1277, 106, 102]\n",
      "[CLS] I hate this so much! [SEP]\n",
      "\n",
      "[146, 4819, 1142, 1177, 1277, 106]\n",
      "I hate this so much!\n",
      "====================================================================================================\n"
     ]
    }
   ],
   "source": [
    "sentences = [\"I’ve been waiting for a HuggingFace course my whole life.\", \"I hate this so much!\"]\n",
    "\n",
    "for sentence in sentences:\n",
    "    # 1: Perform tokenization using the default call method\n",
    "    token_ids = tokenizer(sentence)[\"input_ids\"]\n",
    "    print(token_ids)\n",
    "    print(tokenizer.decode(token_ids))\n",
    "    print()\n",
    "\n",
    "    # 2: First tokenize and then convert to ids\n",
    "    tokens = tokenizer.tokenize(sentence)\n",
    "    token_ids = tokenizer.convert_tokens_to_ids(tokens)\n",
    "    print(token_ids)\n",
    "    print(tokenizer.decode(token_ids))\n",
    "\n",
    "    print(\"=\"*100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'[CLS] [SEP]'"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode([101, 102])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The difference in the first and last token values is because of the introduction of special tokens which is proposed in the BERT paper otherwise all the tokens are exactly the same."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}