{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Using the pipeline function" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).\n", "Using a pipeline without specifying a model name and revision in production is not recommended.\n", "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "from transformers import pipeline\n", "\n", "classifier = pipeline(task=\"sentiment-analysis\")\n", "\n", "inputs = [\"This was so bad I couldnĀ“t finish it. The actresses are so bad at acting it feels like a bad comedy from minute one. The high rated reviews is obviously from friend/family and is pure BS.\",\n", " \"I thought the cast was great. Brianna and Emma were exceptionaly talented in thier characters. 
Fun film.\"]\n", "\n", "outputs = classifier(inputs)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'label': 'NEGATIVE', 'score': 0.9995231628417969},\n", " {'label': 'POSITIVE', 'score': 0.9998352527618408}]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outputs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Defining tokenizer and model manually" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokenizer" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = \"distilbert/distilbert-base-uncased-finetuned-sst-2-english\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "from pprint import pprint\n", "tokenized_inputs = tokenizer(\n", " inputs, padding=True, truncation=True, return_tensors=\"pt\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([ 101, 2023, 2001, 2061, 2919, 1045, 2481, 29658, 2102, 3926,\n", " 2009, 1012, 1996, 19910, 2024, 2061, 2919, 2012, 3772, 2009,\n", " 5683, 2066, 1037, 2919, 4038, 2013, 3371, 2028, 1012, 1996,\n", " 2152, 6758, 4391, 2003, 5525, 2013, 2767, 1013, 2155, 1998,\n", " 2003, 5760, 18667, 1012, 102])\n", "tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1])\n" ] } ], "source": [ "print(tokenized_inputs[\"input_ids\"][0], tokenized_inputs[\"attention_mask\"][0], sep = \"\\n\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([ 101, 1045, 2245, 1996, 3459, 2001, 2307, 1012, 25558, 1998,\n", " 5616, 2020, 11813, 2100, 10904, 1999, 16215, 3771, 3494, 1012,\n", " 4569, 2143, 1012, 102, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0])\n", "tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])\n" ] } ], "source": [ "print(tokenized_inputs[\"input_ids\"][1], tokenized_inputs[\"attention_mask\"][1], sep = \"\\n\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(45, 45)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(tokenized_inputs[\"input_ids\"][0]), len(tokenized_inputs[\"input_ids\"][1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "from transformers import AutoModelForSequenceClassification\n", "import torch\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n", "model.eval();" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "with torch.no_grad():\n", " outputs = model(**tokenized_inputs)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['__annotations__', '__class__', '__class_getitem__', '__contains__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__or__', '__post_init__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'attentions', 'clear', 'copy', 'fromkeys', 'get', 'hidden_states', 'items', 'keys', 'logits', 'loss', 'move_to_end', 'pop', 'popitem', 'setdefault', 'to_tuple', 'update', 'values']\n" ] } ], "source": [ "print(dir(outputs))" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 4.2415, -3.4063],\n", " [-4.1783, 4.5328]])" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outputs.logits" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[9.9952e-01, 4.7686e-04],\n", " [1.6471e-04, 9.9984e-01]])" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch.nn.functional as F\n", "F.softmax(outputs.logits, dim 
= -1)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "predictions = outputs.logits.argmax(dim = -1)\n", "pred_probas = F.softmax(outputs.logits, dim = -1).max(dim = -1).values\n", "\n", "preds = []\n", "for p, pp in zip(predictions, pred_probas):\n", " preds.append({'label': model.config.id2label[p.item()], 'score': pp.item()})" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'label': 'NEGATIVE', 'score': 0.9995231628417969},\n", " {'label': 'POSITIVE', 'score': 0.9998352527618408}]" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preds" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "\n", "Reference Output\n", "\n", "---\n", "\n", "[{'label': 'NEGATIVE', 'score': 0.9995231628417969},\n", " {'label': 'POSITIVE', 'score': 0.9998352527618408}]\n", "```" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }