{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Using the pipeline function" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).\n", "Using a pipeline without specifying a model name and revision in production is not recommended.\n", "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "from transformers import pipeline\n", "\n", "classifier = pipeline(task=\"sentiment-analysis\")\n", "\n", "inputs = [\"This was so bad I couldnĀ“t finish it. The actresses are so bad at acting it feels like a bad comedy from minute one. The high rated reviews is obviously from friend/family and is pure BS.\",\n", " \"I thought the cast was great. Brianna and Emma were exceptionaly talented in thier characters. 
Fun film.\"]\n", "\n", "outputs = classifier(inputs)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'label': 'NEGATIVE', 'score': 0.9995231628417969},\n", " {'label': 'POSITIVE', 'score': 0.9998352527618408}]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outputs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Defining tokenizer and model manually" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokenizer" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = \"distilbert/distilbert-base-uncased-finetuned-sst-2-english\"\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "from pprint import pprint\n", "tokenized_inputs = tokenizer(\n", " inputs, padding=True, truncation=True, return_tensors=\"pt\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([ 101, 2023, 2001, 2061, 2919, 1045, 2481, 29658, 2102, 3926,\n", " 2009, 1012, 1996, 19910, 2024, 2061, 2919, 2012, 3772, 2009,\n", " 5683, 2066, 1037, 2919, 4038, 2013, 3371, 2028, 1012, 1996,\n", " 2152, 6758, 4391, 2003, 5525, 2013, 2767, 1013, 2155, 1998,\n", " 2003, 5760, 18667, 1012, 102])\n", "tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1])\n" ] } ], "source": [ "print(tokenized_inputs[\"input_ids\"][0], tokenized_inputs[\"attention_mask\"][0], sep = \"\\n\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([ 101, 1045, 2245, 1996, 3459, 2001, 2307, 1012, 25558, 1998,\n", " 5616, 2020, 11813, 2100, 10904, 1999, 16215, 3771, 3494, 1012,\n", " 4569, 2143, 1012, 102, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0])\n", "tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])\n" ] } ], "source": [ "print(tokenized_inputs[\"input_ids\"][1], tokenized_inputs[\"attention_mask\"][1], sep = \"\\n\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(45, 45)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(tokenized_inputs[\"input_ids\"][0]), len(tokenized_inputs[\"input_ids\"][1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/huggingface/lib/python3.10/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "from transformers import AutoModelForSequenceClassification\n", "import torch\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n", "model.eval();" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "with torch.no_grad():\n", " outputs = model(**tokenized_inputs)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['__annotations__', '__class__', '__class_getitem__', '__contains__', '__dataclass_fields__', '__dataclass_params__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__ior__', '__iter__', '__le__', '__len__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__or__', '__post_init__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__ror__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'attentions', 'clear', 'copy', 'fromkeys', 'get', 'hidden_states', 'items', 'keys', 'logits', 'loss', 'move_to_end', 'pop', 'popitem', 'setdefault', 'to_tuple', 'update', 'values']\n" ] } ], "source": [ "print(dir(outputs))" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 4.2415, -3.4063],\n", " [-4.1783, 4.5328]])" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outputs.logits" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[9.9952e-01, 4.7686e-04],\n", " [1.6471e-04, 9.9984e-01]])" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch.nn.functional as F\n", "F.softmax(outputs.logits, dim 
= -1)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "predictions = outputs.logits.argmax(dim = -1)\n", "pred_probas = F.softmax(outputs.logits, dim = -1).max(dim = -1).values\n", "\n", "preds = []\n", "for p, pp in zip(predictions, pred_probas):\n", " preds.append({'label': model.config.id2label[p.item()], 'score': pp.item()})" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'label': 'NEGATIVE', 'score': 0.9995231628417969},\n", " {'label': 'POSITIVE', 'score': 0.9998352527618408}]" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preds" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "\n", "Reference Output\n", "\n", "---\n", "\n", "[{'label': 'NEGATIVE', 'score': 0.9995231628417969},\n", " {'label': 'POSITIVE', 'score': 0.9998352527618408}]\n", "```" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 2 }