File size: 6,722 Bytes

0db33af

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install torch transformers scikit-learn wandb accelerate tqdm\n",
    "from IPython.display import clear_output\n",
    "clear_output(wait=True)\n",
    "print(\".\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!apt-get update\n",
    "!apt-get install zstd\n",
    "!tar --use-compress-program=unzstd -xvf bert_streamed_dataset.tar.zst\n",
    "clear_output(wait=True)\n",
    "print(\".\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tqdm import tqdm\n",
    "import wandb\n",
    "import json\n",
    "\n",
    "# Initialize W&B\n",
    "wandb.init(project=\"distilbert-ai-text-classification\")\n",
    "\n",
    "# Check if MPS is available and set the device\n",
    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
    "print(device)\n",
    "\n",
    "# Load pre-trained DistilBERT tokenizer and model\n",
    "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
    "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)\n",
    "model.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the JSONL dataset\n",
    "data = []\n",
    "total_num_of_lines = 0\n",
    "with open('bert_reddit_vs_synth_writing_prompts.jsonl', 'r') as infile:\n",
    "    for line in tqdm(infile, desc=\"Checking dataset size\"):\n",
    "        total_num_of_lines += 1\n",
    "\n",
    "with open('bert_reddit_vs_synth_writing_prompts.jsonl', 'r') as infile:\n",
    "    for line in tqdm(infile, desc=\"Loading dataset\", total=total_num_of_lines):\n",
    "        data.append(json.loads(line))\n",
    "\n",
    "# Extract texts and labels\n",
    "print(\"Extracting texts and labels\")\n",
    "texts = [entry['text'] for entry in data]\n",
    "labels = [entry['label'] for entry in data]\n",
    "\n",
    "# Tokenize the text\n",
    "print(\"Tokenizing text\")\n",
    "inputs = tokenizer(texts, padding=True, truncation=True, return_tensors=\"pt\")\n",
    "\n",
    "# Move input tensors to the device\n",
    "print(\"Moving input tensors\")\n",
    "inputs = {key: val for key, val in inputs.items()}\n",
    "\n",
    "# Split the data into training and validation sets\n",
    "print(\"Splitting data into train and validation\")\n",
    "train_inputs, val_inputs, train_labels, val_labels = train_test_split(\n",
    "    inputs['input_ids'], labels, test_size=0.2, random_state=42)\n",
    "\n",
    "train_attention_masks, val_attention_masks, _, _ = train_test_split(\n",
    "    inputs['attention_mask'], labels, test_size=0.2, random_state=42)\n",
    "\n",
    "# Create a PyTorch dataset\n",
    "class TextDataset(torch.utils.data.Dataset):\n",
    "    def __init__(self, input_ids, attention_masks, labels):\n",
    "        self.input_ids = input_ids\n",
    "        self.attention_masks = attention_masks\n",
    "        self.labels = labels\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.labels)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        return {\n",
    "            'input_ids': self.input_ids[idx],\n",
    "            'attention_mask': self.attention_masks[idx],\n",
    "            'labels': torch.tensor(self.labels[idx])\n",
    "        }\n",
    "\n",
    "print(\"Creating pytorch datasets\")\n",
    "train_dataset = TextDataset(train_inputs, train_attention_masks, train_labels)\n",
    "val_dataset = TextDataset(val_inputs, val_attention_masks, val_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reduce eval set to X examples to speed up training\n",
    "NUM_OF_EVAL_EXAMPLES = 1000\n",
    "val_inputs_subset = val_inputs[:NUM_OF_EVAL_EXAMPLES]\n",
    "val_attention_masks_subset = val_attention_masks[:NUM_OF_EVAL_EXAMPLES]\n",
    "val_labels_subset = val_labels[:NUM_OF_EVAL_EXAMPLES]\n",
    "\n",
    "# Create a TextDataset with only X examples\n",
    "val_dataset = Textdataset(val_inputs_subset, val_attention_masks_subset, val_labels_subset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the training arguments\n",
    "training_args = TrainingArguments(\n",
    "    output_dir='./distil-bert-train-results',          \n",
    "    num_train_epochs=3,              \n",
    "    per_device_train_batch_size=16,  \n",
    "    per_device_eval_batch_size=16,   \n",
    "    warmup_steps=500,                # number of warmup steps for learning rate scheduler\n",
    "    weight_decay=0.01,               \n",
    "    logging_dir='./logs',            \n",
    "    logging_steps=10,                \n",
    "    report_to=\"wandb\",                \n",
    "    evaluation_strategy=\"steps\",  # Evaluate every logging step\n",
    "    eval_steps=100,            # Evaluate every 10 steps\n",
    "    fp16=True,\n",
    ")\n",
    "\n",
    "# Create the Trainer\n",
    "trainer = Trainer(\n",
    "    model=model,                         # the instantiated 🤗 Transformers model to be trained\n",
    "    args=training_args,                  # training arguments, defined above\n",
    "    train_dataset=train_dataset,         # training dataset\n",
    "    eval_dataset=val_dataset             # evaluation dataset\n",
    ")\n",
    "\n",
    "# Train the model\n",
    "trainer.train()\n",
    "\n",
    "# Save the model\n",
    "model.save_pretrained('./distil-bert-train-final-result')\n",
    "\n",
    "# Finish the W&B run\n",
    "wandb.finish()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}