Upload 2 files
- convert_into_distilbert_dataset.py +120 -0
- fine-tune-distil-bert.ipynb +199 -0
convert_into_distilbert_dataset.py
ADDED
@@ -0,0 +1,120 @@
# The purpose of this file is to take the given texts and turn them into a
# DistilBERT classification dataset: AI-generated texts get the AI label and
# human-written texts get the human label,
# while optionally splitting every text word by word into cumulative prefixes
# to allow classifying a text before it has finished streaming.

# Example: "The dog walked over the pavement." will be turned into:
# The
# The dog
# The dog walked
# The dog walked over
# The dog walked over the
# The dog walked over the pavement
# The dog walked over the pavement.

# Example data row:
# {"text": "lorem ipsum...", "label": 0}

import re
import ujson as json
import random
from tqdm import tqdm

# Label convention: 1 means AI generated, 0 means human written
HUMAN_LABEL = 0
AI_LABEL = 1


def split_string(text):
    """Split a given text by spaces (punctuation splitting is currently disabled)"""
    # Split the text by spaces
    words = text.split()

    # For now we disabled further splitting because of issues
    # # Further split each word by punctuation using regex
    # split_words = []
    # for word in words:
    #     # Find all substrings that match the pattern: either a word or a punctuation mark
    #     split_words.extend(re.findall(r'\w+|[^\w\s]', word))

    return words


reddit_vs_synth_writing_prompts = []
with open("writing_prompts/reddit_vs_synth_writing_prompts.jsonl", "r") as f:
    for line in f.read().splitlines():
        loaded_object = json.loads(line)
        if "story_human" not in loaded_object:  # Skip entries where we don't have human data
            continue

        reddit_vs_synth_writing_prompts.append(loaded_object)

dataset_entries = []

SAVE_FILE_NAME = "bert_reddit_vs_synth_writing_prompts.jsonl"


def add_streamed_data(data, label):
    """Expand a text into cumulative word-by-word prefixes, all tagged with the given label."""
    entries = []
    data_parts = split_string(data)

    for i in range(len(data_parts)):
        streamed_so_far = " ".join(data_parts[:i + 1])  # Since python slicing is exclusive toward the end
        entries.append({"text": streamed_so_far, "label": label})

    return entries


# Truncate the output file so repeated runs start from scratch
with open(SAVE_FILE_NAME, "w") as f:
    f.write("")

NUM_OF_TURNS_TO_DUMP = 200
i = 0
for data in tqdm(reddit_vs_synth_writing_prompts):
    i += 1

    # Write the dataset out part by part, every NUM_OF_TURNS_TO_DUMP entries
    if i == NUM_OF_TURNS_TO_DUMP:
        i = 0
        dumped_entries = []
        for entry in dataset_entries:
            dumped_entries.append(json.dumps(entry))

        dumped_string = "\n".join(dumped_entries) + "\n"

        with open(SAVE_FILE_NAME, "a") as f:
            f.write(dumped_string)

        dataset_entries = []

    if False:  # Streaming (prefix expansion) is currently disabled
        # Add streamed data
        human_entries = add_streamed_data(data["story_human"], HUMAN_LABEL)
        dataset_entries.extend(human_entries)

        ai_data = []
        if data.get("story_opus"):
            ai_data.extend(add_streamed_data(data["story_opus"], AI_LABEL))
        if data.get("story_gpt_3_5"):
            ai_data.extend(add_streamed_data(data["story_gpt_3_5"], AI_LABEL))

        dataset_entries.extend(ai_data)

    else:
        # Add without streaming
        dataset_entries.append({"text": data["story_human"], "label": HUMAN_LABEL})

        if data.get("story_opus"):
            dataset_entries.append({"text": data["story_opus"], "label": AI_LABEL})
        if data.get("story_gpt_3_5"):
            dataset_entries.append({"text": data["story_gpt_3_5"], "label": AI_LABEL})

# Dump whatever is left as JSONL
dumped_entries = []
for entry in dataset_entries:
    dumped_entries.append(json.dumps(entry))

dumped_string = "\n".join(dumped_entries) + "\n"

with open(SAVE_FILE_NAME, "a") as f:
    f.write(dumped_string)
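The fine-tuning notebook below unpacks bert_streamed_dataset.tar.zst and then reads bert_reddit_vs_synth_writing_prompts.jsonl, so the JSONL produced by this script is presumably what gets packed into that archive. Before packing it, a quick sanity check of the file can help; the snippet below is a minimal sketch (not part of the uploaded files) that only assumes the output file name used above and the 0 = human / 1 = AI label convention.

# Minimal sanity-check sketch for bert_reddit_vs_synth_writing_prompts.jsonl
# (assumed output of the script above; not part of the upload).
import json
from collections import Counter

label_counts = Counter()
with open("bert_reddit_vs_synth_writing_prompts.jsonl", "r") as f:
    for line_number, line in enumerate(f, start=1):
        if not line.strip():
            continue  # tolerate blank lines left by partial dumps
        row = json.loads(line)
        label_counts[row["label"]] += 1
        if line_number <= 3:
            print(row["label"], row["text"][:60])  # peek at the first few rows

print("label distribution:", dict(label_counts))  # expect 0 = human, 1 = AI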
fine-tune-distil-bert.ipynb
ADDED
@@ -0,0 +1,199 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install torch transformers scikit-learn wandb accelerate tqdm\n",
    "from IPython.display import clear_output\n",
    "clear_output(wait=True)\n",
    "print(\".\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!apt-get update\n",
    "!apt-get install zstd\n",
    "!tar --use-compress-program=unzstd -xvf bert_streamed_dataset.tar.zst\n",
    "clear_output(wait=True)\n",
    "print(\".\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tqdm import tqdm\n",
    "import wandb\n",
    "import json\n",
    "\n",
    "# Initialize W&B\n",
    "wandb.init(project=\"distilbert-ai-text-classification\")\n",
    "\n",
    "# Use CUDA if available, otherwise fall back to CPU\n",
    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
    "print(device)\n",
    "\n",
    "# Load pre-trained DistilBERT tokenizer and model\n",
    "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
    "model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)\n",
    "model.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the JSONL dataset\n",
    "data = []\n",
    "total_num_of_lines = 0\n",
    "with open('bert_reddit_vs_synth_writing_prompts.jsonl', 'r') as infile:\n",
    "    for line in tqdm(infile, desc=\"Checking dataset size\"):\n",
    "        total_num_of_lines += 1\n",
    "\n",
    "with open('bert_reddit_vs_synth_writing_prompts.jsonl', 'r') as infile:\n",
    "    for line in tqdm(infile, desc=\"Loading dataset\", total=total_num_of_lines):\n",
    "        data.append(json.loads(line))\n",
    "\n",
    "# Extract texts and labels\n",
    "print(\"Extracting texts and labels\")\n",
    "texts = [entry['text'] for entry in data]\n",
    "labels = [entry['label'] for entry in data]\n",
    "\n",
    "# Tokenize the text\n",
    "print(\"Tokenizing text\")\n",
    "inputs = tokenizer(texts, padding=True, truncation=True, return_tensors=\"pt\")\n",
    "\n",
    "# Keep the tokenized tensors on CPU; the Trainer moves each batch to the device\n",
    "print(\"Preparing input tensors\")\n",
    "inputs = {key: val for key, val in inputs.items()}\n",
    "\n",
    "# Split the data into training and validation sets\n",
    "# (the same test_size and random_state keep input_ids and attention_mask rows aligned)\n",
    "print(\"Splitting data into train and validation\")\n",
    "train_inputs, val_inputs, train_labels, val_labels = train_test_split(\n",
    "    inputs['input_ids'], labels, test_size=0.2, random_state=42)\n",
    "\n",
    "train_attention_masks, val_attention_masks, _, _ = train_test_split(\n",
    "    inputs['attention_mask'], labels, test_size=0.2, random_state=42)\n",
    "\n",
    "# Create a PyTorch dataset\n",
    "class TextDataset(torch.utils.data.Dataset):\n",
    "    def __init__(self, input_ids, attention_masks, labels):\n",
    "        self.input_ids = input_ids\n",
    "        self.attention_masks = attention_masks\n",
    "        self.labels = labels\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.labels)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        return {\n",
    "            'input_ids': self.input_ids[idx],\n",
    "            'attention_mask': self.attention_masks[idx],\n",
    "            'labels': torch.tensor(self.labels[idx])\n",
    "        }\n",
    "\n",
    "print(\"Creating pytorch datasets\")\n",
    "train_dataset = TextDataset(train_inputs, train_attention_masks, train_labels)\n",
    "val_dataset = TextDataset(val_inputs, val_attention_masks, val_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reduce the eval set to X examples to speed up training\n",
    "NUM_OF_EVAL_EXAMPLES = 1000\n",
    "val_inputs_subset = val_inputs[:NUM_OF_EVAL_EXAMPLES]\n",
    "val_attention_masks_subset = val_attention_masks[:NUM_OF_EVAL_EXAMPLES]\n",
    "val_labels_subset = val_labels[:NUM_OF_EVAL_EXAMPLES]\n",
    "\n",
    "# Create a TextDataset with only X examples\n",
    "val_dataset = TextDataset(val_inputs_subset, val_attention_masks_subset, val_labels_subset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the training arguments\n",
    "training_args = TrainingArguments(\n",
    "    output_dir='./distil-bert-train-results',\n",
    "    num_train_epochs=3,\n",
    "    per_device_train_batch_size=16,\n",
    "    per_device_eval_batch_size=16,\n",
    "    warmup_steps=500,                # number of warmup steps for the learning rate scheduler\n",
    "    weight_decay=0.01,\n",
    "    logging_dir='./logs',\n",
    "    logging_steps=10,\n",
    "    report_to=\"wandb\",\n",
    "    evaluation_strategy=\"steps\",     # Evaluate every eval_steps steps\n",
    "    eval_steps=100,                  # Evaluate every 100 steps\n",
    "    fp16=True,\n",
    ")\n",
    "\n",
    "# Create the Trainer\n",
    "trainer = Trainer(\n",
    "    model=model,                     # the instantiated 🤗 Transformers model to be trained\n",
    "    args=training_args,              # training arguments, defined above\n",
    "    train_dataset=train_dataset,     # training dataset\n",
    "    eval_dataset=val_dataset         # evaluation dataset (the reduced subset from above)\n",
    ")\n",
    "\n",
    "# Train the model\n",
    "trainer.train()\n",
    "\n",
    "# Save the model\n",
    "model.save_pretrained('./distil-bert-train-final-result')\n",
    "\n",
    "# Finish the W&B run\n",
    "wandb.finish()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
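For completeness, here is a minimal inference sketch (not part of the upload) showing how the checkpoint saved to './distil-bert-train-final-result' above could be loaded for classification. It assumes the same 'distilbert-base-uncased' tokenizer used for training and the 0 = human / 1 = AI label convention from the dataset script.

# Hypothetical usage sketch: load the fine-tuned checkpoint and classify a single text.
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('./distil-bert-train-final-result')
model.eval()

text = "The dog walked over the pavement."
inputs = tokenizer(text, return_tensors="pt", truncation=True)
with torch.no_grad():
    logits = model(**inputs).logits
probs = torch.softmax(logits, dim=-1)[0]
print({"human": probs[0].item(), "ai": probs[1].item()})  # label 0 = human, label 1 = AI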