{ "cells": [ { "cell_type": "markdown", "source": [ "# Install Necessary Packages" ], "metadata": { "id": "GUB8N3k9fq-E" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zt59bSq5vcnA" }, "outputs": [], "source": [ "#Necessary installations\n", "!pip install datasets evaluate transformers[sentencepiece]\n", "!pip install huggingface_hub\n", "!pip install pandas\n", "!pip install imblearn\n", "!pip install torch" ] }, { "cell_type": "markdown", "source": [ "# Load the Dataset" ], "metadata": { "id": "9lyEyWBic5RN" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QJDszQKe6oxK" }, "outputs": [], "source": [ "from datasets import Features, Value, ClassLabel\n", "import pandas as pd\n", "\n", "from datasets import load_dataset\n", "dataset = load_dataset(\"19kmunz/iot-23-preprocessed-minimumcolumns\")\n", "print(dataset.shape)" ] }, { "cell_type": "markdown", "metadata": { "id": "wRjakUpXD3D9" }, "source": [ "# Oversample the Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wzU5AHGxD2Ut" }, "outputs": [], "source": [ "from imblearn.over_sampling import SMOTE\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mT027c7R1t7n" }, "outputs": [], "source": [ "df = dataset['train'].to_pandas()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "v2l9xGpr6bZc" }, "outputs": [], "source": [ "# Separate features and target\n", "features = ['id.resp_p', 'proto', 'conn_state', 'orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']\n", "X = df[features]\n", "y = df['label']" ] }, { "cell_type": "markdown", "metadata": { "id": "SlFbgG_69B1K" }, "source": [ "ADASYN and SMOTE oversampling algorithm expects numeric data, but features like proto is non-numeric categorical column. SMOTE cannot handle the string values like 'tcp' in those columns. So, I applied one hot encoding to categorical columns and then applied SMOTE" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8zSNEGIiWjMZ" }, "outputs": [], "source": [ "#########################################NEWWWW#############################################\n", "# Define categorical columns to be label-encoded\n", "cat_cols = ['proto', 'conn_state']\n", "\n", "# Initialize a dictionary to store label encoders for each column\n", "label_encoders = {}\n", "label_encoded_columns = {} # Store label-encoded columns\n", "\n", "for col in cat_cols:\n", " le = LabelEncoder()\n", " label_encoded = le.fit_transform(df[col])\n", " df[col + '_label'] = label_encoded # Create new columns with label-encoded data\n", " label_encoders[col] = le\n", " label_encoded_columns[col] = label_encoded\n", "# Get numeric columns\n", "num_cols = ['id.resp_p','orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']\n", "\n", "# Extract numeric columns\n", "X_num = df[num_cols]\n", "\n", "# Concatenate label-encoded columns and numeric columns\n", "X_combined = pd.concat([df[['proto_label', 'conn_state_label']], X_num], axis=1)\n", "\n", "# Store the labels in y_os\n", "y_os = df['label']\n", "y_os1 = df['label'].apply(lambda x: 0 if x == \"Benign\" else 1)\n", "\n", "# Specify desired number of samples\n", "#k_neighbors = 10000 - y_os.shape[0]\n", "\n", "# Perform oversampling using SMOTE\n", "smote = SMOTE(sampling_strategy={0: 5000, 1: 5000})\n", "X_combined_os, Y_combined_os = smote.fit_resample(X_combined, y_os1)" ] }, { "cell_type": "code", "source": [ "# Print new class counts\n", "print(Y_combined_os.value_counts())\n", "print(X_combined_os.shape)" ], "metadata": { "id": "mZ1iMnEIkVAj" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Split the Dataset" ], "metadata": { "id": "oO9g2nhlbr3o" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "OzJI6451n4tE" }, "outputs": [], "source": [ "# Manually define the column names\n", "column_names = ['proto_label', 'conn_state_label', 'id.resp_p','orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']\n", "result_column = ['label']\n", "\n", "# Create a new DataFrame with the oversampled data and specified column names\n", "X_combined_os_df = pd.DataFrame(X_combined_os, columns=column_names)\n", "Y_combined_os_df = pd.DataFrame(Y_combined_os, columns=result_column)\n", "\n", "# Print the first 5 rows of the oversampled data\n", "print(X_combined_os_df.shape)\n", "print(X_combined_os_df.head())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YwnVJ7RqKFRD" }, "outputs": [], "source": [ "# Split oversampled data\n", "\n", "# Initial split into train and temp test sets\n", "X_train, X_temp, y_train, y_temp = train_test_split(X_combined_os_df, Y_combined_os_df, test_size=0.2, random_state=42)\n", "\n", "# Split oversampled data\n", "X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)\n", "\n", "print(\"Oversampled dataset shape:\", X_combined_os.shape)\n", "print(\"X_train shape:\", X_train.shape)\n", "print(\"X_test shape:\", X_test.shape)\n", "print(\"X_val shape:\", X_val.shape)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "WHobtry9LI_d" }, "source": [ "# Tokenize the Dataset" ] }, { "cell_type": "markdown", "source": [ "### Run one of the following cell if loading from local. Otherwise x_train and y_train are already defined." ], "metadata": { "id": "3UMlgohccAPg" } }, { "cell_type": "code", "source": [ "import pandas as pd\n", "X_train = pd.read_csv('X_train.csv', index_col=0)\n", "y_train = pd.read_csv('y_train.csv', index_col=0)" ], "metadata": { "id": "z2BM318ufee_" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "train_encodings = torch.load('train_encodings.pt')\n", "val_encodings = torch.load('val_encodings.pt')\n", "test_encodings = torch.load('test_encodings.pt')" ], "metadata": { "id": "sVTr9fxIMZl9" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Otherwise, Continue running here" ], "metadata": { "id": "j0FyKqdMezWv" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "U09fvCzaMn2P" }, "outputs": [], "source": [ "# Dictionary of feature names to use in the make sentence function\n", "feature_names = {'id.resp_p':'response port',\n", " 'proto_label':'transport protocol',\n", " 'orig_pkts':'number of packets sent by the origin',\n", " 'conn_state_label':'connection state',\n", " 'orig_ip_bytes':'number of IP level bytes sent by the originator',\n", " 'resp_ip_bytes':'number of IP level bytes sent by the responder'}\n", "\n", "# Function to make sentences out of the data\n", "def make_sentence(row):\n", " sentences = {}\n", " for feature in row.keys():\n", " if feature != 'label':\n", " sentences[feature] = feature_names[feature] + \" is \" + str(row[feature]) + \".\"\n", " return sentences" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Fe_vj8hO9dNw" }, "outputs": [], "source": [ "# Take all sentence observations and make them into paragraph inputs\n", "def make_paragraphs(ser):\n", " paragraphs_list = []\n", " for index,obs in ser.items():\n", " new_para = obs['id.resp_p'] + \" \" + obs['proto_label'] + \" \" + obs['conn_state_label'] + \" \" + obs['orig_pkts'] + \" \" + obs['orig_ip_bytes'] + \" \" + obs['resp_ip_bytes']\n", " paragraphs_list.append(new_para)\n", " return pd.Series(paragraphs_list, index=ser.index)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "bNyv9zOlGaBm" }, "outputs": [], "source": [ "from transformers import BertTokenizer\n", "tokenizer = BertTokenizer.from_pretrained(\"bert-base-cased\")\n", "\n", "\n", "# Transform the dataset into sentences\n", "X_train_sentences = X_train.apply(make_sentence, axis=1)\n", "X_val_sentences = X_val.apply(make_sentence, axis=1)\n", "X_test_sentences = X_test.apply(make_sentence, axis=1)\n", "\n", "# Transform the sentences into paragraphs\n", "X_train_paragraphs = make_paragraphs(X_train_sentences)\n", "X_val_paragraphs = make_paragraphs(X_val_sentences)\n", "X_test_paragraphs = make_paragraphs(X_test_sentences)\n", "\n", "# Turn labels into lists of strings\n", "y_train_str = [str(y) for y in y_train['label'].tolist()]\n", "y_val_str = [str(y) for y in y_val['label'].tolist()]\n", "y_test_str = [str(y) for y in y_test['label'].tolist()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "f5bT1RIEW0O7" }, "outputs": [], "source": [ "import torch\n", "# Encode both paragraphs and the labels\n", "train_encodings = tokenizer(text=X_train_paragraphs.tolist(), padding='longest', truncation=True, return_tensors='pt')\n", "val_encodings = tokenizer(text=X_val_paragraphs.tolist(), padding='longest', truncation=True, return_tensors='pt')\n", "test_encodings = tokenizer(text=X_test_paragraphs.tolist(), padding='longest', truncation=True, return_tensors='pt')\n", "\n", "# Add label tensors\n", "y_train_tensor = torch.tensor(y_train['label'].values)\n", "y_val_tensor = torch.tensor(y_val['label'].values)\n", "y_test_tensor = torch.tensor(y_test['label'].values)\n", "\n", "train_encodings['labels'] = y_train_tensor\n", "val_encodings['labels'] = y_val_tensor\n", "test_encodings['labels'] = y_test_tensor" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "OV600RIVGlTi" }, "outputs": [], "source": [ "torch.save(train_encodings, 'train_encodings.pt')\n", "torch.save(val_encodings, 'val_encodings.pt')\n", "torch.save(test_encodings, 'test_encodings.pt')" ] }, { "cell_type": "markdown", "source": [ "# Finally, prepare dataset as Hugging Face Dataset" ], "metadata": { "id": "gev2VE5VcnaY" } }, { "cell_type": "markdown", "metadata": { "id": "ZNmaJOCUifpD" }, "source": [ "### Optional: Load training, validation, and test encodings in from Drive or local" ] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "metadata": { "id": "7NlSBStpD_rO" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!pip install torch==2.1.0\n", "!pip install -U transformers[torch]\n", "!pip install optimum[exporters]" ], "metadata": { "id": "okamUGSAmBYN" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import torch\n", "from transformers import BertTokenizer\n", "# Load tensor data back from drive\n", "train_encodings = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/train_encodings.pt\")\n", "val_encodings = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/val_encodings.pt\")\n", "test_encodings = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/test_encodings.pt\")\n", "\n", "# Load labels tensors back from drive\n", "# y_train_tensor = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/y_train_tensor.pt\")\n", "# y_val_tensor = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/y_val_tensor.pt\")\n", "# y_test_tensor = torch.load(\"/content/drive/MyDrive/CS513 Final Project/Resources/y_test_tensor.pt\")" ], "metadata": { "id": "rVEX0OhgEAJT" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# FROM LOCAL\n", "import torch\n", "train_encodings = torch.load(\"train_encodings.pt\")\n", "val_encodings = torch.load(\"val_encodings.pt\")\n", "test_encodings = torch.load(\"test_encodings.pt\")" ], "metadata": { "id": "Jxbp-oouNHsT" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print(train_encodings['input_ids'].size())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "YY7xwbuZlhK4", "outputId": "3faf0705-93f8-456e-8dbf-22b406314766" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "torch.Size([8000, 67])\n" ] } ] }, { "cell_type": "markdown", "source": [ "### Otherwise, continue running here" ], "metadata": { "id": "dlTL3uj1fKF6" } }, { "cell_type": "code", "source": [ "# Creating small datasets to test finetuning\n", "train = train_encodings\n", "eval = val_encodings\n", "test = test_encodings\n", "\n", "# Creating small datasets to test finetuning (delete :1000 for full dataset)\n", "#train = train_encodings[:1000]\n", "#eval = val_encodings[:1000]\n", "#test = test_encodings[:1000]\n", "\n", "# Replacing target tensors (delete :128 for full label tensors)\n", "# train['labels'] = y_train_tensor[:1000]\n", "# eval['labels'] = y_val_tensor[:1000]\n", "# test['labels'] = y_test_tensor[:1000]\n", "\n", "# Pytorch tensors to HF Dataset\n", "from datasets import Dataset\n", "train_dataset = Dataset.from_dict(train)\n", "eval_dataset = Dataset.from_dict(eval)\n", "test_dataset = Dataset.from_dict(test)" ], "metadata": { "id": "llZN2akWHxe5" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "SRLNGcQFvJAa" }, "source": [ "# Fine-tune BERT for benign vs malicious" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "xf2CGlW1dLlH" }, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup\n", "from transformers import Trainer, TrainingArguments\n", "from torch.utils.data import DataLoader, TensorDataset, random_split\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.utils.class_weight import compute_class_weight" ] }, { "cell_type": "code", "source": [ "import numpy as np\n", "import evaluate\n", "\n", "combined_metrics = evaluate.combine([\"accuracy\", \"f1\"])" ], "metadata": { "id": "maPzffCsAS__" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def compute_metrics(eval_pred):\n", " logits, labels = eval_pred\n", " predictions = np.argmax(logits, axis=-1)\n", " results = combined_metrics.compute(predictions=predictions, references=labels)\n", " print(f\"Accuracy: {results['accuracy']:.3f}% | F1: {results['f1']:.3f}\")\n", " return results" ], "metadata": { "id": "Subi5OZxAvlh" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Load pretrained BERT model\n", "model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)\n", "\n", "# OR Load local model\n", "# model = BertForSequenceClassification.from_pretrained('./model', num_labels=2)" ], "metadata": { "id": "OWLPaQn9ysMg" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Define TrainingArguments\n", "training_args = TrainingArguments(\n", " output_dir='./results',\n", " num_train_epochs=6,\n", " per_device_train_batch_size=32,\n", " # per_device_eval_batch_size=16,\n", " warmup_steps=500,\n", " weight_decay=0.01,\n", " logging_dir='./logs',\n", " # logging_steps=0.10,\n", " eval_steps=0.10,\n", " save_steps=0.10,\n", " logging_strategy='epoch',\n", " evaluation_strategy='epoch',\n", " save_strategy='epoch',\n", " save_total_limit=2,\n", " load_best_model_at_end=True\n", ")\n", "\n", "# Create Trainer instance\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", " eval_dataset=eval_dataset,\n", " compute_metrics=compute_metrics\n", ")\n", "\n", "# Train\n", "trainer.train()" ], "metadata": { "id": "7a-zvoP0j8C8" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "print(test_dataset)" ], "metadata": { "id": "SzDiVYRf23dp" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "TlxpJByQXL_w" }, "outputs": [], "source": [ "# Use test_dataset instead to test it later\n", "trainer.evaluate(eval_dataset=test_dataset)" ] }, { "cell_type": "code", "source": [ "model.save_pretrained('./model')" ], "metadata": { "id": "dqMkv8aA5Tdk" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "# Save to Hugging Face" ], "metadata": { "id": "-qKGOqJTWt3a" } }, { "cell_type": "code", "source": [ "from huggingface_hub import create_repo" ], "metadata": { "id": "m0mCacsshEhy" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!pip install cupy --upgrade" ], "metadata": { "id": "Ba-kOs8WqQTl" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "libcuda.so.1" ], "metadata": { "id": "AK2rcA5-qyGh" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!pip install onnxruntime\n", "import onnxruntime as rt\n", "import onnx\n", "import cv2" ], "metadata": { "id": "8z2pir6uo-vM" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!optimum-cli export onnx --model ./ --task question-answering ./results/checkpoint-10" ], "metadata": { "id": "WlderhErraWX" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from onnxruntime import ORTModelForSequenceClassification\n", "\n", "ort_model = ORTModelForSequenceClassification.from_pretrained(model, export=True)\n", "\n", "ort_model.save_pretrained(\"./results/checkpoint-10\")" ], "metadata": { "id": "NzPr5eIkZfi7" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# Export model\n", "import torch\n", "# Get input ids\n", "input_ids = train_dataset['input_ids']\n", "# Convert to torch tensor\n", "input_ids = torch.tensor(input_ids)\n", "\n", "torch.onnx.export(model, # Model being run\n", " input_ids, # Model input\n", " \"IoT23_Log_Prediction.onnx\",# Where to save the model\n", " export_params=True, # Store model parameters\n", " output_names=['labels'],\n", " opset_version=11, # ONNX version\n", " do_constant_folding=True, # Optimize\n", " input_names = ['input_ids'])" ], "metadata": { "id": "ZM8xTkjeTm0c" }, "execution_count": null, "outputs": [] } ], "metadata": { "colab": { "provenance": [], "collapsed_sections": [ "td-xtcTdcoVO", "GUB8N3k9fq-E", "9lyEyWBic5RN", "wRjakUpXD3D9", "oO9g2nhlbr3o", "3UMlgohccAPg", "gev2VE5VcnaY", "ZNmaJOCUifpD", "L0eqXeQUTpXM" ] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }