# Install Necessary Packages

In [None]:
#Necessary installations
!pip install datasets evaluate transformers[sentencepiece]
!pip install huggingface_hub
!pip install pandas
!pip install imblearn
!pip install torch

# Load the Dataset

In [None]:
from datasets import Features, Value, ClassLabel
import pandas as pd

from datasets import load_dataset
dataset = load_dataset("19kmunz/iot-23-preprocessed-minimumcolumns")
print(dataset.shape)

# Oversample the Dataset

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
df = dataset['train'].to_pandas()


In [None]:
# Separate features and target
features = ['id.resp_p', 'proto', 'conn_state', 'orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']
X = df[features]
y = df['label']

The ADASYN and SMOTE oversampling algorithms expect numeric data, but features such as `proto` and `conn_state` are non-numeric categorical columns, and SMOTE cannot handle string values like 'tcp' in those columns. So, I applied label encoding to the categorical columns and then applied SMOTE.

In [None]:
#########################################NEWWWW#############################################
# Define categorical columns to be label-encoded
cat_cols = ['proto', 'conn_state']

# Initialize a dictionary to store label encoders for each column
label_encoders = {}
label_encoded_columns = {} # Store label-encoded columns

for col in cat_cols:
 le = LabelEncoder()
 label_encoded = le.fit_transform(df[col])
 df[col + '_label'] = label_encoded # Create new columns with label-encoded data
 label_encoders[col] = le
 label_encoded_columns[col] = label_encoded
# Get numeric columns
num_cols = ['id.resp_p','orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']

# Extract numeric columns
X_num = df[num_cols]

# Concatenate label-encoded columns and numeric columns
X_combined = pd.concat([df[['proto_label', 'conn_state_label']], X_num], axis=1)

# Store the labels in y_os
y_os = df['label']
y_os1 = df['label'].apply(lambda x: 0 if x == "Benign" else 1)

# Specify desired number of samples
#k_neighbors = 10000 - y_os.shape[0]

# Perform oversampling using SMOTE
smote = SMOTE(sampling_strategy={0: 5000, 1: 5000})
X_combined_os, Y_combined_os = smote.fit_resample(X_combined, y_os1)

In [None]:
# Print new class counts
print(Y_combined_os.value_counts())
print(X_combined_os.shape)

# Split the Dataset

In [None]:
# Manually define the column names (encoded categorical columns first, then the numeric ones)
column_names = ['proto_label', 'conn_state_label', 'id.resp_p','orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']
result_column = ['label']

# Create new DataFrames from the oversampled features and labels with the specified column names
X_combined_os_df = pd.DataFrame(X_combined_os, columns=column_names)
Y_combined_os_df = pd.DataFrame(Y_combined_os, columns=result_column)

# Print the shape and the first 5 rows of the oversampled feature data
print(X_combined_os_df.shape)
print(X_combined_os_df.head())

In [None]:
# Split the oversampled data into train / validation / test sets

# First split: 80% of the rows go to the training set, 20% to a temporary hold-out set
X_train, X_temp, y_train, y_temp = train_test_split(X_combined_os_df, Y_combined_os_df, test_size=0.2, random_state=42)

# Second split: 20% of the hold-out set becomes the test set, the remaining 80% the validation set
# (overall 80% train / 16% validation / 4% test).
# NOTE(review): confirm this uneven validation/test ratio is intended.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Report the size of each split
print("Oversampled dataset shape:", X_combined_os.shape)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("X_val shape:", X_val.shape)


# Tokenize the Dataset

### Run the following cells only if loading from local files. Otherwise, X_train and y_train are already defined.

In [None]:
import pandas as pd
X_train = pd.read_csv('X_train.csv', index_col=0)
y_train = pd.read_csv('y_train.csv', index_col=0)

In [None]:
# torch is imported here because this cell can be run before any other cell that imports it
import torch

# Load previously saved tokenized splits from the working directory
train_encodings = torch.load('train_encodings.pt')
val_encodings = torch.load('val_encodings.pt')
test_encodings = torch.load('test_encodings.pt')

### Otherwise, Continue running here

In [None]:
# Dictionary of feature names to use in the make sentence function
feature_names = {
    'id.resp_p': 'response port',
    'proto_label': 'transport protocol',
    'orig_pkts': 'number of packets sent by the origin',
    'conn_state_label': 'connection state',
    'orig_ip_bytes': 'number of IP level bytes sent by the originator',
    'resp_ip_bytes': 'number of IP level bytes sent by the responder',
}


# Function to make sentences out of the data
def make_sentence(row):
    """Describe each feature of one observation as a short sentence.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing
            ``keys()`` and item access by feature name.

    Returns:
        dict mapping each feature name (every key except ``'label'``) to a
        sentence of the form ``"<description> is <value>."``.

    Raises:
        KeyError: If a non-label key has no entry in ``feature_names``.
    """
    return {
        feature: f"{feature_names[feature]} is {row[feature]!s}."
        for feature in row.keys()
        if feature != 'label'
    }

In [None]:
# Take all sentence observations and make them into paragraph inputs
def make_paragraphs(ser):
    """Join each observation's feature sentences into one paragraph string.

    Args:
        ser: pandas Series whose values are dicts of feature sentences
            (as produced by applying ``make_sentence`` to each row).

    Returns:
        pandas Series of paragraph strings sharing the index of ``ser``.

    Raises:
        KeyError: If an observation lacks one of the six expected features.
    """
    # Sentences always appear in this fixed order within a paragraph
    feature_order = (
        'id.resp_p',
        'proto_label',
        'conn_state_label',
        'orig_pkts',
        'orig_ip_bytes',
        'resp_ip_bytes',
    )
    paragraphs_list = [
        " ".join(obs[feature] for feature in feature_order)
        for obs in ser
    ]
    return pd.Series(paragraphs_list, index=ser.index)

In [None]:
from transformers import BertTokenizer

# Tokenizer used to encode the generated paragraphs
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Step 1: turn every row of each split into a dict of per-feature sentences
X_train_sentences = X_train.apply(make_sentence, axis=1)
X_val_sentences = X_val.apply(make_sentence, axis=1)
X_test_sentences = X_test.apply(make_sentence, axis=1)

# Step 2: merge the sentences of each row into a single paragraph
X_train_paragraphs = make_paragraphs(X_train_sentences)
X_val_paragraphs = make_paragraphs(X_val_sentences)
X_test_paragraphs = make_paragraphs(X_test_sentences)

# String versions of the labels of each split
y_train_str = list(map(str, y_train['label'].tolist()))
y_val_str = list(map(str, y_val['label'].tolist()))
y_test_str = list(map(str, y_test['label'].tolist()))

In [None]:
import torch

# Tokenizer settings shared by all splits: pad to the longest paragraph of the
# split being encoded, truncate over-long inputs, and return PyTorch tensors
tokenizer_options = {'padding': 'longest', 'truncation': True, 'return_tensors': 'pt'}

# Encode the paragraphs of each split
train_encodings = tokenizer(text=X_train_paragraphs.tolist(), **tokenizer_options)
val_encodings = tokenizer(text=X_val_paragraphs.tolist(), **tokenizer_options)
test_encodings = tokenizer(text=X_test_paragraphs.tolist(), **tokenizer_options)

# Build one label tensor per split from its 'label' column
y_train_tensor = torch.tensor(y_train['label'].values)
y_val_tensor = torch.tensor(y_val['label'].values)
y_test_tensor = torch.tensor(y_test['label'].values)

# Store the labels next to the token tensors
train_encodings['labels'] = y_train_tensor
val_encodings['labels'] = y_val_tensor
test_encodings['labels'] = y_test_tensor

In [None]:
torch.save(train_encodings, 'train_encodings.pt')
torch.save(val_encodings, 'val_encodings.pt')
torch.save(test_encodings, 'test_encodings.pt')

# Finally, prepare dataset as Hugging Face Dataset

### Optional: Load training, validation, and test encodings in from Drive or local

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install torch==2.1.0
!pip install -U transformers[torch]
!pip install optimum[exporters]

In [None]:
import torch
from transformers import BertTokenizer
# Load tensor data back from drive
train_encodings = torch.load("/content/drive/MyDrive/CS513 Final Project/Resources/train_encodings.pt")
val_encodings = torch.load("/content/drive/MyDrive/CS513 Final Project/Resources/val_encodings.pt")
test_encodings = torch.load("/content/drive/MyDrive/CS513 Final Project/Resources/test_encodings.pt")

# Load labels tensors back from drive
# y_train_tensor = torch.load("/content/drive/MyDrive/CS513 Final Project/Resources/y_train_tensor.pt")
# y_val_tensor = torch.load("/content/drive/MyDrive/CS513 Final Project/Resources/y_val_tensor.pt")
# y_test_tensor = torch.load("/content/drive/MyDrive/CS513 Final Project/Resources/y_test_tensor.pt")

In [None]:
# FROM LOCAL
import torch
train_encodings = torch.load("train_encodings.pt")
val_encodings = torch.load("val_encodings.pt")
test_encodings = torch.load("test_encodings.pt")

In [None]:
print(train_encodings['input_ids'].size())

torch.Size([8000, 67])


### Otherwise, continue running here

In [None]:
# Select the encodings used for fine-tuning (the full splits by default)
train = train_encodings
val = val_encodings  # named `val` so the built-in eval() is not shadowed
test = test_encodings

# To test fine-tuning on a small subset, use these instead (delete [:1000] for the full dataset)
#train = train_encodings[:1000]
#val = val_encodings[:1000]
#test = test_encodings[:1000]

# Replacing target tensors for the subset (delete [:1000] for the full label tensors)
# train['labels'] = y_train_tensor[:1000]
# val['labels'] = y_val_tensor[:1000]
# test['labels'] = y_test_tensor[:1000]

# Pytorch tensors to HF Dataset
from datasets import Dataset
train_dataset = Dataset.from_dict(train)
eval_dataset = Dataset.from_dict(val)
test_dataset = Dataset.from_dict(test)

# Fine-tune BERT for benign vs malicious

In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [None]:
import numpy as np
import evaluate

combined_metrics = evaluate.combine(["accuracy", "f1"])

In [None]:
def compute_metrics(eval_pred):
    """Compute and print accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        The dict returned by ``combined_metrics.compute``, which is read
        here through its ``'accuracy'`` and ``'f1'`` keys.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    results = combined_metrics.compute(predictions=predictions, references=labels)
    # Accuracy is a fraction between 0 and 1, so it is formatted as a percentage
    # instead of appending a literal '%' to the raw fraction.
    print(f"Accuracy: {results['accuracy']:.1%} | F1: {results['f1']:.3f}")
    return results

In [None]:
# Load pretrained BERT model.
# The checkpoint must match the tokenizer that produced the input IDs
# ("bert-base-cased"); pairing cased token IDs with the uncased checkpoint
# would look up the wrong pretrained embeddings.
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

# OR Load local model
# model = BertForSequenceClassification.from_pretrained('./model', num_labels=2)

In [None]:
# Define TrainingArguments: checkpoints are written to ./results and logs to ./logs
training_args = TrainingArguments(
 output_dir='./results',
 num_train_epochs=6,
 per_device_train_batch_size=32,
 # per_device_eval_batch_size=16,
 warmup_steps=500,
 weight_decay=0.01,
 logging_dir='./logs',
 # logging_steps=0.10,
 # NOTE(review): the logging/evaluation/save strategies below are all 'epoch', so these
 # step intervals presumably have no effect — confirm against the TrainingArguments docs.
 eval_steps=0.10,
 save_steps=0.10,
 logging_strategy='epoch',
 evaluation_strategy='epoch',
 save_strategy='epoch',
 # Keep at most two checkpoints on disk
 save_total_limit=2,
 # NOTE(review): no metric_for_best_model is set, so the "best" model is presumably
 # chosen by evaluation loss — confirm.
 load_best_model_at_end=True
)

# Create Trainer instance with the training/validation datasets and the metric callback
trainer = Trainer(
 model=model,
 args=training_args,
 train_dataset=train_dataset,
 eval_dataset=eval_dataset,
 compute_metrics=compute_metrics
)

# Train
trainer.train()

In [None]:
print(test_dataset)

In [None]:
# Use test_dataset instead to test it later
trainer.evaluate(eval_dataset=test_dataset)

In [None]:
model.save_pretrained('./model')

# Save to Hugging Face

In [None]:
from huggingface_hub import create_repo

In [None]:
!pip install cupy --upgrade

In [None]:
libcuda.so.1

In [None]:
!pip install onnxruntime
import onnxruntime as rt
import onnx
import cv2

In [None]:
!optimum-cli export onnx --model ./ --task question-answering ./results/checkpoint-10

In [None]:
# ORTModelForSequenceClassification is provided by optimum, not by the onnxruntime package itself
from optimum.onnxruntime import ORTModelForSequenceClassification

# from_pretrained expects a model ID or directory rather than a model object, so the
# export starts from the directory written by model.save_pretrained('./model')
ort_model = ORTModelForSequenceClassification.from_pretrained('./model', export=True)

# Save the exported ONNX model
ort_model.save_pretrained("./results/checkpoint-10")

In [None]:
# Export model
import torch

# Trace with a single training example instead of the whole training set: the
# example only fixes the graph structure, and the dynamic axes declared below
# let the exported model accept any batch size and sequence length.
input_ids = torch.tensor(train_dataset[:1]['input_ids'])
# The traced input must live on the same device as the model's weights
input_ids = input_ids.to(model.device)

# Disable dropout so the exported graph is deterministic
model.eval()

torch.onnx.export(model,                        # Model being run
                  input_ids,                    # Example model input
                  "IoT23_Log_Prediction.onnx",  # Where to save the model
                  export_params=True,           # Store model parameters
                  input_names=['input_ids'],
                  output_names=['labels'],
                  dynamic_axes={
                      'input_ids': {0: 'batch', 1: 'sequence'},
                      'labels': {0: 'batch'},
                  },
                  opset_version=11,             # ONNX opset version
                  do_constant_folding=True)     # Optimize