{ "cells": [ { "cell_type": "markdown", "id": "c99a9e2c", "metadata": {}, "source": [ "# Import the necessary libraries" ] }, { "cell_type": "code", "execution_count": null, "id": "bb19171c", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pickle\n", "import re\n", "import string\n", "from collections.abc import Iterable\n", "\n", "import keras\n", "import matplotlib.pyplot as plt\n", "import nltk\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", "from keras.layers import (LSTM, Activation, Dense, Dropout, Embedding, Input,\n", " SpatialDropout1D)\n", "from keras.models import Model, Sequential\n", "from keras.optimizers import RMSprop\n", "from keras.preprocessing import sequence\n", "from keras.preprocessing.text import Tokenizer\n", "from keras.utils import pad_sequences, to_categorical\n", "from nltk.corpus import stopwords\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", "from sklearn.metrics import confusion_matrix\n", "from sklearn.model_selection import train_test_split\n", "\n", "nltk.download('stopwords')\n", "pd.set_option('display.max_rows', None)\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.max_colwidth', 255)" ] }, { "cell_type": "markdown", "id": "77ee39a1", "metadata": {}, "source": [ "# Dataset" ] }, { "cell_type": "markdown", "id": "2289c89e", "metadata": {}, "source": [ "## Dataset 1" ] }, { "cell_type": "code", "execution_count": null, "id": "70bddc47", "metadata": {}, "outputs": [], "source": [ "df1 = pd.read_csv(\"/kaggle/input/twitter-hate-speech/train_E6oV3lV.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "e407435d", "metadata": {}, "outputs": [], "source": [ "df1.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "4ea10f67", "metadata": {}, "outputs": [], "source": [ "sns.countplot(x='label', data=df1)" ] }, { "cell_type": "markdown", "id": "4bef62c7", "metadata": {}, "source": [ "From the above plot we can see that classes are imbalanced, we will fix it later." ] }, { "cell_type": "code", "execution_count": null, "id": "252edcb4", "metadata": {}, "outputs": [], "source": [ "# Checking the shape of the data\n", "df1.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "0e256090", "metadata": {}, "outputs": [], "source": [ "# Cheking if null values are present in the dataset or not.\n", "df1.isnull().sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "8d0cc255", "metadata": {}, "outputs": [], "source": [ "# Drop unnecessary columns\n", "df1.drop('id', axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "963f8229", "metadata": {}, "outputs": [], "source": [ "df1.head()" ] }, { "cell_type": "markdown", "id": "5767e166", "metadata": {}, "source": [ "## Dataset 2" ] }, { "cell_type": "code", "execution_count": null, "id": "bd8dde1a", "metadata": {}, "outputs": [], "source": [ "df2 = pd.read_csv(\n", " \"/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv\")\n", "df2.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "a8a4a332", "metadata": {}, "outputs": [], "source": [ "df2.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "b66a6907", "metadata": {}, "outputs": [], "source": [ "df2.isnull().sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "49db9d8d", "metadata": {}, "outputs": [], "source": [ "# Drop the columns which are not required for us.\n", "df2.drop(['Unnamed: 0', 'count', 'hate_speech',\n", " 'offensive_language', 'neither'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "48981e64", "metadata": {}, "outputs": [], "source": [ "df2.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "97b0500b", "metadata": {}, "outputs": [], "source": [ "# All the unique class labels\n", "df2['class'].unique()" ] }, { "cell_type": "code", "execution_count": null, "id": "71971d95", "metadata": {}, "outputs": [], "source": [ "# Plotting the countplot for our new dataset\n", "sns.countplot(x='class', data=df2)" ] }, { "cell_type": "markdown", "id": "1ce30639", "metadata": {}, "source": [ "- class 0 - hate speech; class 1 - offensive language; class 2 - neither" ] }, { "cell_type": "code", "execution_count": null, "id": "ce04999f", "metadata": {}, "outputs": [], "source": [ "# Merge class 0 and 1 into 1. Class 1 now represents hate speech\n", "df2[\"class\"].replace({0: 1}, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "499d5336", "metadata": {}, "outputs": [], "source": [ "df2[\"class\"].unique()" ] }, { "cell_type": "code", "execution_count": null, "id": "2cb91824", "metadata": {}, "outputs": [], "source": [ "sns.countplot(x=\"class\", data=df2)" ] }, { "cell_type": "code", "execution_count": null, "id": "9bf7ba3a", "metadata": {}, "outputs": [], "source": [ "# Replace the value of 2 to 0.Class 0 is now \"No hate\"\n", "df2[\"class\"].replace({2: 0}, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "16bc2c3e", "metadata": {}, "outputs": [], "source": [ "sns.countplot(x='class', data=df2)" ] }, { "cell_type": "code", "execution_count": null, "id": "d5834f0e", "metadata": {}, "outputs": [], "source": [ "# Rename 'class' to label\n", "df2.rename(columns={'class': 'label'}, inplace=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "0e6a6a19", "metadata": {}, "outputs": [], "source": [ "df2.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "b76458f2", "metadata": {}, "outputs": [], "source": [ "df2.iloc[0]['tweet']" ] }, { "cell_type": "markdown", "id": "42a65071", "metadata": {}, "source": [ "## Merge df1 and df2" ] }, { "cell_type": "code", "execution_count": null, "id": "77c925a5", "metadata": {}, "outputs": [], "source": [ "df = pd.concat([df1, df2])" ] }, { "cell_type": "code", "execution_count": null, "id": "b81eef43", "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "952ef123", "metadata": {}, "outputs": [], "source": [ "sns.countplot(x='label', data=df)" ] }, { "cell_type": "markdown", "id": "608c3277", "metadata": {}, "source": [ "Now we can see that the problem of imbalace data has been solved." ] }, { "cell_type": "code", "execution_count": null, "id": "293d0d21", "metadata": {}, "outputs": [], "source": [ "df.shape" ] }, { "cell_type": "markdown", "id": "4d8117e1", "metadata": {}, "source": [ "## Data cleaning" ] }, { "cell_type": "code", "execution_count": null, "id": "e76a3db9", "metadata": {}, "outputs": [], "source": [ "# Apply regex and do cleaning.\n", "def clean_text(words: str) -> str:\n", " words = str(words).lower()\n", " words = re.sub('\\[.*?\\]', '', words)\n", " words = re.sub('https?://\\S+|www\\.\\S+', '', words)\n", " words = re.sub('<.*?>+', '', words)\n", " words = re.sub(r'@\\w+', '', words)\n", " words = re.sub('[%s]' % re.escape(string.punctuation), '', words)\n", " words = re.sub('\\n', '', words)\n", " words = re.sub('\\w*\\d\\w*', '', words)\n", "\n", " stopword = set(stopwords.words('english'))\n", " words = ' '.join(\n", " [word for word in words.split(' ') if word not in stopword])\n", "\n", " stemmer = nltk.SnowballStemmer(\"english\")\n", " words = ' '.join([stemmer.stem(word) for word in words.split(' ')])\n", "\n", " return words" ] }, { "cell_type": "code", "execution_count": null, "id": "fd98ec5a", "metadata": {}, "outputs": [], "source": [ "# Apply the data_cleaning on the data.\n", "df_cleaned = df.copy()\n", "df_cleaned['tweet'] = df['tweet'].apply(clean_text)" ] }, { "cell_type": "code", "execution_count": null, "id": "b5c6a309", "metadata": {}, "outputs": [], "source": [ "df_cleaned['tweet'][1]" ] }, { "cell_type": "code", "execution_count": null, "id": "3df4b3e0", "metadata": {}, "outputs": [], "source": [ "df_cleaned.head(10)" ] }, { "cell_type": "markdown", "id": "39e9dff5", "metadata": {}, "source": [ "## Train test split" ] }, { "cell_type": "code", "execution_count": null, "id": "060e1f76", "metadata": {}, "outputs": [], "source": [ "x = df_cleaned['tweet']\n", "y = df_cleaned['label']" ] }, { "cell_type": "code", "execution_count": null, "id": "5b39fbd9", "metadata": {}, "outputs": [], "source": [ "# Split the data into train and test\n", "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)\n", "print(len(x_train), len(y_train))\n", "print(len(x_test), len(y_test))" ] }, { "cell_type": "code", "execution_count": null, "id": "29be47f4", "metadata": {}, "outputs": [], "source": [ "type(x_test), type(y_test), type(x_train), type(y_train)" ] }, { "cell_type": "code", "execution_count": null, "id": "402ecb50", "metadata": {}, "outputs": [], "source": [ "len(x_test)" ] }, { "cell_type": "markdown", "id": "0187c473", "metadata": {}, "source": [ "## Tokenization and padding" ] }, { "cell_type": "code", "execution_count": null, "id": "cc49a7f7", "metadata": {}, "outputs": [], "source": [ "def tokenize_and_pad(text_list: Iterable[str], tokenizer: Tokenizer, max_len: int) -> np.ndarray[np.str_]:\n", " sequences = tokenizer.texts_to_sequences(text_list)\n", " sequences_matrix = pad_sequences(sequences, maxlen=max_len)\n", " return sequences_matrix" ] }, { "cell_type": "code", "execution_count": null, "id": "e4329001", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "max_words = 50000\n", "max_len = 300\n", "\n", "tokenizer = Tokenizer(num_words=max_words)\n", "tokenizer.fit_on_texts(x_train)\n", "\n", "x_train_tokenized = tokenize_and_pad(x_train, tokenizer, max_len)" ] }, { "cell_type": "code", "execution_count": null, "id": "21261eee", "metadata": {}, "outputs": [], "source": [ "with open('tokenizer.pickle', 'wb') as handle:\n", " pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)" ] }, { "cell_type": "code", "execution_count": null, "id": "5833c859", "metadata": {}, "outputs": [], "source": [ "x_train_tokenized" ] }, { "cell_type": "markdown", "id": "811f8996", "metadata": {}, "source": [ "# Model" ] }, { "cell_type": "markdown", "id": "b42ceb66", "metadata": {}, "source": [ "## Model architecture" ] }, { "cell_type": "code", "execution_count": null, "id": "15e9d814", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "# Creating model architecture.\n", "model = Sequential()\n", "model.add(Embedding(max_words, 100, input_length=max_len))\n", "model.add(SpatialDropout1D(0.2))\n", "model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))\n", "model.add(Dense(1, activation='sigmoid'))\n", "\n", "model.summary()\n", "\n", "model.compile(loss='binary_crossentropy',\n", " optimizer=RMSprop(), metrics=['accuracy'])" ] }, { "cell_type": "markdown", "id": "ae55985d", "metadata": {}, "source": [ "## Callbacks" ] }, { "cell_type": "code", "execution_count": null, "id": "9065382d", "metadata": {}, "outputs": [], "source": [ "early_stopping_callback = EarlyStopping(\n", " monitor='val_loss', # Metric to monitor (e.g., validation loss)\n", " patience=3, # Number of epochs with no improvement to wait\n", " restore_best_weights=True # Restore model weights to the best achieved during training\n", ")" ] }, { "cell_type": "markdown", "id": "90fb2dbf", "metadata": {}, "source": [ "## Training\n" ] }, { "cell_type": "code", "execution_count": null, "id": "fb3a5153", "metadata": {}, "outputs": [], "source": [ "# starting model training\n", "history = model.fit(\n", " x_train_tokenized, y_train,\n", " batch_size=128,\n", " epochs=20,\n", " validation_split=0.2,\n", " callbacks=[early_stopping_callback]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "b509694a", "metadata": {}, "outputs": [], "source": [ "model.save(\"model.h5\")" ] }, { "cell_type": "markdown", "id": "01484e53", "metadata": {}, "source": [ "## Evaluation and testing" ] }, { "cell_type": "code", "execution_count": null, "id": "86a6cd51", "metadata": {}, "outputs": [], "source": [ "test_sequences = tokenizer.texts_to_sequences(x_test)\n", "test_sequences_matrix = pad_sequences(test_sequences, maxlen=max_len)" ] }, { "cell_type": "code", "execution_count": null, "id": "7674863a", "metadata": {}, "outputs": [], "source": [ "# Model evaluation\n", "accr = model.evaluate(test_sequences_matrix, y_test)" ] }, { "cell_type": "code", "execution_count": null, "id": "03f93f02", "metadata": {}, "outputs": [], "source": [ "lstm_prediction = model.predict(test_sequences_matrix)" ] }, { "cell_type": "code", "execution_count": null, "id": "2b04a6f5", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "res = []\n", "for prediction in lstm_prediction:\n", " if prediction[0] < 0.5:\n", " res.append(0)\n", " else:\n", " res.append(1)" ] }, { "cell_type": "code", "execution_count": null, "id": "20ec485c", "metadata": {}, "outputs": [], "source": [ "print(confusion_matrix(y_test, res))" ] }, { "cell_type": "code", "execution_count": null, "id": "0062900e", "metadata": {}, "outputs": [], "source": [ "load_model = keras.models.load_model(\"model.h5\")\n", "with open('tokenizer.pickle', 'rb') as handle:\n", " load_tokenizer = pickle.load(handle)" ] }, { "cell_type": "code", "execution_count": null, "id": "5612cac0", "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "# Let's test our model on custom data.\n", "test = 'humans are idiots'\n", "\n", "\n", "def clean_text(text):\n", " print(text)\n", " text = str(text).lower()\n", " text = re.sub('\\[.*?\\]', '', text)\n", " text = re.sub('https?://\\S+|www\\.\\S+', '', text)\n", " text = re.sub('<.*?>+', '', text)\n", " text = re.sub('[%s]' % re.escape(string.punctuation), '', text)\n", " text = re.sub('\\n', '', text)\n", " text = re.sub('\\w*\\d\\w*', '', text)\n", " print(text)\n", " text = [word for word in text.split(' ') if word not in stopword]\n", " text = \" \".join(text)\n", " text = [stemmer.stem(word) for word in text.split(' ')]\n", " text = \" \".join(text)\n", " return text\n", "\n", "\n", "test = [clean_text(test)]\n", "print(test)\n", "seq = load_tokenizer.texts_to_sequences(test)\n", "padded = pad_sequences(seq, maxlen=300)\n", "print(seq)\n", "pred = load_model.predict(padded)\n", "print(\"pred\", pred)\n", "if pred < 0.5:\n", " print(\"no hate\")\n", "else:\n", " print(\"hate and abusive\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d90fb1eb", "metadata": {}, "outputs": [], "source": [ "model.summary()" ] }, { "cell_type": "code", "execution_count": null, "id": "e564ae3e", "metadata": {}, "outputs": [], "source": [ "while True:\n", " pass" ] }, { "cell_type": "code", "execution_count": null, "id": "41301aee", "metadata": {}, "outputs": [], "source": [ "# https://www.kaggle.com/soumyaprabhamaiti/hate-speech-classification/edit" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 5 }