Spaces:

soumyaprabhamaiti
/

hate_speech_classifier

Runtime error

App Files Files Community

soumyaprabhamaiti commited on Sep 3, 2023

Commit

3abbcfd

•

1 Parent(s): 5ce506c

Add development folder

Browse files

Files changed (2) hide show

development/hate-speech-classification.ipynb +815 -0
development/requirements_dev.txt +8 -0

development/hate-speech-classification.ipynb ADDED Viewed

	@@ -0,0 +1,815 @@

+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "c99a9e2c",
+ "metadata": {},
+ "source": [
+ "# Import the necessary libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bb19171c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import pickle\n",
+ "import re\n",
+ "import string\n",
+ "from collections.abc import Iterable\n",
+ "\n",
+ "import keras\n",
+ "import matplotlib.pyplot as plt\n",
+ "import nltk\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
+ "from keras.layers import (LSTM, Activation, Dense, Dropout, Embedding, Input,\n",
+ " SpatialDropout1D)\n",
+ "from keras.models import Model, Sequential\n",
+ "from keras.optimizers import RMSprop\n",
+ "from keras.preprocessing import sequence\n",
+ "from keras.preprocessing.text import Tokenizer\n",
+ "from keras.utils import pad_sequences, to_categorical\n",
+ "from nltk.corpus import stopwords\n",
+ "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "nltk.download('stopwords')\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "pd.set_option('display.max_columns', None)\n",
+ "pd.set_option('display.max_colwidth', 255)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "77ee39a1",
+ "metadata": {},
+ "source": [
+ "# Dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2289c89e",
+ "metadata": {},
+ "source": [
+ "## Dataset 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "70bddc47",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1 = pd.read_csv(\"/kaggle/input/twitter-hate-speech/train_E6oV3lV.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e407435d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4ea10f67",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.countplot(x='label', data=df1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4bef62c7",
+ "metadata": {},
+ "source": [
+ "From the above plot we can see that the classes are imbalanced; we will fix this later."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "252edcb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Checking the shape of the data\n",
+ "df1.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e256090",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Checking if null values are present in the dataset or not.\n",
+ "df1.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8d0cc255",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop unnecessary columns\n",
+ "df1.drop('id', axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "963f8229",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5767e166",
+ "metadata": {},
+ "source": [
+ "## Dataset 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bd8dde1a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2 = pd.read_csv(\n",
+ " \"/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv\")\n",
+ "df2.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a8a4a332",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b66a6907",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "49db9d8d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop the columns which are not required for us.\n",
+ "df2.drop(['Unnamed: 0', 'count', 'hate_speech',\n",
+ " 'offensive_language', 'neither'], axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "48981e64",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "97b0500b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# All the unique class labels\n",
+ "df2['class'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "71971d95",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plotting the countplot for our new dataset\n",
+ "sns.countplot(x='class', data=df2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ce30639",
+ "metadata": {},
+ "source": [
+ "- class 0 - hate speech; class 1 - offensive language; class 2 - neither"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce04999f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Merge class 0 (hate speech) into class 1 (offensive language); class 1 now\n",
+ "# represents hate speech. The result is assigned back instead of calling\n",
+ "# replace(..., inplace=True) on the column selection: that chained in-place\n",
+ "# form warns on pandas 2.2 and leaves df2 unchanged under Copy-on-Write.\n",
+ "df2[\"class\"] = df2[\"class\"].replace({0: 1})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "499d5336",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2[\"class\"].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2cb91824",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.countplot(x=\"class\", data=df2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9bf7ba3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Replace the value 2 with 0. Class 0 is now \"No hate\".\n",
+ "# Assigned back rather than replaced in place on the column selection, which\n",
+ "# does not update df2 under pandas Copy-on-Write.\n",
+ "df2[\"class\"] = df2[\"class\"].replace({2: 0})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "16bc2c3e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.countplot(x='class', data=df2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5834f0e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Rename 'class' to label\n",
+ "df2.rename(columns={'class': 'label'}, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e6a6a19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b76458f2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.iloc[0]['tweet']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "42a65071",
+ "metadata": {},
+ "source": [
+ "## Merge df1 and df2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "77c925a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it\n",
+ "# the row labels of df1 and df2 overlap, so a label lookup such as\n",
+ "# df_cleaned['tweet'][1] returns two rows instead of one.\n",
+ "df = pd.concat([df1, df2], ignore_index=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b81eef43",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "952ef123",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.countplot(x='label', data=df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "608c3277",
+ "metadata": {},
+ "source": [
+ "Now we can see that the problem of imbalanced data has been solved."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "293d0d21",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4d8117e1",
+ "metadata": {},
+ "source": [
+ "## Data cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e76a3db9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Apply regex and do cleaning.\n",
+ "# Built once rather than on every call: loading the stop-word list and\n",
+ "# constructing the stemmer is loop-invariant work when cleaning a whole column.\n",
+ "_STOPWORDS = frozenset(stopwords.words('english'))\n",
+ "_STEMMER = nltk.SnowballStemmer(\"english\")\n",
+ "_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))\n",
+ "\n",
+ "\n",
+ "def clean_text(words: str) -> str:\n",
+ "    \"\"\"Lower-case a tweet, strip noise, drop stop words and stem what is left.\n",
+ "\n",
+ "    Removed, in order: bracketed spans, URLs, HTML-like tags, @mentions,\n",
+ "    punctuation, newlines and any token containing a digit. The result is split\n",
+ "    on single spaces, English stop words are dropped and the remaining tokens\n",
+ "    are Snowball-stemmed and re-joined with single spaces.\n",
+ "\n",
+ "    Args:\n",
+ "        words: Raw tweet text; non-string values are passed through str().\n",
+ "\n",
+ "    Returns:\n",
+ "        The cleaned, stemmed text.\n",
+ "    \"\"\"\n",
+ "    words = str(words).lower()\n",
+ "    # Raw strings keep regex escapes from being read as invalid string escapes.\n",
+ "    words = re.sub(r'\\[.*?\\]', '', words)\n",
+ "    words = re.sub(r'https?://\\S+|www\\.\\S+', '', words)\n",
+ "    words = re.sub(r'<.*?>+', '', words)\n",
+ "    words = re.sub(r'@\\w+', '', words)\n",
+ "    words = _PUNCTUATION_RE.sub('', words)\n",
+ "    words = re.sub(r'\\n', '', words)\n",
+ "    words = re.sub(r'\\w*\\d\\w*', '', words)\n",
+ "\n",
+ "    # Single pass: filter stop words, then stem. Splitting on ' ' (not on any\n",
+ "    # whitespace) is kept so the output matches the previous implementation.\n",
+ "    return ' '.join(\n",
+ "        _STEMMER.stem(word) for word in words.split(' ') if word not in _STOPWORDS)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fd98ec5a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Apply clean_text to every tweet.\n",
+ "df_cleaned = df.copy()\n",
+ "df_cleaned['tweet'] = df['tweet'].apply(clean_text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b5c6a309",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_cleaned['tweet'][1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3df4b3e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_cleaned.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "39e9dff5",
+ "metadata": {},
+ "source": [
+ "## Train test split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "060e1f76",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x = df_cleaned['tweet']\n",
+ "y = df_cleaned['label']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5b39fbd9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the data into train and test\n",
+ "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)\n",
+ "print(len(x_train), len(y_train))\n",
+ "print(len(x_test), len(y_test))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "29be47f4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "type(x_test), type(y_test), type(x_train), type(y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "402ecb50",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "len(x_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0187c473",
+ "metadata": {},
+ "source": [
+ "## Tokenization and padding"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc49a7f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def tokenize_and_pad(text_list: Iterable[str], tokenizer: Tokenizer, max_len: int) -> np.ndarray:\n",
+ "    \"\"\"Convert texts to integer sequences and pad them to a common length.\n",
+ "\n",
+ "    Args:\n",
+ "        text_list: Texts to encode.\n",
+ "        tokenizer: Tokenizer whose texts_to_sequences() performs the encoding.\n",
+ "        max_len: Value passed to pad_sequences as maxlen.\n",
+ "\n",
+ "    Returns:\n",
+ "        The array produced by pad_sequences: one row per text, holding word\n",
+ "        indices (integers, not strings).\n",
+ "    \"\"\"\n",
+ "    sequences = tokenizer.texts_to_sequences(text_list)\n",
+ "    return pad_sequences(sequences, maxlen=max_len)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e4329001",
+ "metadata": {
+ "lines_to_next_cell": 2
+ },
+ "outputs": [],
+ "source": [
+ "max_words = 50000\n",
+ "max_len = 300\n",
+ "\n",
+ "tokenizer = Tokenizer(num_words=max_words)\n",
+ "tokenizer.fit_on_texts(x_train)\n",
+ "\n",
+ "x_train_tokenized = tokenize_and_pad(x_train, tokenizer, max_len)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "21261eee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('tokenizer.pickle', 'wb') as handle:\n",
+ " pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5833c859",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x_train_tokenized"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "811f8996",
+ "metadata": {},
+ "source": [
+ "# Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b42ceb66",
+ "metadata": {},
+ "source": [
+ "## Model architecture"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "15e9d814",
+ "metadata": {
+ "lines_to_next_cell": 2
+ },
+ "outputs": [],
+ "source": [
+ "# Creating model architecture.\n",
+ "model = Sequential()\n",
+ "model.add(Embedding(max_words, 100, input_length=max_len))\n",
+ "model.add(SpatialDropout1D(0.2))\n",
+ "model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))\n",
+ "model.add(Dense(1, activation='sigmoid'))\n",
+ "\n",
+ "model.summary()\n",
+ "\n",
+ "model.compile(loss='binary_crossentropy',\n",
+ " optimizer=RMSprop(), metrics=['accuracy'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ae55985d",
+ "metadata": {},
+ "source": [
+ "## Callbacks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9065382d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "early_stopping_callback = EarlyStopping(\n",
+ " monitor='val_loss', # Metric to monitor (e.g., validation loss)\n",
+ " patience=3, # Number of epochs with no improvement to wait\n",
+ " restore_best_weights=True # Restore model weights to the best achieved during training\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "90fb2dbf",
+ "metadata": {},
+ "source": [
+ "## Training\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fb3a5153",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# starting model training\n",
+ "history = model.fit(\n",
+ " x_train_tokenized, y_train,\n",
+ " batch_size=128,\n",
+ " epochs=20,\n",
+ " validation_split=0.2,\n",
+ " callbacks=[early_stopping_callback]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b509694a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.save(\"model.h5\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "01484e53",
+ "metadata": {},
+ "source": [
+ "## Evaluation and testing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "86a6cd51",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reuse the helper that encoded the training split so both splits are\n",
+ "# tokenized and padded the same way.\n",
+ "test_sequences_matrix = tokenize_and_pad(x_test, tokenizer, max_len)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7674863a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Model evaluation\n",
+ "accr = model.evaluate(test_sequences_matrix, y_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "03f93f02",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lstm_prediction = model.predict(test_sequences_matrix)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b04a6f5",
+ "metadata": {
+ "lines_to_next_cell": 2
+ },
+ "outputs": [],
+ "source": [
+ "# Threshold the model outputs in one vectorized step: scores below 0.5 map\n",
+ "# to class 0, everything else to class 1 (same comparison as the former loop).\n",
+ "res = np.where(lstm_prediction[:, 0] < 0.5, 0, 1).tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "20ec485c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(confusion_matrix(y_test, res))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0062900e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "load_model = keras.models.load_model(\"model.h5\")\n",
+ "with open('tokenizer.pickle', 'rb') as handle:\n",
+ " load_tokenizer = pickle.load(handle)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5612cac0",
+ "metadata": {
+ "lines_to_next_cell": 2
+ },
+ "outputs": [],
+ "source": [
+ "# Let's test our model on custom data.\n",
+ "# The clean_text defined in the \"Data cleaning\" section is reused here so the\n",
+ "# sample gets exactly the preprocessing the training tweets got. The copy that\n",
+ "# used to be redefined in this cell read `stopword` and `stemmer`, which do not\n",
+ "# exist at module scope, and it skipped the @mention removal.\n",
+ "test = 'humans are idiots'\n",
+ "\n",
+ "test = [clean_text(test)]\n",
+ "print(test)\n",
+ "seq = load_tokenizer.texts_to_sequences(test)\n",
+ "# Pad to the same length the model was trained with.\n",
+ "padded = pad_sequences(seq, maxlen=max_len)\n",
+ "print(seq)\n",
+ "pred = load_model.predict(padded)\n",
+ "print(\"pred\", pred)\n",
+ "# predict() returns one row per input; read the single score explicitly\n",
+ "# instead of relying on the truth value of a 2-D array.\n",
+ "if pred[0][0] < 0.5:\n",
+ "    print(\"no hate\")\n",
+ "else:\n",
+ "    print(\"hate and abusive\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d90fb1eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.summary()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e564ae3e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Intentionally left without code: the `while True: pass` busy-wait that was\n",
+ "# here never returns, so \"Run All\" and headless execution could not finish."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "41301aee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://www.kaggle.com/soumyaprabhamaiti/hate-speech-classification/edit"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

development/requirements_dev.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+tensorflow
+numpy
+pandas
+seaborn
+matplotlib
+gradio
+nltk
+jupytext