# Import the necessary libraries

In [None]:
import os
import pickle
import re
import string
from collections.abc import Iterable

import keras
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import (LSTM, Activation, Dense, Dropout, Embedding, Input,
 SpatialDropout1D)
from keras.models import Model, Sequential
from keras.optimizers import RMSprop
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Fetch the NLTK stop-word corpus that the text-cleaning step relies on.
nltk.download('stopwords')

# Lift pandas' display limits so whole frames and long tweets are shown.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.max_colwidth', 255),
):
    pd.set_option(option_name, option_value)

# Dataset

## Dataset 1

In [None]:
# Load dataset 1 from the Kaggle input directory into a DataFrame.
df1 = pd.read_csv("/kaggle/input/twitter-hate-speech/train_E6oV3lV.csv")

In [None]:
df1.head()

In [None]:
sns.countplot(x='label', data=df1)

The plot above shows that the classes are imbalanced; we will fix this later.

In [None]:
# Checking the shape of the data
df1.shape

In [None]:
# Count the null values in each column of the dataset.
df1.isnull().sum()

In [None]:
# The 'id' column is not needed for classification, so remove it in place.
df1.drop(columns='id', inplace=True)

In [None]:
df1.head()

## Dataset 2

In [None]:
# Load dataset 2 from the Kaggle input directory and preview its first rows.
df2 = pd.read_csv(
 "/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv")
df2.head()

In [None]:
df2.shape

In [None]:
df2.isnull().sum()

In [None]:
# Remove, in place, the columns that are not required downstream.
unused_columns = [
    'Unnamed: 0',
    'count',
    'hate_speech',
    'offensive_language',
    'neither',
]
df2.drop(columns=unused_columns, inplace=True)

In [None]:
df2.head()

In [None]:
# All the unique class labels
df2['class'].unique()

In [None]:
# Plotting the countplot for our new dataset
sns.countplot(x='class', data=df2)

- class 0 - hate speech; class 1 - offensive language; class 2 - neither

In [None]:
# Merge class 0 and 1 into 1. Class 1 now represents hate speech.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the selection df2["class"]: the chained in-place
# form operates on an intermediate object, emits a FutureWarning in recent
# pandas and does not update df2 at all once Copy-on-Write is enabled.
df2["class"] = df2["class"].replace({0: 1})

In [None]:
df2["class"].unique()

In [None]:
sns.countplot(x="class", data=df2)

In [None]:
# Replace the value 2 with 0. Class 0 is now "No hate".
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the selection df2["class"]: the chained in-place
# form operates on an intermediate object, emits a FutureWarning in recent
# pandas and does not update df2 at all once Copy-on-Write is enabled.
df2["class"] = df2["class"].replace({2: 0})

In [None]:
sns.countplot(x='class', data=df2)

In [None]:
# Give the target column the same name ('label') that df1 uses.
column_renames = {'class': 'label'}
df2.rename(columns=column_renames, inplace=True)

In [None]:
df2.head()

In [None]:
df2.iloc[0]['tweet']

## Merge df1 and df2

In [None]:
# Stack both datasets into one frame. ignore_index=True renumbers the rows
# 0..n-1; without it each source frame keeps its own 0-based labels, so the
# merged index contains duplicates and label lookups return several rows.
df = pd.concat([df1, df2], ignore_index=True)

In [None]:
df.head()

In [None]:
sns.countplot(x='label', data=df)

Now we can see that the problem of imbalanced data has been solved.

In [None]:
df.shape

## Data cleaning

In [None]:
# Apply regex and do cleaning.

# Lazily filled cache for the NLTK resources used by clean_text. Loading the
# stop-word list and building the stemmer once avoids repeating that work
# for every tweet when the function is applied to a whole column.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the cached (stop-word set, stemmer) pair, creating it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stopwords'] = frozenset(
            stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['stemmer'] = nltk.SnowballStemmer("english")
    return _CLEAN_TEXT_RESOURCES['stopwords'], _CLEAN_TEXT_RESOURCES['stemmer']


def clean_text(words: str) -> str:
    """Normalise one tweet for tokenization.

    The input is converted with ``str()`` and lower-cased; then, in order,
    bracketed text, URLs, HTML-like tags, ``@mentions``, punctuation,
    newlines and any token containing a digit are removed. English stop
    words are dropped and the remaining tokens are Snowball-stemmed.

    Args:
        words: Raw tweet text (any object is accepted; it is passed to
            ``str()`` first).

    Returns:
        The cleaned text, with tokens joined by single spaces.
    """
    stopword, stemmer = _clean_text_resources()

    words = str(words).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # invalid string escape sequences.
    words = re.sub(r'\[.*?\]', '', words)
    words = re.sub(r'https?://\S+|www\.\S+', '', words)
    words = re.sub(r'<.*?>+', '', words)
    words = re.sub(r'@\w+', '', words)
    words = re.sub('[%s]' % re.escape(string.punctuation), '', words)
    # Newlines are deleted outright (not replaced by a space).
    words = re.sub(r'\n', '', words)
    words = re.sub(r'\w*\d\w*', '', words)

    # Splitting on a single space (not arbitrary whitespace) is deliberate:
    # it keeps the token boundaries identical to the original behaviour.
    words = ' '.join(
        [word for word in words.split(' ') if word not in stopword])
    words = ' '.join([stemmer.stem(word) for word in words.split(' ')])

    return words

In [None]:
# Clean every tweet, then store the result in a copy so that the merged
# frame `df` keeps the raw text.
cleaned_tweets = df['tweet'].apply(clean_text)
df_cleaned = df.copy()
df_cleaned['tweet'] = cleaned_tweets

In [None]:
# Show the second cleaned tweet. A positional lookup is used because the
# frame's index labels are not guaranteed to be unique after the merge, in
# which case a label lookup ([1]) would return several rows.
df_cleaned['tweet'].iloc[1]

In [None]:
df_cleaned.head(10)

## Train test split

In [None]:
# Features are the cleaned tweets; targets are the binary labels.
x, y = df_cleaned['tweet'], df_cleaned['label']

In [None]:
# Split into train and test sets (fixed seed for a reproducible split),
# then print the size of each features/labels pair as a sanity check.
split_parts = train_test_split(x, y, random_state=42)
x_train, x_test, y_train, y_test = split_parts
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(len(features), len(labels))

In [None]:
type(x_test), type(y_test), type(x_train), type(y_train)

In [None]:
len(x_test)

## Tokenization and padding

In [None]:
def tokenize_and_pad(text_list: Iterable[str], tokenizer: Tokenizer, max_len: int) -> np.ndarray:
    """Convert texts to fixed-length integer sequences.

    Args:
        text_list: The texts to encode.
        tokenizer: A fitted Keras ``Tokenizer`` used to map words to indices.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        The array produced by ``pad_sequences`` with its default padding and
        truncation settings: one row of integer token indices per input
        text, each of length ``max_len``.
    """
    sequences = tokenizer.texts_to_sequences(text_list)
    return pad_sequences(sequences, maxlen=max_len)

In [None]:
# Vocabulary cap for the tokenizer and fixed sequence length for the model.
max_words, max_len = 50000, 300

# Fit the vocabulary on the training texts only, then encode them.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)
x_train_tokenized = tokenize_and_pad(
    text_list=x_train, tokenizer=tokenizer, max_len=max_len)

In [None]:
# Persist the fitted tokenizer so inference can reuse the same vocabulary.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
x_train_tokenized

# Model

## Model architecture

In [None]:
# Model architecture: embedding -> spatial dropout -> LSTM -> sigmoid unit.
model = Sequential([
    Embedding(max_words, 100, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()

# Single sigmoid output, hence binary cross-entropy as the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(),
    metrics=['accuracy'],
)

## Callbacks

In [None]:
# Stop training once the validation loss stops improving, and keep the
# weights from the best epoch.
early_stopping_callback = EarlyStopping(
 monitor='val_loss', # Quantity watched after each epoch: the validation loss
 patience=3, # Number of epochs without improvement to tolerate before stopping
 restore_best_weights=True # Roll the model back to the weights of the best epoch
)

## Training


In [None]:
# Train the model on the padded training sequences. 20% of the training
# data is held out for validation (validation_split), and the early-stopping
# callback may end training before the 20-epoch limit is reached.
history = model.fit(
 x_train_tokenized, y_train,
 batch_size=128,
 epochs=20,
 validation_split=0.2,
 callbacks=[early_stopping_callback]
)

In [None]:
# Persist the trained model to "model.h5" so it can be reloaded later.
model.save("model.h5")

## Evaluation and testing

In [None]:
# Encode the test texts with the same helper, tokenizer and length that were
# used for the training data, so both sets are preprocessed identically.
test_sequences_matrix = tokenize_and_pad(x_test, tokenizer, max_len)

In [None]:
# Evaluate the trained model on the held-out test set. `accr` holds whatever
# model.evaluate returns (presumably the loss followed by the compiled
# accuracy metric — confirm against the Keras version in use).
accr = model.evaluate(test_sequences_matrix, y_test)

In [None]:
# Run the trained model on the padded test sequences.
lstm_prediction = model.predict(test_sequences_matrix)

In [None]:
# Threshold the first output of each prediction at 0.5: below -> 0, else 1.
res = [0 if scores[0] < 0.5 else 1 for scores in lstm_prediction]

In [None]:
print(confusion_matrix(y_test, res))

In [None]:
# Reload the saved model and tokenizer from disk, as an inference script would.
load_model = keras.models.load_model("model.h5")
# NOTE: pickle.load can execute arbitrary code; only unpickle files that this
# notebook (or another trusted source) produced.
with open('tokenizer.pickle', 'rb') as handle:
 load_tokenizer = pickle.load(handle)

In [None]:
# Let's test our model on custom data.
test = 'humans are idiots'


def clean_text(text):
    """Clean a single piece of custom text for inference, printing progress.

    Applies the same steps as the training-time cleaning: lower-casing, then
    removal of bracketed text, URLs, HTML-like tags, ``@mentions``,
    punctuation, newlines and tokens containing digits, followed by English
    stop-word removal and Snowball stemming. The raw input and the text
    after the regex stage are printed for inspection.

    Args:
        text: Raw input text (passed through ``str()`` first).

    Returns:
        The cleaned text, with tokens joined by single spaces.
    """
    # These were previously read as globals that are never defined at module
    # level, which raised NameError; build them here instead.
    stopword = set(stopwords.words('english'))
    stemmer = nltk.SnowballStemmer("english")

    print(text)
    text = str(text).lower()
    # Raw strings keep the regex escapes from being treated as invalid string
    # escape sequences.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Strip @mentions so inference input is cleaned like the training data.
    text = re.sub(r'@\w+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    print(text)
    text = " ".join(
        [word for word in text.split(' ') if word not in stopword])
    text = " ".join([stemmer.stem(word) for word in text.split(' ')])
    return text


# Clean the custom sentence and wrap it in a list: the tokenizer expects a
# collection of texts.
test = [clean_text(test)]
print(test)
seq = load_tokenizer.texts_to_sequences(test)
# Pad to the same length the model was trained with rather than repeating
# the literal value.
padded = pad_sequences(seq, maxlen=max_len)
print(seq)
pred = load_model.predict(padded)
print("pred", pred)
# predict returns one row per input with a single sigmoid output; take the
# scalar explicitly instead of branching on the array itself.
hate_probability = float(pred[0][0])
if hate_probability < 0.5:
    print("no hate")
else:
    print("hate and abusive")

In [None]:
model.summary()

In [None]:
# Loops forever without doing any work, so this cell never finishes.
# NOTE(review): presumably meant to keep the notebook session busy/alive —
# confirm the intent. As written it busy-waits and occupies a CPU core;
# sleeping inside the loop would block the same way at far lower cost.
while True:
 pass

In [None]:
# https://www.kaggle.com/soumyaprabhamaiti/hate-speech-classification/edit