soumyaprabhamaiti committed on
Commit
3abbcfd
1 Parent(s): 5ce506c

Add development folder

development/hate-speech-classification.ipynb ADDED
@@ -0,0 +1,815 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "c99a9e2c",
+ "metadata": {},
+ "source": [
+ "# Import the necessary libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bb19171c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import pickle\n",
+ "import re\n",
+ "import string\n",
+ "from collections.abc import Iterable\n",
+ "\n",
+ "import keras\n",
+ "import matplotlib.pyplot as plt\n",
+ "import nltk\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
+ "from keras.layers import (LSTM, Activation, Dense, Dropout, Embedding, Input,\n",
+ " SpatialDropout1D)\n",
+ "from keras.models import Model, Sequential\n",
+ "from keras.optimizers import RMSprop\n",
+ "from keras.preprocessing import sequence\n",
+ "from keras.preprocessing.text import Tokenizer\n",
+ "from keras.utils import pad_sequences, to_categorical\n",
+ "from nltk.corpus import stopwords\n",
+ "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
+ "from sklearn.metrics import confusion_matrix\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "nltk.download('stopwords')\n",
+ "pd.set_option('display.max_rows', None)\n",
+ "pd.set_option('display.max_columns', None)\n",
+ "pd.set_option('display.max_colwidth', 255)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "77ee39a1",
+ "metadata": {},
+ "source": [
+ "# Dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2289c89e",
+ "metadata": {},
+ "source": [
+ "## Dataset 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "70bddc47",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1 = pd.read_csv(\"/kaggle/input/twitter-hate-speech/train_E6oV3lV.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e407435d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4ea10f67",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.countplot(x='label', data=df1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4bef62c7",
+ "metadata": {},
+ "source": [
+ "From the above plot we can see that the classes are imbalanced; we will address this later by merging in a second dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "252edcb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check the shape of the data\n",
+ "df1.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e256090",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check whether any null values are present in the dataset\n",
+ "df1.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8d0cc255",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop unnecessary columns\n",
+ "df1.drop('id', axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "963f8229",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5767e166",
+ "metadata": {},
+ "source": [
+ "## Dataset 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bd8dde1a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2 = pd.read_csv(\n",
+ " \"/kaggle/input/hate-speech-and-offensive-language-dataset/labeled_data.csv\")\n",
+ "df2.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a8a4a332",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b66a6907",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "49db9d8d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop the columns that we do not need\n",
+ "df2.drop(['Unnamed: 0', 'count', 'hate_speech',\n",
+ " 'offensive_language', 'neither'], axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "48981e64",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "97b0500b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# All the unique class labels\n",
+ "df2['class'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "71971d95",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot the class counts for the second dataset\n",
+ "sns.countplot(x='class', data=df2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1ce30639",
+ "metadata": {},
+ "source": [
+ "- class 0: hate speech\n",
+ "- class 1: offensive language\n",
+ "- class 2: neither"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce04999f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Merge classes 0 and 1 into class 1, which now represents hate/offensive speech\n",
+ "df2[\"class\"] = df2[\"class\"].replace({0: 1})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "499d5336",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2[\"class\"].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2cb91824",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.countplot(x=\"class\", data=df2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9bf7ba3a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Replace 2 with 0; class 0 now represents \"no hate\"\n",
+ "df2[\"class\"] = df2[\"class\"].replace({2: 0})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "16bc2c3e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.countplot(x='class', data=df2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5834f0e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Rename 'class' to 'label' so the column matches df1\n",
+ "df2.rename(columns={'class': 'label'}, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e6a6a19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b76458f2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df2.iloc[0]['tweet']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "42a65071",
+ "metadata": {},
+ "source": [
+ "## Merge df1 and df2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "77c925a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.concat([df1, df2])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b81eef43",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "952ef123",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sns.countplot(x='label', data=df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "608c3277",
+ "metadata": {},
+ "source": [
+ "After merging the two datasets, the class imbalance is largely resolved."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "293d0d21",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4d8117e1",
+ "metadata": {},
+ "source": [
+ "## Data cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e76a3db9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Clean a tweet: lowercase, strip bracketed text, URLs, HTML tags, mentions,\n",
+ "# punctuation, newlines and digit-containing words, then remove stopwords and stem.\n",
+ "def clean_text(words: str) -> str:\n",
+ " words = str(words).lower()\n",
+ " words = re.sub(r'\\[.*?\\]', '', words)\n",
+ " words = re.sub(r'https?://\\S+|www\\.\\S+', '', words)\n",
+ " words = re.sub(r'<.*?>+', '', words)\n",
+ " words = re.sub(r'@\\w+', '', words)\n",
+ " words = re.sub('[%s]' % re.escape(string.punctuation), '', words)\n",
+ " words = re.sub(r'\\n', '', words)\n",
+ " words = re.sub(r'\\w*\\d\\w*', '', words)\n",
+ "\n",
+ " stopword = set(stopwords.words('english'))\n",
+ " words = ' '.join(\n",
+ " [word for word in words.split(' ') if word not in stopword])\n",
+ "\n",
+ " stemmer = nltk.SnowballStemmer(\"english\")\n",
+ " words = ' '.join([stemmer.stem(word) for word in words.split(' ')])\n",
+ "\n",
+ " return words"
+ ]
+ },
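+ {
+ "cell_type": "markdown",
+ "id": "ad0c1e11",
+ "metadata": {},
+ "source": [
+ "A quick sanity check of `clean_text` on a made-up sample string (not from the dataset), showing the lowercasing, URL/mention stripping, stopword removal, and stemming in one place."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad0c1e12",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative input only; the string below is invented for demonstration.\n",
+ "sample = 'Check this OUT @someone https://example.com <b>wow</b>!!!'\n",
+ "print(clean_text(sample))"
+ ]
+ },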
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fd98ec5a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Apply clean_text to every tweet\n",
+ "df_cleaned = df.copy()\n",
+ "df_cleaned['tweet'] = df['tweet'].apply(clean_text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b5c6a309",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_cleaned['tweet'][1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3df4b3e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_cleaned.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "39e9dff5",
+ "metadata": {},
+ "source": [
+ "## Train test split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "060e1f76",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x = df_cleaned['tweet']\n",
+ "y = df_cleaned['label']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5b39fbd9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the data into train and test sets (default 75/25 split)\n",
+ "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)\n",
+ "print(len(x_train), len(y_train))\n",
+ "print(len(x_test), len(y_test))"
+ ]
+ },
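+ {
+ "cell_type": "markdown",
+ "id": "ad0c1e13",
+ "metadata": {},
+ "source": [
+ "Since the labels are not perfectly balanced even after merging, a stratified split keeps the label ratio the same in train and test. An optional variant of the call above (same scikit-learn API, one extra argument):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad0c1e14",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional: stratify on the labels so both splits share the same class ratio\n",
+ "x_train, x_test, y_train, y_test = train_test_split(\n",
+ " x, y, random_state=42, stratify=y)"
+ ]
+ },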
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "29be47f4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "type(x_test), type(y_test), type(x_train), type(y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "402ecb50",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "len(x_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0187c473",
+ "metadata": {},
+ "source": [
+ "## Tokenization and padding"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc49a7f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Convert texts to integer id sequences and pad/truncate them to max_len\n",
+ "def tokenize_and_pad(text_list: Iterable[str], tokenizer: Tokenizer, max_len: int) -> np.ndarray:\n",
+ " sequences = tokenizer.texts_to_sequences(text_list)\n",
+ " sequences_matrix = pad_sequences(sequences, maxlen=max_len)\n",
+ " return sequences_matrix"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e4329001",
+ "metadata": {
+ "lines_to_next_cell": 2
+ },
+ "outputs": [],
+ "source": [
+ "max_words = 50000\n",
+ "max_len = 300\n",
+ "\n",
+ "tokenizer = Tokenizer(num_words=max_words)\n",
+ "tokenizer.fit_on_texts(x_train)\n",
+ "\n",
+ "x_train_tokenized = tokenize_and_pad(x_train, tokenizer, max_len)"
+ ]
+ },
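+ {
+ "cell_type": "markdown",
+ "id": "ad0c1e15",
+ "metadata": {},
+ "source": [
+ "A small sanity check of the fitted tokenizer on an invented input: `texts_to_sequences` maps each known word to its integer id (words outside the fitted vocabulary are dropped, since no `oov_token` is set), and `pad_sequences` left-pads with zeros up to `max_len`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad0c1e16",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative input only\n",
+ "demo = tokenize_and_pad(['love the people'], tokenizer, max_len)\n",
+ "print(demo.shape) # (1, 300)\n",
+ "print(demo[0, -5:]) # token ids sit at the end; leading zeros are padding"
+ ]
+ },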
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "21261eee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open('tokenizer.pickle', 'wb') as handle:\n",
+ " pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5833c859",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x_train_tokenized"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "811f8996",
+ "metadata": {},
+ "source": [
+ "# Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b42ceb66",
+ "metadata": {},
+ "source": [
+ "## Model architecture"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "15e9d814",
+ "metadata": {
+ "lines_to_next_cell": 2
+ },
+ "outputs": [],
+ "source": [
+ "# Create the model architecture: embedding -> spatial dropout -> LSTM -> sigmoid\n",
+ "model = Sequential()\n",
+ "model.add(Embedding(max_words, 100, input_length=max_len))\n",
+ "model.add(SpatialDropout1D(0.2))\n",
+ "model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))\n",
+ "model.add(Dense(1, activation='sigmoid'))\n",
+ "\n",
+ "model.summary()\n",
+ "\n",
+ "model.compile(loss='binary_crossentropy',\n",
+ " optimizer=RMSprop(), metrics=['accuracy'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ae55985d",
+ "metadata": {},
+ "source": [
+ "## Callbacks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9065382d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "early_stopping_callback = EarlyStopping(\n",
+ " monitor='val_loss', # Metric to monitor (e.g., validation loss)\n",
+ " patience=3, # Number of epochs with no improvement to wait\n",
+ " restore_best_weights=True # Restore model weights to the best achieved during training\n",
+ ")"
+ ]
+ },
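+ {
+ "cell_type": "markdown",
+ "id": "ad0c1e17",
+ "metadata": {},
+ "source": [
+ "`ModelCheckpoint` is imported above but never used. A minimal sketch of how it could sit alongside early stopping; the checkpoint filename here is an assumption, not something the notebook defines."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad0c1e18",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the weights with the best validation loss to a hypothetical path\n",
+ "checkpoint_callback = ModelCheckpoint(\n",
+ " 'model_best.h5', # assumed filename; adjust as needed\n",
+ " monitor='val_loss',\n",
+ " save_best_only=True\n",
+ ")\n",
+ "# To use it, pass callbacks=[early_stopping_callback, checkpoint_callback] to model.fit"
+ ]
+ },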
+ {
+ "cell_type": "markdown",
+ "id": "90fb2dbf",
+ "metadata": {},
+ "source": [
+ "## Training\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fb3a5153",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Train the model; early stopping monitors the 20% validation split\n",
+ "history = model.fit(\n",
+ " x_train_tokenized, y_train,\n",
+ " batch_size=128,\n",
+ " epochs=20,\n",
+ " validation_split=0.2,\n",
+ " callbacks=[early_stopping_callback]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b509694a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.save(\"model.h5\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "01484e53",
+ "metadata": {},
+ "source": [
+ "## Evaluation and testing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "86a6cd51",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reuse the helper defined above instead of repeating the tokenize/pad steps\n",
+ "test_sequences_matrix = tokenize_and_pad(x_test, tokenizer, max_len)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7674863a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Model evaluation; evaluate returns [loss, accuracy]\n",
+ "accr = model.evaluate(test_sequences_matrix, y_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "03f93f02",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lstm_prediction = model.predict(test_sequences_matrix)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b04a6f5",
+ "metadata": {
+ "lines_to_next_cell": 2
+ },
+ "outputs": [],
+ "source": [
+ "# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels\n",
+ "res = []\n",
+ "for prediction in lstm_prediction:\n",
+ " if prediction[0] < 0.5:\n",
+ " res.append(0)\n",
+ " else:\n",
+ " res.append(1)"
+ ]
+ },
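+ {
+ "cell_type": "markdown",
+ "id": "ad0c1e19",
+ "metadata": {},
+ "source": [
+ "The same thresholding can be done in one vectorized line; this is equivalent to the loop above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad0c1e20",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Vectorized equivalent of the loop above\n",
+ "res = (lstm_prediction[:, 0] >= 0.5).astype(int).tolist()"
+ ]
+ },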
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "20ec485c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(confusion_matrix(y_test, res))"
+ ]
+ },
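+ {
+ "cell_type": "markdown",
+ "id": "ad0c1e21",
+ "metadata": {},
+ "source": [
+ "Per-class precision, recall and F1 give a fuller picture than the raw confusion matrix. `classification_report` comes from `sklearn.metrics` (it is not imported at the top, so the import is repeated here); the class names below are the ones this notebook assigns to labels 0 and 1."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad0c1e22",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import classification_report\n",
+ "\n",
+ "print(classification_report(y_test, res, target_names=['no hate', 'hate/abusive']))"
+ ]
+ },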
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0062900e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Use distinct names to avoid shadowing keras.models.load_model\n",
+ "loaded_model = keras.models.load_model(\"model.h5\")\n",
+ "with open('tokenizer.pickle', 'rb') as handle:\n",
+ " loaded_tokenizer = pickle.load(handle)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5612cac0",
+ "metadata": {
+ "lines_to_next_cell": 2
+ },
+ "outputs": [],
+ "source": [
+ "# Let's test our model on custom data.\n",
+ "test = 'humans are idiots'\n",
+ "\n",
+ "\n",
+ "# Same cleaning as at training time; stopword and stemmer are defined here\n",
+ "# because the earlier clean_text kept them as locals.\n",
+ "stopword = set(stopwords.words('english'))\n",
+ "stemmer = nltk.SnowballStemmer(\"english\")\n",
+ "\n",
+ "\n",
+ "def clean_text(text):\n",
+ " text = str(text).lower()\n",
+ " text = re.sub(r'\\[.*?\\]', '', text)\n",
+ " text = re.sub(r'https?://\\S+|www\\.\\S+', '', text)\n",
+ " text = re.sub(r'<.*?>+', '', text)\n",
+ " text = re.sub(r'@\\w+', '', text)\n",
+ " text = re.sub('[%s]' % re.escape(string.punctuation), '', text)\n",
+ " text = re.sub(r'\\n', '', text)\n",
+ " text = re.sub(r'\\w*\\d\\w*', '', text)\n",
+ " text = [word for word in text.split(' ') if word not in stopword]\n",
+ " text = \" \".join(text)\n",
+ " text = [stemmer.stem(word) for word in text.split(' ')]\n",
+ " text = \" \".join(text)\n",
+ " return text\n",
+ "\n",
+ "\n",
+ "test = [clean_text(test)]\n",
+ "print(test)\n",
+ "seq = loaded_tokenizer.texts_to_sequences(test)\n",
+ "padded = pad_sequences(seq, maxlen=300)\n",
+ "print(seq)\n",
+ "pred = loaded_model.predict(padded)\n",
+ "print(\"pred\", pred)\n",
+ "if pred < 0.5:\n",
+ " print(\"no hate\")\n",
+ "else:\n",
+ " print(\"hate and abusive\")"
+ ]
+ },
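+ {
+ "cell_type": "markdown",
+ "id": "ad0c1e23",
+ "metadata": {},
+ "source": [
+ "A small helper that wraps the cleaning, tokenization, padding and thresholding above into a single call. This is just a sketch built from objects already defined in this notebook (`clean_text`, `loaded_tokenizer`, `loaded_model`); the name `predict_hate` and the example sentence are made up."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad0c1e24",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def predict_hate(text: str) -> str:\n",
+ " # Clean, tokenize and pad exactly as above, then threshold at 0.5\n",
+ " cleaned = clean_text(text)\n",
+ " padded = pad_sequences(\n",
+ " loaded_tokenizer.texts_to_sequences([cleaned]), maxlen=300)\n",
+ " prob = loaded_model.predict(padded)[0][0]\n",
+ " return 'no hate' if prob < 0.5 else 'hate and abusive'\n",
+ "\n",
+ "\n",
+ "print(predict_hate('have a wonderful day'))"
+ ]
+ },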
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d90fb1eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.summary()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e564ae3e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Intentional infinite loop, presumably to keep the Kaggle session alive;\n",
+ "# remove this cell when running the notebook end to end.\n",
+ "while True:\n",
+ " pass"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "41301aee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://www.kaggle.com/soumyaprabhamaiti/hate-speech-classification/edit"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
development/requirements_dev.txt ADDED
@@ -0,0 +1,8 @@
+ tensorflow
+ numpy
+ pandas
+ seaborn
+ matplotlib
+ gradio
+ nltk
+ scikit-learn
+ jupytext