soumyaprabhamaiti committed on
Commit 5ce506c
1 Parent(s): cc07f38

Add hate classifier app

.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.github/workflows/check_file_size.yml ADDED
@@ -0,0 +1,16 @@
+ name: Check file size
+ on: # or directly `on: [push]` to run the action on every push on any branch
+   pull_request:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   check-file-size:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Check large files
+         uses: ActionsDesk/[email protected]
+         with:
+           filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
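The workflow above rejects pull requests that add files over 10 MB, since larger files would break syncing to Hugging Face Spaces unless they are tracked with Git LFS. A rough local equivalent of that check, as a Python sketch (not part of this commit; the path and limit simply mirror the workflow):

import os

LIMIT = 10 * 1024 * 1024  # 10 MB, the same limit as the workflow above

# Walk the working tree (skipping .git) and report files over the limit.
for root, dirs, files in os.walk("."):
    dirs[:] = [d for d in dirs if d != ".git"]
    for name in files:
        path = os.path.join(root, name)
        if os.path.getsize(path) > LIMIT:
            print(f"Too large for a plain push: {path}")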
.github/workflows/sync_to_hub.yml ADDED
@@ -0,0 +1,20 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+           lfs: true
+       - name: Push to hub
+         env:
+           HF: ${{ secrets.HF }}
+         run: git push --force https://soumyaprabhamaiti:[email protected]/spaces/soumyaprabhamaiti/hate_speech_classifier main
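This workflow force-pushes main to the Space on every push to main, authenticating with a repository secret named HF that must hold a Hugging Face write token. For reference, a hedged sketch of an equivalent one-off upload from Python using huggingface_hub (not part of this commit; it assumes the same token is exported in an HF environment variable):

import os
from huggingface_hub import HfApi

# Assumes a write token is available in the HF environment variable,
# mirroring the secret name used by the workflow above.
api = HfApi(token=os.environ["HF"])
api.upload_folder(
    folder_path=".",                                     # local checkout of this repo
    repo_id="soumyaprabhamaiti/hate_speech_classifier",  # the Space targeted by the workflow
    repo_type="space",
)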
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: Hate Speech Classifier
+ emoji: 📊
+ colorFrom: gray
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 3.42.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
app.py ADDED
@@ -0,0 +1,51 @@
+ import pickle
+
+ import gradio as gr
+ import numpy as np
+ import tensorflow as tf
+
+ from utils import clean_text, tokenize_and_pad
+
+ # Load the pre-trained TensorFlow model
+ model = tf.keras.models.load_model('model.h5')
+
+ # Load the fitted tokenizer
+ with open('tokenizer.pickle', 'rb') as handle:
+     tokenizer = pickle.load(handle)
+ print(type(tokenizer))
+
+ # Constants
+ MAX_LEN = 300
+
+
+ def predict_hate_speech(text):
+     # Clean the text
+     cleaned_text = clean_text(text)
+
+
+     # Tokenize and pad the text
+     preprocessed_text = tokenize_and_pad([cleaned_text], tokenizer, MAX_LEN)
+
+     # Make a prediction (single sigmoid output: probability of hate speech)
+     prediction = model.predict(preprocessed_text)
+
+     # Two classes: "Hate" and "Not Hate"
+     if prediction[0][0] > 0.5:
+         result = "Hate"
+     else:
+         result = "Not Hate"
+
+     return result
+
+
+ # Create a Gradio interface
+ iface = gr.Interface(
+     fn=predict_hate_speech,
+     inputs=gr.Textbox(label="Input Text"),
+     outputs=gr.Textbox(label="Output Prediction"),
+     title="Hate Speech Classification",
+     description="A simple hate speech classifier. Enter a text and click submit to make a prediction."
+ )
+
+ # Run the Gradio app
+ iface.launch()
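For a quick offline check of the same pipeline that app.py wires into Gradio, the model and tokenizer can be loaded directly. A minimal sketch, assuming model.h5 and tokenizer.pickle are in the working directory (the sample sentences are made up):

import pickle
import tensorflow as tf
from utils import clean_text, tokenize_and_pad

# Rebuild the same pipeline as app.py for a quick offline smoke test.
model = tf.keras.models.load_model('model.h5')
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

for sample in ["have a wonderful day everyone", "i hate you and your whole group"]:
    x = tokenize_and_pad([clean_text(sample)], tokenizer, 300)
    label = "Hate" if model.predict(x)[0][0] > 0.5 else "Not Hate"
    print(sample, "->", label)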
model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b836d0f75bb836d9cd0cfcd0657e35cfd659a7c4085af86ddec382cfdb9275dc
+ size 40676464
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ tensorflow
+ numpy
+ gradio
+ nltk
tokenizer.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f982b4524646588f84f61f9cb9bc49998672afccc012f355af7eb787117bd1a0
+ size 1701049
utils.py ADDED
@@ -0,0 +1,38 @@
+ import re
+ import string
+ from collections.abc import Iterable
+
+ import nltk
+ import numpy as np
+ from keras.preprocessing.text import Tokenizer
+ from keras.utils import pad_sequences
+ from nltk.corpus import stopwords
+
+ nltk.download('stopwords')
+
+
+ # Regex-based cleaning, then stopword removal and Snowball stemming.
+ def clean_text(words: str) -> str:
+     words = str(words).lower()
+     words = re.sub(r'\[.*?\]', '', words)
+     words = re.sub(r'https?://\S+|www\.\S+', '', words)
+     words = re.sub(r'<.*?>+', '', words)
+     words = re.sub(r'@\w+', '', words)
+     words = re.sub('[%s]' % re.escape(string.punctuation), '', words)
+     words = re.sub('\n', '', words)
+     words = re.sub(r'\w*\d\w*', '', words)
+
+     stopword = set(stopwords.words('english'))
+     words = ' '.join(
+         [word for word in words.split(' ') if word not in stopword])
+
+     stemmer = nltk.SnowballStemmer("english")
+     words = ' '.join([stemmer.stem(word) for word in words.split(' ')])
+
+     return words
+
+
+ def tokenize_and_pad(text_list: Iterable[str], tokenizer: Tokenizer, max_len: int) -> np.ndarray:
+     sequences = tokenizer.texts_to_sequences(text_list)
+     sequences_matrix = pad_sequences(sequences, maxlen=max_len)
+     return sequences_matrix
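tokenize_and_pad expects a tokenizer that has already been fitted on the training corpus, which is what tokenizer.pickle presumably stores. A hedged sketch of how such a tokenizer could be fitted and saved (the example texts, vocabulary size, and pickle protocol are assumptions, not taken from this commit):

import pickle
from keras.preprocessing.text import Tokenizer

from utils import clean_text

# Hypothetical training texts; in practice these come from the labelled dataset.
train_texts = ["you are wonderful", "i cannot stand people like you"]
cleaned = [clean_text(t) for t in train_texts]

# Fit a word-index vocabulary on the cleaned training texts and persist it,
# so app.py can reload exactly the same mapping at inference time.
tokenizer = Tokenizer(num_words=50000)  # vocabulary size is an assumption
tokenizer.fit_on_texts(cleaned)

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)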