neelsahu committed on
Commit
f3d8098
·
1 Parent(s): 0c01d2e
__pycache__/clean.cpython-39.pyc ADDED
Binary file (1.12 kB). View file
 
__pycache__/language_detection.cpython-39.pyc ADDED
Binary file (2.3 kB). View file
 
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio.components import Text
3
+ import joblib
4
+ import clean
5
+
6
+ import numpy as np
7
+ import language_detection
8
# Startup trace: reaching this line means every import above succeeded.
print("all imports worked")
# Load the pre-trained classifier. The path is relative, so it is resolved
# against the working directory of the process.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts that come from a trusted source.
model = joblib.load('model_joblib.pkl')
print("model load ")
# Load the fitted text vectorizer used to turn cleaned text into features.
# presumably a TF-IDF vectorizer (going by the log message) - confirm.
tf = joblib.load('tf_joblib.pkl')
print("tfidf load ")
14
+
15
+ # Define function to predict whether sentence is abusive or not
16
def predict_abusive_lang(text):
    """Classify a comment as abusive or not abusive.

    Args:
        text: Raw comment text entered by the user.

    Returns:
        A two-item list ``[result, cleaned_text]`` feeding the two output
        text boxes: a human-readable verdict, and the cleaned text that was
        given to the model (``"No cleaned text"`` when no prediction ran).
    """
    print("original text ", text)

    # Blank input contains no words. Answer right away instead of running
    # language detection, which divides by the number of words it finds.
    if text is None or not text.strip():
        return ["Please write something in the comment box..", "No cleaned text"]

    lang = language_detection.en_hi_detection(text)
    print("language detected ", lang)

    if lang == 'hi':
        print("using hugging face api")
        return ["Hindi Text abusive part coming soon.....", "No cleaned text"]
    if lang != 'eng':
        return ["Unknown language", "No cleaned text"]

    cleaned_text = clean.text_cleaning(text)
    # Log the cleaned string itself (the raw input was printed here before).
    print("cleaned text ", cleaned_text)

    # Keep the feature matrix under its own name rather than rebinding `text`.
    features = tf.transform([cleaned_text])
    print("tfidf transformation ", features)
    prediction = model.predict(features)
    print("prediction ", prediction)

    # `len(...)` is used on purpose: the prediction is array-like and its
    # truthiness must not be evaluated directly.
    if len(prediction) != 0 and prediction[0] == 0:
        return ["Not Abusive", cleaned_text]
    if len(prediction) != 0 and prediction[0] == 1:
        return ["Abusive", cleaned_text]
    return ["Please write something in the comment box..", "No cleaned text"]
40
+
41
+
42
+ # text = '":::::: 128514 - & % ! @ # $ % ^ & * ( ) _ + I got blocked for 30 minutes, you got blocked for more than days. You is lost. www.google.com, #happydiwali, @amangupta And I don\'t even know who the fuck are you. It\'s a zero! \n"'
43
+ # predict_abusive_lang(text)
44
+
45
# Define the Gradio output components: the verdict and the cleaned text.
# `gr.Textbox` is used instead of the deprecated `gr.outputs.Textbox` alias,
# which newer Gradio releases no longer provide.
output_interfaces = [
    gr.Textbox(label="Result"),
    gr.Textbox(label="Cleaned text"),
]
app = gr.Interface(
    predict_abusive_lang,
    inputs='text',
    outputs=output_interfaces,
    title="Abuse Classifier",
    description="Enter a sentence and the model will predict whether it is abusive or not.",
)
# Start the Gradio app
app.launch()
clean.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from string import punctuation
2
+ import re
3
+
4
def text_cleaning(text: str) -> str:
    """Normalise a raw comment into a lowercase, space-separated word string.

    Steps, in order:
      1. Delete every run of non-whitespace that begins at ``http`` or
         ``www`` (the patterns are not anchored to word starts), and every
         double quote.
      2. Drop words that start with ``@`` or ``#`` (mentions, hashtags).
      3. Strip leading/trailing ASCII punctuation from each word.
      4. Drop words that are empty after stripping or start with a digit.
      5. Lowercase what is left.

    Args:
        text: Raw input text.

    Returns:
        The surviving words joined by single spaces; ``""`` if none survive.
    """
    without_urls = re.sub(r'http\S+|www\S+|\"', '', text)

    cleaned = []
    for token in without_urls.split():
        if token.startswith(('@', '#')):
            continue
        core = token.strip(punctuation)
        # An empty core means the token was pure punctuation, so no separate
        # punctuation-only filter is needed before stripping.
        if not core or core[0].isdigit():
            continue
        cleaned.append(core.lower())

    return " ".join(cleaned)
language_detection.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import nltk
3
+ from nltk.corpus import wordnet
4
+ import re
5
+ from nltk.stem import WordNetLemmatizer
6
+
7
# English function words. en_hi_detection counts a word as English when its
# lemma appears here, even if the WordNet vocabulary lookup misses it.
# Entry order is unchanged.
stop_words = [
    # pronouns and possessives
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd",
    'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom',
    'this', 'that', "that'll", 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    # articles and conjunctions
    'a', 'an', 'the',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    # adverbs and quantifiers
    'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    # contraction fragments
    'd', 'll', 'm', 'o', 're', 've', 'y',
    # negated auxiliaries
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't",
    'haven', "haven't", 'isn', "isn't",
    'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",
    'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
]
186
# Shared WordNet lemmatizer, created once at import time and reused by
# en_hi_detection for every word it checks.
lemmatizer = WordNetLemmatizer()
188
+
189
+ #from english_words import get_english_words_set
190
+ #web2lowerset = get_english_words_set(['web2'], lower=True)
191
+
192
+ # Define the Unicode range for Hindi letters
193
+ HINDI_UNICODE_RANGE = (0x0900, 0x097F)
194
+
195
+ # Function to check if a given character is a Hindi letter
196
+ def is_hindi_letter(c):
197
+ return ord(c) >= HINDI_UNICODE_RANGE[0] and ord(c) <= HINDI_UNICODE_RANGE[1]
198
+
199
+
200
+ # In[8]:
201
+
202
+
203
+
204
# Lazily built set of every WordNet lemma name; filled on first use so that
# the corpus is scanned once instead of once per membership test.
_WORDNET_VOCABULARY = None


def _wordnet_vocabulary():
    """Return all WordNet lemma names as a frozenset, building it only once."""
    global _WORDNET_VOCABULARY
    if _WORDNET_VOCABULARY is None:
        _WORDNET_VOCABULARY = frozenset(wordnet.words())
    return _WORDNET_VOCABULARY


def _is_english_word(word, vocabulary):
    """Tell whether any part-of-speech lemma of ``word`` is known English."""
    for pos in (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV):
        lemma = lemmatizer.lemmatize(word, pos)
        if lemma in vocabulary or lemma in stop_words:
            return True
    return False


def en_hi_detection(text):
    """Guess whether ``text`` is English, Hindi (Devanagari script) or neither.

    The text is split into lowercase words after punctuation is replaced by
    spaces. A word counts as English when one of its noun/verb/adjective/
    adverb lemmas is a WordNet lemma name or a stop word. A word counts as
    Hindi when it contains at least one character accepted by
    ``is_hindi_letter``.

    Args:
        text: Raw input text.

    Returns:
        ``"eng"`` if more than 75% of the words are English; otherwise
        ``"hi"`` if more than 75% of the words are Hindi; otherwise
        ``"unknown"``. Text without any word yields ``"unknown"``.
    """
    # Replace punctuation by spaces. Devanagari code points are kept
    # explicitly: `\w` does not match combining vowel signs or the virama,
    # so without this Hindi words would be broken into fragments. The danda
    # and double danda (U+0964, U+0965) are sentence punctuation and are
    # still removed.
    text = re.sub(r'[^\w\s\u0900-\u0963\u0966-\u097F]', ' ', text)

    words = text.lower().strip().split()
    if not words:
        # Nothing to classify; also avoids dividing by zero below.
        return "unknown"

    vocabulary = _wordnet_vocabulary()
    count_en = sum(1 for word in words if _is_english_word(word, vocabulary))
    count_hi = sum(
        1 for word in words if any(is_hindi_letter(c) for c in word)
    )

    total = len(words)
    # English is tested first, so it wins if both ratios pass the threshold.
    if count_en / total * 100 > 75:
        return "eng"
    if count_hi / total * 100 > 75:
        return "hi"
    return "unknown"
246
+
model_joblib.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6308a9d0d4eb28b3ea67bc20a2e200218a9ca2c12b2fc8e17027536d1147d20f
3
+ size 318919
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ scikit-learn==1.0.2
2
+ nltk==3.8.1
3
+ joblib==1.0.1
4
+
tf_joblib.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e53104db442b78f814eab3c2d081f6fc06279a4bdec6cfaea81c8221447f5dd3
3
+ size 1441403