Update app.py
Browse files
@@ -4,6 +4,9 @@ from textblob import TextBlob
4 |
import spacy
5 |
from spacytextblob.spacytextblob import SpacyTextBlob
6 |
7 |
st.set_page_config(layout='wide', initial_sidebar_state='expanded')
8 |
st.title("Spamd: Turkish Spam Detector")
9 |
st.markdown("Enter the text you'd like to analyze for spam.")
@@ -15,41 +18,15 @@ Original file is located at
15 |
16 |
17 |
18 |
#Cuda and PyTorch Versions must match https://pytorch.org/get-started/locally/
19 |
20 |
21 |
22 |
import csv
23 |
data = []
24 |
# with open('TurkishSMSCollection.csv', "rt", encoding="utf-8") as csvfile:
25 |
# reader = csv.reader(csvfile, skipinitialspace=True)
26 |
# data.append(tuple(next(reader)))
27 |
# for Message, Group in reader:
28 |
# data.append((int(Group), Message))
29 |
import pandas as pd
30 |
31 |
32 |
df = pd.read_csv('TurkishSMSCollection.csv', encoding='utf-8', on_bad_lines='skip', usecols= ['Group','Message'], sep=r';')
33 |
df['Group']= df['Group'].replace(2, 0)
34 |
35 |
# reader = open('TurkishSMSCollection.csv', "rt", encoding="utf-8") as csvfile
36 |
37 |
38 |
text = df.Message.values
39 |
40 |
41 |
labels = df.Group.values
42 |
43 |
44 |
45 |
46 |
from transformers import AutoTokenizer
47 |
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
48 |
49 |
import os
50 |
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
51 |
52 |
import torch
53 |
token_id = []
54 |
attention_masks = []
55 |
@@ -69,203 +46,7 @@ def preprocessing(input_text, tokenizer):
69 |
return_tensors = 'pt'
70 |
71 |
72 |
73 |
for sample in text:
74 |
encoding_dict = preprocessing(sample, tokenizer)
75 |
76 |
77 |
78 |
79 |
token_id = torch.cat(token_id, dim = 0)
80 |
attention_masks = torch.cat(attention_masks, dim = 0)
81 |
labels = torch.tensor(labels)
82 |
83 |
84 |
85 |
86 |
import random
87 |
import numpy as np
88 |
from tabulate import tabulate
89 |
def print_rand_sentence_encoding():
90 |
'''Displays tokens, token IDs and attention mask of a random text sample'''
91 |
index = random.randint(0, len(text) - 1)
92 |
tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
93 |
token_ids = [i.numpy() for i in token_id[index]]
94 |
attention = [i.numpy() for i in attention_masks[index]]
95 |
96 |
table = np.array([tokens, token_ids, attention]).T
97 |
98 |
headers = ['Tokens', 'Token IDs', 'Attention Mask'],
99 |
tablefmt = 'fancy_grid'))
100 |
101 |
102 |
103 |
104 |
from sklearn.model_selection import train_test_split
105 |
from torch.utils.data import Dataset, TensorDataset
106 |
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
107 |
108 |
109 |
val_ratio = 0.2
110 |
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
111 |
batch_size = 32
112 |
113 |
# Indices of the train and validation splits stratified by labels
114 |
train_idx, val_idx = train_test_split(
115 |
116 |
test_size = val_ratio,
117 |
shuffle = True,
118 |
stratify = labels)
119 |
120 |
# Train and validation sets
121 |
train_set = TensorDataset(token_id[train_idx],
122 |
123 |
124 |
125 |
val_set = TensorDataset(token_id[val_idx],
126 |
127 |
128 |
129 |
# Prepare DataLoader
130 |
train_dataloader = DataLoader(
131 |
132 |
sampler = RandomSampler(train_set),
133 |
batch_size = batch_size
134 |
135 |
136 |
validation_dataloader = DataLoader(
137 |
138 |
sampler = SequentialSampler(val_set),
139 |
batch_size = batch_size
140 |
141 |
142 |
def b_tp(preds, labels):
    '''Returns True Positives (TP): count of samples predicted as class 1 whose label is also 1.

    preds and labels are iterated in lockstep with zip, so any surplus
    elements of the longer one are ignored.
    '''
    # Distinct loop names: the original comprehension rebound preds/labels,
    # shadowing the parameters it was iterating over.
    return sum(1 for pred, label in zip(preds, labels) if pred == label and pred == 1)
145 |
146 |
def b_fp(preds, labels):
    '''Returns False Positives (FP): count of samples predicted as class 1 whose label is not 1.

    preds and labels are iterated in lockstep with zip, so any surplus
    elements of the longer one are ignored.
    '''
    # Distinct loop names: the original comprehension rebound preds/labels,
    # shadowing the parameters it was iterating over.
    return sum(1 for pred, label in zip(preds, labels) if pred != label and pred == 1)
149 |
150 |
def b_tn(preds, labels):
    '''Returns True Negatives (TN): count of samples predicted as class 0 whose label is also 0.

    preds and labels are iterated in lockstep with zip, so any surplus
    elements of the longer one are ignored.
    '''
    # Distinct loop names: the original comprehension rebound preds/labels,
    # shadowing the parameters it was iterating over.
    return sum(1 for pred, label in zip(preds, labels) if pred == label and pred == 0)
153 |
154 |
def b_fn(preds, labels):
    '''Returns False Negatives (FN): count of samples predicted as class 0 whose label is not 0.

    preds and labels are iterated in lockstep with zip, so any surplus
    elements of the longer one are ignored.
    '''
    # Distinct loop names: the original comprehension rebound preds/labels,
    # shadowing the parameters it was iterating over.
    return sum(1 for pred, label in zip(preds, labels) if pred != label and pred == 0)
157 |
158 |
def b_metrics(preds, labels):
    '''
    Returns the following metrics:
      - accuracy    = (TP + TN) / N
      - precision   = TP / (TP + FP)
      - recall      = TP / (TP + FN)
      - specificity = TN / (TN + FP)

    preds is reduced with np.argmax over axis 1 to one predicted class per
    row; labels is flattened and compared element-wise against those
    predictions. The result is the tuple
    (accuracy, precision, recall, specificity). A ratio whose denominator
    is zero is returned as the string 'nan' instead of a number.
    '''
    preds = np.argmax(preds, axis = 1).flatten()
    labels = np.asarray(labels).flatten()

    # Confusion-matrix cells computed with array operations rather than
    # four separate Python-level passes over the batch.
    correct = preds == labels
    predicted_pos = preds == 1
    predicted_neg = preds == 0
    tp = int(np.count_nonzero(correct & predicted_pos))
    tn = int(np.count_nonzero(correct & predicted_neg))
    fp = int(np.count_nonzero(~correct & predicted_pos))
    fn = int(np.count_nonzero(~correct & predicted_neg))

    b_accuracy = (tp + tn) / len(labels)
    # The 'nan' string sentinel is kept on purpose: callers filter on
    # `!= 'nan'` before averaging.
    b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
    b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
    b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
    return b_accuracy, b_precision, b_recall, b_specificity
177 |
178 |
from transformers import AutoModel
179 |
180 |
#!pip install torch.utils
181 |
182 |
from transformers import BertForSequenceClassification, AdamW, BertConfig
183 |
184 |
model = BertForSequenceClassification.from_pretrained(
185 |
186 |
num_labels = 2,
187 |
output_attentions = False,
188 |
output_hidden_states = False)
189 |
190 |
optimizer = torch.optim.AdamW(model.parameters(),
191 |
lr = 5e-5,
192 |
eps = 1e-08
193 |
194 |
195 |
# Run on GPU
196 |
197 |
198 |
from tqdm import trange
199 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
200 |
201 |
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
202 |
epochs = 5
203 |
204 |
for _ in trange(epochs, desc = 'Epoch'):
205 |
206 |
# ========== Training ==========
207 |
208 |
# Set model to training mode
209 |
210 |
211 |
# Tracking variables
212 |
tr_loss = 0
213 |
nb_tr_examples, nb_tr_steps = 0, 0
214 |
215 |
for step, batch in enumerate(train_dataloader):
216 |
batch = tuple(t.to(device) for t in batch)
217 |
b_input_ids, b_input_mask, b_labels = batch
218 |
219 |
# Forward pass
220 |
train_output = model(b_input_ids,
221 |
token_type_ids = None,
222 |
attention_mask = b_input_mask,
223 |
labels = b_labels)
224 |
# Backward pass
225 |
226 |
227 |
# Update tracking variables
228 |
tr_loss += train_output.loss.item()
229 |
nb_tr_examples += b_input_ids.size(0)
230 |
nb_tr_steps += 1
231 |
232 |
# ========== Validation ==========
233 |
234 |
# Set model to evaluation mode
235 |
236 |
237 |
# Tracking variables
238 |
val_accuracy = []
239 |
val_precision = []
240 |
val_recall = []
241 |
val_specificity = []
242 |
243 |
for batch in validation_dataloader:
244 |
batch = tuple(t.to(device) for t in batch)
245 |
b_input_ids, b_input_mask, b_labels = batch
246 |
with torch.no_grad():
247 |
# Forward pass
248 |
eval_output = model(b_input_ids,
249 |
token_type_ids = None,
250 |
attention_mask = b_input_mask)
251 |
logits = eval_output.logits.detach().cpu().numpy()
252 |
label_ids = b_labels.to('cpu').numpy()
253 |
# Calculate validation metrics
254 |
b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
255 |
256 |
# Update precision only when (tp + fp) !=0; ignore nan
257 |
if b_precision != 'nan': val_precision.append(b_precision)
258 |
# Update recall only when (tp + fn) !=0; ignore nan
259 |
if b_recall != 'nan': val_recall.append(b_recall)
260 |
# Update specificity only when (tn + fp) !=0; ignore nan
261 |
if b_specificity != 'nan': val_specificity.append(b_specificity)
262 |
263 |
print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
264 |
print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
265 |
print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
266 |
print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
267 |
print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')
268 |
269 |
# Used for printing the names of the variables. Removing it will not interrupt the project.
270 |
def namestr(obj, namespace):
    '''Returns the list of names in namespace that are bound to obj.

    Matching is by identity (`is`), not equality, so an equal but distinct
    object is not reported. namespace is a mapping of name -> object, such
    as the dict returned by globals().
    '''
    # items() gives each name and its value in one pass, avoiding a second
    # lookup per key.
    return [name for name, value in namespace.items() if value is obj]
@@ -291,15 +72,14 @@ def predict(new_sentence):
291 |
prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
292 |
293 |
294 |
295 |
# Remove the namestr(new_sentence, globals()) in case of an error
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
author = {Stefan Schweter},
4 |
import spacy
5 |
from spacytextblob.spacytextblob import SpacyTextBlob
6 |
7 |
8 |
pipeline = pipeline(model="NimaKL/spamd")
9 |
10 |
st.set_page_config(layout='wide', initial_sidebar_state='expanded')
11 |
st.title("Spamd: Turkish Spam Detector")
12 |
st.markdown("Enter the text you'd like to analyze for spam.")
18 |
19 |
20 |
21 |
22 |
23 |
import torch
24 |
import numpy as np
25 |
26 |
from transformers import AutoTokenizer
27 |
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
28 |
model = torch.load("drive/MyDrive/Models/spamd")
29 |
30 |
token_id = []
31 |
attention_masks = []
32 |
46 |
return_tensors = 'pt'
47 |
48 |
49 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
50 |
# Used for printing the names of the variables. Removing it will not interrupt the project.
51 |
def namestr(obj, namespace):
    '''Returns the list of names in namespace that are bound to obj.

    Matching is by identity (`is`), not equality, so an equal but distinct
    object is not reported. namespace is a mapping of name -> object, such
    as the dict returned by globals().
    '''
    # items() gives each name and its value in one pass, avoiding a second
    # lookup per key.
    return [name for name, value in namespace.items() if value is obj]
72 |
prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
73 |
74 |
75 |
st.header('Input', namestr(new_sentence, globals()),': \n', new_sentence)
76 |
# Remove the namestr(new_sentence, globals()) in case of an error
77 |
st.header('Predicted Class: ', prediction,'\n----------------------------------\n')
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
author = {Stefan Schweter},