NimaKL committed
Commit 6671142 · 1 Parent(s): 7912a62

Update app.py

Files changed (1):
  1. app.py +10 -230

app.py CHANGED
@@ -4,6 +4,9 @@ from textblob import TextBlob
 import spacy
 from spacytextblob.spacytextblob import SpacyTextBlob
 
+
+ pipeline = pipeline(model="NimaKL/spamd")
+
 st.set_page_config(layout='wide', initial_sidebar_state='expanded')
 st.title("Spamd: Turkish Spam Detector")
 st.markdown("Enter the text you'd like to analyze for spam.")
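Note: as added, `pipeline = pipeline(model="NimaKL/spamd")` rebinds the name `pipeline` to its own return value, and no `from transformers import pipeline` is visible in this hunk, so the line would raise a NameError at startup. A minimal sketch of the usual Hugging Face idiom, assuming NimaKL/spamd is a text-classification checkpoint on the Hub:

    # Sketch, not the commit's code: import the factory and give the result a distinct name.
    from transformers import pipeline

    classifier = pipeline("text-classification", model="NimaKL/spamd")
    print(classifier("Tebrikler, buyuk odul kazandiniz!"))  # hypothetical sample message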
@@ -15,41 +18,15 @@ Original file is located at
 https://colab.research.google.com/drive/1QuorqAuLsmomesZHsaQHEZgzbPEM8YTH
 """
 
- #Cuda and PyTorch Versions must match https://pytorch.org/get-started/locally/
-
-
-
- import csv
- data = []
- # with open('TurkishSMSCollection.csv', "rt", encoding="utf-8") as csvfile:
- #   reader = csv.reader(csvfile, skipinitialspace=True)
- #   data.append(tuple(next(reader)))
- #   for Message, Group in reader:
- #     data.append((int(Group), Message))
- import pandas as pd
-
-
- df = pd.read_csv('TurkishSMSCollection.csv', encoding='utf-8', on_bad_lines='skip', usecols=['Group', 'Message'], sep=r';')
- df['Group'] = df['Group'].replace(2, 0)
-
- # reader = open('TurkishSMSCollection.csv', "rt", encoding="utf-8") as csvfile
- print(df)
-
- text = df.Message.values
- len(text)
-
- labels = df.Group.values
- len(labels)
 
 
+ import torch
+ import numpy as np
 
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
+ model = torch.load("drive/MyDrive/Models/spamd")
 
- import os
- os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
-
- import torch
 token_id = []
 attention_masks = []
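Note: the added `model = torch.load("drive/MyDrive/Models/spamd")` points at a Google Drive path from the Colab this script was exported from; that file will not exist on a Hugging Face Space, and unpickling a whole model with `torch.load` ties the checkpoint to the exact class definitions it was saved with. A sketch of the more portable route, under the assumption that the fine-tuned weights were pushed to the Hub as NimaKL/spamd:

    # Sketch only: load the fine-tuned classifier from the Hub instead of a Colab Drive path.
    import torch
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
    model = AutoModelForSequenceClassification.from_pretrained("NimaKL/spamd")  # assumes Hub-hosted weights
    model.eval()  # inference only; this commit removes the training loop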
@@ -69,203 +46,7 @@ def preprocessing(input_text, tokenizer):
     return_tensors = 'pt'
   )
 
-
- for sample in text:
-   encoding_dict = preprocessing(sample, tokenizer)
-   token_id.append(encoding_dict['input_ids'])
-   attention_masks.append(encoding_dict['attention_mask'])
-
-
- token_id = torch.cat(token_id, dim = 0)
- attention_masks = torch.cat(attention_masks, dim = 0)
- labels = torch.tensor(labels)
-
-
-
-
- import random
- import numpy as np
- from tabulate import tabulate
- def print_rand_sentence_encoding():
-   '''Displays tokens, token IDs and attention mask of a random text sample'''
-   index = random.randint(0, len(text) - 1)
-   tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
-   token_ids = [i.numpy() for i in token_id[index]]
-   attention = [i.numpy() for i in attention_masks[index]]
-
-   table = np.array([tokens, token_ids, attention]).T
-   print(tabulate(table,
-                  headers = ['Tokens', 'Token IDs', 'Attention Mask'],
-                  tablefmt = 'fancy_grid'))
-
- print_rand_sentence_encoding()
-
-
- from sklearn.model_selection import train_test_split
- from torch.utils.data import Dataset, TensorDataset
- from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
-
-
- val_ratio = 0.2
- # Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
- batch_size = 32
-
- # Indices of the train and validation splits, stratified by labels
- train_idx, val_idx = train_test_split(
-   np.arange(len(labels)),
-   test_size = val_ratio,
-   shuffle = True,
-   stratify = labels)
-
- # Train and validation sets
- train_set = TensorDataset(token_id[train_idx],
-                           attention_masks[train_idx],
-                           labels[train_idx])
-
- val_set = TensorDataset(token_id[val_idx],
-                         attention_masks[val_idx],
-                         labels[val_idx])
-
- # Prepare DataLoader
- train_dataloader = DataLoader(
-   train_set,
-   sampler = RandomSampler(train_set),
-   batch_size = batch_size
- )
-
- validation_dataloader = DataLoader(
-   val_set,
-   sampler = SequentialSampler(val_set),
-   batch_size = batch_size
- )
-
- def b_tp(preds, labels):
-   '''Returns True Positives (TP): count of correct predictions of actual class 1'''
-   return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])
-
- def b_fp(preds, labels):
-   '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
-   return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])
-
- def b_tn(preds, labels):
-   '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
-   return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])
-
- def b_fn(preds, labels):
-   '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
-   return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])
-
- def b_metrics(preds, labels):
-   '''
-   Returns the following metrics:
-     - accuracy    = (TP + TN) / N
-     - precision   = TP / (TP + FP)
-     - recall      = TP / (TP + FN)
-     - specificity = TN / (TN + FP)
-   '''
-   preds = np.argmax(preds, axis = 1).flatten()
-   labels = labels.flatten()
-   tp = b_tp(preds, labels)
-   tn = b_tn(preds, labels)
-   fp = b_fp(preds, labels)
-   fn = b_fn(preds, labels)
-   b_accuracy = (tp + tn) / len(labels)
-   b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
-   b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
-   b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
-   return b_accuracy, b_precision, b_recall, b_specificity
-
- from transformers import AutoModel
-
- #!pip install torch.utils
-
- from transformers import BertForSequenceClassification, AdamW, BertConfig
-
- model = BertForSequenceClassification.from_pretrained(
-   "dbmdz/bert-base-turkish-uncased",
-   num_labels = 2,
-   output_attentions = False,
-   output_hidden_states = False)
-
- optimizer = torch.optim.AdamW(model.parameters(),
-                               lr = 5e-5,
-                               eps = 1e-08
-                               )
-
- # Run on GPU
- model.cuda()
-
- from tqdm import trange
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
- # Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
- epochs = 5
-
- for _ in trange(epochs, desc = 'Epoch'):
-
-   # ========== Training ==========
-
-   # Set model to training mode
-   model.train()
-
-   # Tracking variables
-   tr_loss = 0
-   nb_tr_examples, nb_tr_steps = 0, 0
-
-   for step, batch in enumerate(train_dataloader):
-     batch = tuple(t.to(device) for t in batch)
-     b_input_ids, b_input_mask, b_labels = batch
-     optimizer.zero_grad()
-     # Forward pass
-     train_output = model(b_input_ids,
-                          token_type_ids = None,
-                          attention_mask = b_input_mask,
-                          labels = b_labels)
-     # Backward pass
-     train_output.loss.backward()
-     optimizer.step()
-     # Update tracking variables
-     tr_loss += train_output.loss.item()
-     nb_tr_examples += b_input_ids.size(0)
-     nb_tr_steps += 1
-
-   # ========== Validation ==========
-
-   # Set model to evaluation mode
-   model.eval()
-
-   # Tracking variables
-   val_accuracy = []
-   val_precision = []
-   val_recall = []
-   val_specificity = []
-
-   for batch in validation_dataloader:
-     batch = tuple(t.to(device) for t in batch)
-     b_input_ids, b_input_mask, b_labels = batch
-     with torch.no_grad():
-       # Forward pass
-       eval_output = model(b_input_ids,
-                           token_type_ids = None,
-                           attention_mask = b_input_mask)
-     logits = eval_output.logits.detach().cpu().numpy()
-     label_ids = b_labels.to('cpu').numpy()
-     # Calculate validation metrics
-     b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
-     val_accuracy.append(b_accuracy)
-     # Update precision only when (tp + fp) != 0; ignore nan
-     if b_precision != 'nan': val_precision.append(b_precision)
-     # Update recall only when (tp + fn) != 0; ignore nan
-     if b_recall != 'nan': val_recall.append(b_recall)
-     # Update specificity only when (tn + fp) != 0; ignore nan
-     if b_specificity != 'nan': val_specificity.append(b_specificity)
-
-   print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
-   print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy) / len(val_accuracy)))
-   print('\t - Validation Precision: {:.4f}'.format(sum(val_precision) / len(val_precision)) if len(val_precision) > 0 else '\t - Validation Precision: NaN')
-   print('\t - Validation Recall: {:.4f}'.format(sum(val_recall) / len(val_recall)) if len(val_recall) > 0 else '\t - Validation Recall: NaN')
-   print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity) / len(val_specificity)) if len(val_specificity) > 0 else '\t - Validation Specificity: NaN')
-
 # Used for printing the names of the variables. Removing it will not interrupt the project.
 def namestr(obj, namespace):
   return [name for name in namespace if namespace[name] is obj]
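Note: this hunk strips the entire training pipeline (CSV loading, stratified split, DataLoaders, metric helpers, and the five-epoch loop) out of the app, leaving only what inference needs. For reference, a minimal sketch of single-sentence inference using the pieces the file keeps (`preprocessing`, `tokenizer`, `model`, `device`); the names follow the surrounding code, but this is not the commit's code:

    # Hedged sketch: classify one message with the retained helpers.
    def predict_label(sentence):
        enc = preprocessing(sentence, tokenizer)  # dict with 'input_ids' and 'attention_mask' tensors
        with torch.no_grad():
            out = model(enc['input_ids'].to(device),
                        token_type_ids=None,
                        attention_mask=enc['attention_mask'].to(device))
        return 'Spam' if out.logits.argmax(dim=1).item() == 1 else 'Normal'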
@@ -291,15 +72,14 @@ def predict(new_sentence):
   prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
 
 
-   print('Input', namestr(new_sentence, globals()), ': \n', new_sentence)
+   st.header('Input', namestr(new_sentence, globals()), ': \n', new_sentence)
   # Remove the namestr(new_sentence, globals()) in case of an error
-   print('Predicted Class: ', prediction, '\n----------------------------------\n')
-
-
-
+   st.header('Predicted Class: ', prediction, '\n----------------------------------\n')
+
 predict(text)
 
 
+
 '''
 @software{stefan_schweter_2020_3770924,
   author = {Stefan Schweter},
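Note: two runtime problems remain in the new file. `st.header` takes a single body string (plus keyword arguments), not print-style varargs, so both added calls would raise a TypeError; and `text` was deleted along with the data-loading block, so `predict(text)` would raise a NameError. A sketch of the likely intent, with `user_text` as a hypothetical Streamlit input variable:

    # Sketch only: feed the classifier from a Streamlit widget instead of the removed 'text'.
    user_text = st.text_input("Message to check")
    if user_text:
        predict(user_text)
    # Inside predict(), single-string headers render correctly, e.g.:
    #   st.header(f'Predicted Class: {prediction}')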
 