Spaces:

NimaKL
/

spamd

Build error

App Files Files Community

NimaKL committed on Oct 3, 2022

Commit

6671142

1 Parent(s): 7912a62

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -230

app.py CHANGED Viewed

@@ -4,6 +4,9 @@ from textblob import TextBlob
 import spacy
 from spacytextblob.spacytextblob import SpacyTextBlob
 st.set_page_config(layout='wide', initial_sidebar_state='expanded')
 st.title("Spamd: Turkish Spam Detector")
 st.markdown("Enter the text you'd like to analyze for spam.")
@@ -15,41 +18,15 @@ Original file is located at
     https://colab.research.google.com/drive/1QuorqAuLsmomesZHsaQHEZgzbPEM8YTH
 """
-#Cuda and  PyTorch Versions must match https://pytorch.org/get-started/locally/
-import csv
-data = []
-# with open('TurkishSMSCollection.csv', "rt", encoding="utf-8") as csvfile:
-    # reader = csv.reader(csvfile, skipinitialspace=True)
-    # data.append(tuple(next(reader)))
-    # for Message, Group in reader:
-    #     data.append((int(Group), Message))
-import pandas as pd
-df = pd.read_csv('TurkishSMSCollection.csv', encoding='utf-8', on_bad_lines='skip', usecols= ['Group','Message'], sep=r';')
-df['Group']= df['Group'].replace(2, 0)
-# reader = open('TurkishSMSCollection.csv', "rt", encoding="utf-8") as csvfile
-print(df)
-text = df.Message.values
-len(text)
-labels = df.Group.values
-len(labels)
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
-import os
-os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
-import torch
 token_id = []
 attention_masks = []
@@ -69,203 +46,7 @@ def preprocessing(input_text, tokenizer):
                         return_tensors = 'pt'
                    )
-for sample in text:
-  encoding_dict = preprocessing(sample, tokenizer)
-  token_id.append(encoding_dict['input_ids'])
-  attention_masks.append(encoding_dict['attention_mask'])
-token_id = torch.cat(token_id, dim = 0)
-attention_masks = torch.cat(attention_masks, dim = 0)
-labels = torch.tensor(labels)
-import random
-import numpy as np
-from tabulate import tabulate
-def print_rand_sentence_encoding():
-  '''Displays tokens, token IDs and attention mask of a random text sample'''
-  index = random.randint(0, len(text) - 1)
-  tokens = tokenizer.tokenize(tokenizer.decode(token_id[index]))
-  token_ids = [i.numpy() for i in token_id[index]]
-  attention = [i.numpy() for i in attention_masks[index]]
-  table = np.array([tokens, token_ids, attention]).T
-  print(tabulate(table,
-                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
-                 tablefmt = 'fancy_grid'))
-print_rand_sentence_encoding()
-from sklearn.model_selection import train_test_split
-from torch.utils.data import Dataset, TensorDataset
-from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
-val_ratio = 0.2
-# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
-batch_size = 32
-# Indices of the train and validation splits stratified by labels
-train_idx, val_idx = train_test_split(
-    np.arange(len(labels)),
-    test_size = val_ratio,
-    shuffle = True,
-    stratify = labels)
-# Train and validation sets
-train_set = TensorDataset(token_id[train_idx],
-                          attention_masks[train_idx],
-                          labels[train_idx])
-val_set = TensorDataset(token_id[val_idx],
-                        attention_masks[val_idx],
-                        labels[val_idx])
-# Prepare DataLoader
-train_dataloader = DataLoader(
-            train_set,
-            sampler = RandomSampler(train_set),
-            batch_size = batch_size
-        )
-validation_dataloader = DataLoader(
-            val_set,
-            sampler = SequentialSampler(val_set),
-            batch_size = batch_size
-        )
-def b_tp(preds, labels):
-  '''Returns True Positives (TP): count of correct predictions of actual class 1'''
-  return sum([preds == labels and preds == 1 for preds, labels in zip(preds, labels)])
-def b_fp(preds, labels):
-  '''Returns False Positives (FP): count of wrong predictions of actual class 1'''
-  return sum([preds != labels and preds == 1 for preds, labels in zip(preds, labels)])
-def b_tn(preds, labels):
-  '''Returns True Negatives (TN): count of correct predictions of actual class 0'''
-  return sum([preds == labels and preds == 0 for preds, labels in zip(preds, labels)])
-def b_fn(preds, labels):
-  '''Returns False Negatives (FN): count of wrong predictions of actual class 0'''
-  return sum([preds != labels and preds == 0 for preds, labels in zip(preds, labels)])
-def b_metrics(preds, labels):
-  '''
-  Returns the following metrics:
-    - accuracy    = (TP + TN) / N
-    - precision   = TP / (TP + FP)
-    - recall      = TP / (TP + FN)
-    - specificity = TN / (TN + FP)
-  '''
-  preds = np.argmax(preds, axis = 1).flatten()
-  labels = labels.flatten()
-  tp = b_tp(preds, labels)
-  tn = b_tn(preds, labels)
-  fp = b_fp(preds, labels)
-  fn = b_fn(preds, labels)
-  b_accuracy = (tp + tn) / len(labels)
-  b_precision = tp / (tp + fp) if (tp + fp) > 0 else 'nan'
-  b_recall = tp / (tp + fn) if (tp + fn) > 0 else 'nan'
-  b_specificity = tn / (tn + fp) if (tn + fp) > 0 else 'nan'
-  return b_accuracy, b_precision, b_recall, b_specificity
-from transformers import AutoModel
-#!pip install torch.utils
-from transformers import BertForSequenceClassification, AdamW, BertConfig
-model = BertForSequenceClassification.from_pretrained(
-    "dbmdz/bert-base-turkish-uncased",
-    num_labels = 2,
-    output_attentions = False,
-    output_hidden_states = False)
-optimizer = torch.optim.AdamW(model.parameters(),
-                              lr = 5e-5,
-                              eps = 1e-08
-                              )
-# Run on GPU
-model.cuda()
-from tqdm import trange
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
-epochs = 5
-for _ in trange(epochs, desc = 'Epoch'):
-    # ========== Training ==========
-    # Set model to training mode
-    model.train()
-    # Tracking variables
-    tr_loss = 0
-    nb_tr_examples, nb_tr_steps = 0, 0
-    for step, batch in enumerate(train_dataloader):
-        batch = tuple(t.to(device) for t in batch)
-        b_input_ids, b_input_mask, b_labels = batch
-        optimizer.zero_grad()
-        # Forward pass
-        train_output = model(b_input_ids,
-                             token_type_ids = None,
-                             attention_mask = b_input_mask,
-                             labels = b_labels)
-        # Backward pass
-        train_output.loss.backward()
-        optimizer.step()
-        # Update tracking variables
-        tr_loss += train_output.loss.item()
-        nb_tr_examples += b_input_ids.size(0)
-        nb_tr_steps += 1
-    # ========== Validation ==========
-    # Set model to evaluation mode
-    model.eval()
-    # Tracking variables
-    val_accuracy = []
-    val_precision = []
-    val_recall = []
-    val_specificity = []
-    for batch in validation_dataloader:
-        batch = tuple(t.to(device) for t in batch)
-        b_input_ids, b_input_mask, b_labels = batch
-        with torch.no_grad():
-          # Forward pass
-          eval_output = model(b_input_ids,
-                              token_type_ids = None,
-                              attention_mask = b_input_mask)
-        logits = eval_output.logits.detach().cpu().numpy()
-        label_ids = b_labels.to('cpu').numpy()
-        # Calculate validation metrics
-        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
-        val_accuracy.append(b_accuracy)
-        # Update precision only when (tp + fp) !=0; ignore nan
-        if b_precision != 'nan': val_precision.append(b_precision)
-        # Update recall only when (tp + fn) !=0; ignore nan
-        if b_recall != 'nan': val_recall.append(b_recall)
-        # Update specificity only when (tn + fp) !=0; ignore nan
-        if b_specificity != 'nan': val_specificity.append(b_specificity)
-    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
-    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))
-    print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)) if len(val_precision)>0 else '\t - Validation Precision: NaN')
-    print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)) if len(val_recall)>0 else '\t - Validation Recall: NaN')
-    print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)) if len(val_specificity)>0 else '\t - Validation Specificity: NaN')
 # Used for printing the names of the variables. Removing it will not interrupt the project.
 def namestr(obj, namespace):
     """Return the names in ``namespace`` whose value is ``obj`` itself.

     Matching uses identity (``is``), not equality, so a name is reported
     only when it is bound to the very same object.

     Args:
         obj: The object to look up.
         namespace: A mapping from names to objects; it is iterated by key
             and indexed with ``namespace[name]``. The visible caller
             passes ``globals()``.

     Returns:
         A list of every matching name (empty when nothing matches).
     """
     return [name for name in namespace if namespace[name] is obj]
@@ -291,15 +72,14 @@ def predict(new_sentence):
     prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
-    print('Input', namestr(new_sentence, globals()),': \n', new_sentence)
       # Remove the namestr(new_sentence, globals()) in case of an error
-    print('Predicted Class: ', prediction,'\n----------------------------------\n')
 predict(text)
 '''
 @software{stefan_schweter_2020_3770924,
   author       = {Stefan Schweter},

 import spacy
 from spacytextblob.spacytextblob import SpacyTextBlob
+pipeline = pipeline(model="NimaKL/spamd")
 st.set_page_config(layout='wide', initial_sidebar_state='expanded')
 st.title("Spamd: Turkish Spam Detector")
 st.markdown("Enter the text you'd like to analyze for spam.")
     https://colab.research.google.com/drive/1QuorqAuLsmomesZHsaQHEZgzbPEM8YTH
 """
+import torch
+import numpy as np
 from transformers import AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
+model = torch.load("drive/MyDrive/Models/spamd")
 token_id = []
 attention_masks = []
                         return_tensors = 'pt'
                    )
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 # Used for printing the names of the variables. Removing it will not interrupt the project.
 def namestr(obj, namespace):
     """Return the names in ``namespace`` whose value is ``obj`` itself.

     Matching uses identity (``is``), not equality, so a name is reported
     only when it is bound to the very same object.

     Args:
         obj: The object to look up.
         namespace: A mapping from names to objects; it is iterated by key
             and indexed with ``namespace[name]``. The visible caller
             passes ``globals()``.

     Returns:
         A list of every matching name (empty when nothing matches).
     """
     return [name for name in namespace if namespace[name] is obj]
     prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
+    st.header('Input', namestr(new_sentence, globals()),': \n', new_sentence)
       # Remove the namestr(new_sentence, globals()) in case of an error
+    st.header('Predicted Class: ', prediction,'\n----------------------------------\n')
 predict(text)
 '''
 @software{stefan_schweter_2020_3770924,
   author       = {Stefan Schweter},