import streamlit as st
from transformers import pipeline
from textblob import TextBlob

# NOTE(review): neither `pipe` nor `TextBlob` is referenced again in the
# visible part of this file; the pipeline is still constructed here because
# doing so is a module-level side effect other code may rely on.
pipe = pipeline(task='summarization')

st.title("Spamd: Turkish Spam Detector")

# Header text carried over from the notebook export. It is a bare string
# expression, not a module docstring (it is not the first statement).
# NOTE(review): with Streamlit "magic" enabled a bare string like this is
# presumably rendered on the page - confirm that is intended.
"""Spamd_SpamDetector_Turkish_BERT_22.09.2022.ipynb
Original file is located at
https://colab.research.google.com/drive/1QuorqAuLsmomesZHsaQHEZgzbPEM8YTH
"""
import csv
import pandas as pd

data = []

# Read only the two columns the classifier needs from the semicolon-separated
# SMS collection; malformed rows are skipped instead of aborting the load.
df = pd.read_csv(
    'TurkishSMSCollection.csv',
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=['Group', 'Message'],
    sep=';',
)

# Rows tagged with group 2 become class 0, so class 1 is the label that
# predict() reports as 'Spam' and everything else is reported as 'Normal'.
df['Group'] = df['Group'].replace(2, 0)
print(df)

# Raw message strings and their integer labels, in matching row order.
# (The bare `len(text)` / `len(labels)` notebook expressions that used to
# follow these assignments had no effect in a script and were removed.)
text = df.Message.values
labels = df.Group.values
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")

import os

# Debugging aid: ask CUDA to run kernel launches synchronously. Kept ahead of
# the torch import, exactly as in the original statement order.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch

# Accumulators for the per-message encodings built by the encoding step.
token_id, attention_masks = [], []
def preprocessing(input_text, tokenizer, max_length=32):
    """Encode one text into fixed-length model inputs.

    Args:
        input_text: The raw message to encode.
        tokenizer: Any object exposing a Hugging Face style ``encode_plus``.
        max_length: Length every sequence is padded or truncated to
            (defaults to 32, the value previously hard-coded here).

    Returns:
        Whatever ``tokenizer.encode_plus`` returns. The callers in this file
        read the ``'input_ids'`` and ``'attention_mask'`` entries, requested
        as PyTorch tensors via ``return_tensors='pt'``.
    """
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=max_length,
        # `pad_to_max_length=True` is deprecated; this is its documented
        # replacement and pads every sample to exactly `max_length`.
        padding='max_length',
        # Ask for truncation explicitly instead of relying on the implicit
        # fallback, so over-long messages cannot yield longer tensors that
        # would break the later torch.cat over all samples.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
# Encode every message once, then stack the per-sample (1, max_length)
# tensors into one tensor per field.
_encodings = [preprocessing(sample, tokenizer) for sample in text]
token_id = torch.cat([enc['input_ids'] for enc in _encodings], dim=0)
attention_masks = torch.cat([enc['attention_mask'] for enc in _encodings], dim=0)

# Labels become a tensor so they can be indexed alongside the inputs.
labels = torch.tensor(labels)

import random
import numpy as np
from tabulate import tabulate
def print_rand_sentence_encoding():
    """Print tokens, token ids and attention mask of one random text sample.

    Reads the module-level ``text``, ``token_id``, ``attention_masks`` and
    ``tokenizer``; prints a table to stdout and returns nothing.
    """
    index = random.randint(0, len(text) - 1)
    token_ids = token_id[index].tolist()
    attention = attention_masks[index].tolist()
    # Map each id straight back to its vocabulary token. Re-tokenizing the
    # decoded text (the previous approach) is not guaranteed to give exactly
    # one token per id, which would make the three columns ragged.
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    table = np.array([tokens, token_ids, attention]).T
    print(tabulate(table,
                   headers=['Tokens', 'Token IDs', 'Attention Mask'],
                   tablefmt='fancy_grid'))
# Sanity check: show how one random message was encoded.
print_rand_sentence_encoding()

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

val_ratio = 0.2
batch_size = 32

# Split sample *indices* (not the tensors themselves), stratified on the label
# so both subsets keep the same class balance.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size=val_ratio,
    shuffle=True,
    stratify=labels,
)

# Each dataset yields (input ids, attention mask, label) triples.
_fields = (token_id, attention_masks, labels)
train_set = TensorDataset(*(field[train_idx] for field in _fields))
val_set = TensorDataset(*(field[val_idx] for field in _fields))

# Training batches are drawn in random order, validation batches in order.
train_dataloader = DataLoader(
    train_set,
    sampler=RandomSampler(train_set),
    batch_size=batch_size,
)
validation_dataloader = DataLoader(
    val_set,
    sampler=SequentialSampler(val_set),
    batch_size=batch_size,
)
def b_tp(preds, labels):
    """Count true positives: samples predicted as class 1 whose label matches.

    Args:
        preds: Iterable of predicted class ids.
        labels: Iterable of true class ids, aligned with ``preds``.

    Returns:
        The number of positions where the prediction is 1 and equals the label.
    """
    # Distinct loop names: the old comprehension reused `preds`/`labels` and
    # shadowed the parameters it was iterating over.
    return sum(pred == 1 and label == pred for pred, label in zip(preds, labels))
def b_fp(preds, labels):
    """Count false positives: samples predicted as class 1 with another label.

    Args:
        preds: Iterable of predicted class ids.
        labels: Iterable of true class ids, aligned with ``preds``.

    Returns:
        The number of positions where the prediction is 1 but differs from
        the label.
    """
    # Distinct loop names: the old comprehension reused `preds`/`labels` and
    # shadowed the parameters it was iterating over.
    return sum(pred == 1 and label != pred for pred, label in zip(preds, labels))
def b_tn(preds, labels):
    """Count true negatives: samples predicted as class 0 whose label matches.

    Args:
        preds: Iterable of predicted class ids.
        labels: Iterable of true class ids, aligned with ``preds``.

    Returns:
        The number of positions where the prediction is 0 and equals the label.
    """
    # Distinct loop names: the old comprehension reused `preds`/`labels` and
    # shadowed the parameters it was iterating over.
    return sum(pred == 0 and label == pred for pred, label in zip(preds, labels))
def b_fn(preds, labels):
    """Count false negatives: samples predicted as class 0 with another label.

    Args:
        preds: Iterable of predicted class ids.
        labels: Iterable of true class ids, aligned with ``preds``.

    Returns:
        The number of positions where the prediction is 0 but differs from
        the label.
    """
    # Distinct loop names: the old comprehension reused `preds`/`labels` and
    # shadowed the parameters it was iterating over.
    return sum(pred == 0 and label != pred for pred, label in zip(preds, labels))
def b_metrics(preds, labels):
    """Compute binary classification metrics for one batch.

    Args:
        preds: Per-class scores; the predicted class is the argmax along
            axis 1.
        labels: Array of true class ids (flattened before use).

    Returns:
        A tuple ``(accuracy, precision, recall, specificity)`` where
          - accuracy    = (TP + TN) / N
          - precision   = TP / (TP + FP)
          - recall      = TP / (TP + FN)
          - specificity = TN / (TN + FP)
        Any ratio whose denominator is zero is returned as the string
        ``'nan'``, which callers test for before averaging.
    """
    def _ratio(numerator, denominator):
        # The string sentinel (not float('nan')) is what callers compare to.
        return numerator / denominator if denominator > 0 else 'nan'

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    tp = b_tp(predicted, actual)
    tn = b_tn(predicted, actual)
    fp = b_fp(predicted, actual)
    fn = b_fn(predicted, actual)

    return (
        (tp + tn) / len(actual),
        _ratio(tp, tp + fp),
        _ratio(tp, tp + fn),
        _ratio(tn, tn + fp),
    )
from transformers import AutoModel
# NOTE(review): AutoModel, AdamW and BertConfig are not used in the visible
# part of this file (the optimizer below is torch.optim.AdamW) - confirm
# whether these imports are still needed.
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Turkish BERT checkpoint with a fresh two-class classification head.
model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# Use the GPU when one is available and fall back to the CPU otherwise. The
# previous unconditional `model.cuda()` raised on CPU-only machines even
# though the rest of the script already selects its device this way.
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-08,
)
from tqdm import trange

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 5

# NOTE(review): this fine-tuning runs at module level. Streamlit re-executes
# the script on user interaction, so training presumably repeats every time -
# consider caching or loading a saved model instead.
for _ in trange(epochs, desc='Epoch'):

    # ---------------- training pass ----------------
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        train_output = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        train_output.loss.backward()
        optimizer.step()

        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ---------------- validation pass ----------------
    model.eval()
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            eval_output = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
            )

        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)

        val_accuracy.append(b_accuracy)
        # b_metrics reports an undefined ratio as the string 'nan'; such
        # batches are left out of the corresponding average.
        if b_precision != 'nan':
            val_precision.append(b_precision)
        if b_recall != 'nan':
            val_recall.append(b_recall)
        if b_specificity != 'nan':
            val_specificity.append(b_specificity)

    # ---------------- per-epoch report (means over batches) ----------------
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy) / len(val_accuracy)))

    if val_precision:
        print('\t - Validation Precision: {:.4f}'.format(sum(val_precision) / len(val_precision)))
    else:
        print('\t - Validation Precision: NaN')

    if val_recall:
        print('\t - Validation Recall: {:.4f}'.format(sum(val_recall) / len(val_recall)))
    else:
        print('\t - Validation Recall: NaN')

    if val_specificity:
        print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity) / len(val_specificity)))
    else:
        print('\t - Validation Specificity: NaN')
def namestr(obj, namespace):
    """Return the names in ``namespace`` that are bound to ``obj`` itself.

    Args:
        obj: The object to look for (matched by identity, not equality).
        namespace: A mapping of names to objects, e.g. ``globals()``.

    Returns:
        A list of matching names in the mapping's iteration order; empty when
        nothing in the mapping is that exact object.
    """
    # Iterate items() rather than looking every key up a second time.
    return [name for name, value in namespace.items() if value is obj]
def predict(new_sentence):
    """Classify one message as 'Spam' or 'Normal'.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The input
    and the verdict are printed to stdout as before; the verdict is now also
    returned so a caller can display it.

    Args:
        new_sentence: The text to classify.

    Returns:
        ``'Spam'`` when the highest-scoring class is 1, otherwise ``'Normal'``.
    """
    encoding = preprocessing(new_sentence, tokenizer)
    # The encoding already carries a leading batch dimension of one sample,
    # so the tensors can be fed to the model as they are.
    test_ids = encoding['input_ids']
    test_attention_mask = encoding['attention_mask']

    with torch.no_grad():
        output = model(
            test_ids.to(device),
            token_type_ids=None,
            attention_mask=test_attention_mask.to(device),
        )

    predicted_class = output.logits.argmax(dim=-1).item()
    prediction = 'Spam' if predicted_class == 1 else 'Normal'

    # namestr() lists the global variable names bound to this exact object.
    print('Input', namestr(new_sentence, globals()),': \n', new_sentence)
    print('Predicted Class: ', prediction,'\n----------------------------------\n')
    return prediction
st.subheader("Enter the text you'd like to analyze for spam.")
text = st.text_input('Enter text')

# The input box is empty until the user types something; skip blank input
# instead of running the classifier on nothing.
if text.strip():
    predicted_label = predict(text)
    # predict() reports its verdict on stdout, which the page never shows.
    # When it hands a label back, surface it in the app as well.
    if predicted_label is not None:
        st.write('Predicted class:', predicted_label)

# Citation for the pretrained BERTurk checkpoint, kept verbatim.
'''
@software{stefan_schweter_2020_3770924,
author = {Stefan Schweter},
title = {BERTurk - BERT models for Turkish},
month = apr,
year = 2020,
publisher = {Zenodo},
version = {1.0.0},
doi = {10.5281/zenodo.3770924},
url = {https://doi.org/10.5281/zenodo.3770924}
}
'''