|
import streamlit as st |
|
from transformers import pipeline |
|
from textblob import TextBlob |
|
|
|
# Build a Hugging Face pipeline for the 'summarization' task.
# NOTE(review): `pipe` is not referenced anywhere else in the visible script —
# confirm whether it is still needed.
pipe = pipeline(task='summarization')

# Page heading of the Streamlit app.
st.title("Spamd: Turkish Spam Detector")
|
|
|
|
|
"""Spamd_SpamDetector_Turkish_BERT_22.09.2022.ipynb |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/1QuorqAuLsmomesZHsaQHEZgzbPEM8YTH |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
import csv |
|
# Empty module-level list.
# NOTE(review): nothing in the visible script reads or fills it — confirm it is still needed.
data = list()
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
|
|
|
# Read the SMS corpus: semicolon-separated, UTF-8, keeping only the label
# ('Group') and text ('Message') columns; malformed rows are skipped.
df = pd.read_csv(
    'TurkishSMSCollection.csv',
    sep=';',
    encoding='utf-8',
    usecols=['Group', 'Message'],
    on_bad_lines='skip',
)

# Re-code label value 2 as 0 (the remaining label values are left untouched).
df['Group'] = df['Group'].replace(to_replace=2, value=0)

# Dump the frame to stdout.
print(df)

# Raw message texts as an array.
text = df['Message'].values
# Bare expression: the value is discarded when run as a script.
len(text)

# Matching class labels as an array.
labels = df['Group'].values
# Bare expression: the value is discarded when run as a script.
len(labels)
|
|
|
|
|
|
|
from transformers import AutoTokenizer |
|
# Tokenizer of the same Turkish BERT checkpoint that is loaded for the classifier below.
tokenizer = AutoTokenizer.from_pretrained(
    "dbmdz/bert-base-turkish-uncased",
)

import os

# Set before any CUDA work is done.
# NOTE(review): presumably a debugging aid (synchronous CUDA launches) — confirm it is wanted in production.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch

# Accumulators filled per sample by the encoding loop further down.
token_id, attention_masks = [], []
|
|
|
def preprocessing(input_text, tokenizer, max_length=32):
    '''
    Encode one text sample into fixed-length model inputs.

    Args:
        input_text: the raw message to encode.
        tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
        max_length: length every encoded sequence is padded or truncated to.
            Defaults to 32, the value that used to be hard-coded.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns. For Hugging Face tokenizers
        this is a <class transformers.tokenization_utils_base.BatchEncoding>
        with the following fields:
          - input_ids: list of token ids
          - token_type_ids: list of token type ids
          - attention_mask: list of indices (0,1) specifying which tokens
            should be considered by the model (return_attention_mask = True).
    '''
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=max_length,
        # `pad_to_max_length=True` is deprecated. Request padding and
        # truncation explicitly so every sample comes back with exactly
        # `max_length` positions instead of relying on an implicit fallback.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
|
|
|
|
|
# Encode every message and collect its token ids and attention mask.
for sample in text:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids'])
    attention_masks.append(encoding_dict['attention_mask'])

# Join the per-sample tensors along dimension 0
# (assumes each one has a leading dimension of 1 — TODO confirm).
token_id = torch.cat(token_id, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Labels as a tensor so they can be indexed alongside the encodings.
labels = torch.tensor(labels)
|
|
|
|
|
|
|
|
|
import random |
|
import numpy as np |
|
from tabulate import tabulate |
|
def print_rand_sentence_encoding():
    '''
    Display tokens, token IDs and attention mask of a random text sample.

    Reads the module-level `text`, `tokenizer`, `token_id` and
    `attention_masks`, and prints one table row per encoded position.
    '''
    index = random.randint(0, len(text) - 1)

    token_ids = token_id[index].tolist()
    attention = attention_masks[index].tolist()

    # Map the ids straight back to vocabulary tokens so there is exactly one
    # token per id. Decoding to a string and tokenizing that string again is
    # not guaranteed to yield the same number of pieces, which would make the
    # three columns ragged.
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    table = np.array([tokens, token_ids, attention]).T
    print(tabulate(table,
                   headers=['Tokens', 'Token IDs', 'Attention Mask'],
                   tablefmt='fancy_grid'))


# Show one sample as a sanity check of the encoding step.
print_rand_sentence_encoding()
|
|
|
|
|
from sklearn.model_selection import train_test_split |
|
from torch.utils.data import Dataset, TensorDataset |
|
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler |
|
|
|
|
|
# Share of the samples held out for validation.
val_ratio = 0.2

# Mini-batch size used by both data loaders.
batch_size = 32

# Split sample *indices* (not the tensors themselves) into train/validation,
# shuffled and stratified on the labels.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size=val_ratio,
    shuffle=True,
    stratify=labels,
)

# Each dataset bundles (token ids, attention mask, label) for its index subset.
train_set = TensorDataset(
    *(tensor[train_idx] for tensor in (token_id, attention_masks, labels))
)
val_set = TensorDataset(
    *(tensor[val_idx] for tensor in (token_id, attention_masks, labels))
)

# Training batches are drawn with a RandomSampler ...
train_dataloader = DataLoader(
    train_set,
    batch_size=batch_size,
    sampler=RandomSampler(train_set),
)

# ... validation batches with a SequentialSampler.
validation_dataloader = DataLoader(
    val_set,
    batch_size=batch_size,
    sampler=SequentialSampler(val_set),
)
|
|
|
def b_tp(preds, labels):
    '''Returns True Positives (TP): number of positions where the prediction is 1 and matches the label.'''
    count = 0
    for predicted, actual in zip(preds, labels):
        count += predicted == actual and predicted == 1
    return count
|
|
|
def b_fp(preds, labels):
    '''Returns False Positives (FP): number of positions where the prediction is 1 but differs from the label.'''
    count = 0
    for predicted, actual in zip(preds, labels):
        count += predicted != actual and predicted == 1
    return count
|
|
|
def b_tn(preds, labels):
    '''Returns True Negatives (TN): number of positions where the prediction is 0 and matches the label.'''
    count = 0
    for predicted, actual in zip(preds, labels):
        count += predicted == actual and predicted == 0
    return count
|
|
|
def b_fn(preds, labels):
    '''Returns False Negatives (FN): number of positions where the prediction is 0 but differs from the label.'''
    count = 0
    for predicted, actual in zip(preds, labels):
        count += predicted != actual and predicted == 0
    return count
|
|
|
def b_metrics(preds, labels):
    '''
    Compute batch-level classification metrics.

    `preds` is reduced to class indices with an argmax over axis 1 and
    `labels` is flattened before the confusion counts are taken.

    Returns the tuple (accuracy, precision, recall, specificity) where
      - accuracy    = (TP + TN) / N
      - precision   = TP / (TP + FP)
      - recall      = TP / (TP + FN)
      - specificity = TN / (TN + FP)
    Any ratio whose denominator is not positive is returned as the string
    'nan' instead of a number.
    '''
    def _ratio(numerator, denominator):
        # The string sentinel marks an undefined ratio.
        return numerator / denominator if denominator > 0 else 'nan'

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    tp, tn = b_tp(predicted, actual), b_tn(predicted, actual)
    fp, fn = b_fp(predicted, actual), b_fn(predicted, actual)

    accuracy = (tp + tn) / len(actual)
    return accuracy, _ratio(tp, tp + fp), _ratio(tp, tp + fn), _ratio(tn, tn + fp)
|
|
|
from transformers import AutoModel |
|
|
|
|
|
|
|
from transformers import BertForSequenceClassification, AdamW, BertConfig |
|
|
|
from tqdm import trange

# Turkish BERT with a fresh 2-class classification head; attention maps and
# hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the selected device. The previous unconditional
# `model.cuda()` failed on CPU-only machines even though every batch is
# already sent to `device`, which falls back to 'cpu'.
model.to(device)

# Created after the model has been placed on its device.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-08,
)
|
|
|
|
|
# Number of passes over the training set.
epochs = 5

for _ in trange(epochs, desc='Epoch'):

    # ---------- Training ----------
    model.train()

    # Running totals for this epoch.
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        # Move the whole batch to the model's device before unpacking it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        optimizer.zero_grad()

        # Passing the labels makes the model output carry a loss.
        train_output = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )

        train_output.loss.backward()
        optimizer.step()

        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ---------- Validation ----------
    model.eval()

    # Per-batch metric values for this epoch.
    val_accuracy = []
    val_precision = []
    val_recall = []
    val_specificity = []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No labels are passed and no gradients are tracked here.
        with torch.no_grad():
            eval_output = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
            )

        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)

        # b_metrics reports an undefined ratio as the string 'nan';
        # such batches are left out of the corresponding average.
        if b_precision != 'nan':
            val_precision.append(b_precision)
        if b_recall != 'nan':
            val_recall.append(b_recall)
        if b_specificity != 'nan':
            val_specificity.append(b_specificity)

    # Epoch summary: mean training loss and the mean of the per-batch metrics.
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))

    if len(val_precision) > 0:
        print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)))
    else:
        print('\t - Validation Precision: NaN')

    if len(val_recall) > 0:
        print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)))
    else:
        print('\t - Validation Recall: NaN')

    if len(val_specificity) > 0:
        print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)))
    else:
        print('\t - Validation Specificity: NaN')
|
|
|
|
|
def namestr(obj, namespace):
    '''Return the keys of `namespace` whose value is the very same object as `obj` (identity, not equality).'''
    matches = []
    for name in namespace:
        if namespace[name] is obj:
            matches.append(name)
    return matches
|
|
|
def predict(new_sentence):
    '''
    Classify one message with the fine-tuned model.

    The message is encoded with `preprocessing`, run through the module-level
    `model` on `device`, and the highest-scoring class is mapped to a label.
    The input and the predicted class are also printed to stdout.

    Args:
        new_sentence: the raw message text.

    Returns:
        'Spam' when the highest-scoring class is 1, otherwise 'Normal'.
        (Earlier versions returned nothing, so callers could not show the result.)
    '''
    encoding = preprocessing(new_sentence, tokenizer)
    test_ids = encoding['input_ids']
    test_attention_mask = encoding['attention_mask']

    # Forward pass without gradient tracking.
    with torch.no_grad():
        output = model(
            test_ids.to(device),
            token_type_ids=None,
            attention_mask=test_attention_mask.to(device),
        )

    # A single sample is passed in, so take the argmax of its one row of logits.
    predicted_class = int(np.argmax(output.logits.cpu().numpy(), axis=1)[0])
    prediction = 'Spam' if predicted_class == 1 else 'Normal'

    print('Input', namestr(new_sentence, globals()),': \n', new_sentence)
    print('Predicted Class: ', prediction,'\n----------------------------------\n')

    return prediction


st.subheader("Enter the text you'd like to analyze for spam.")

text = st.text_input('Enter text')

# Classify only once something has been typed, and render the result in the
# page: print() output only reaches the server's console, not the app.
if text.strip():
    st.write('Predicted Class: ', predict(text))
|
|
|
|
|
''' |
|
@software{stefan_schweter_2020_3770924, |
|
author = {Stefan Schweter}, |
|
title = {BERTurk - BERT models for Turkish}, |
|
month = apr, |
|
year = 2020, |
|
publisher = {Zenodo}, |
|
version = {1.0.0}, |
|
doi = {10.5281/zenodo.3770924}, |
|
url = {https://doi.org/10.5281/zenodo.3770924} |
|
} |
|
''' |