Spaces:

Orawan
/

Text_Classify

Sleeping

App Files Files Community

Orawan commited on Oct 6, 2023

Commit

9194c1e

1 Parent(s): 10a1e30

Upload app (2).py

Browse files

Files changed (1) hide show

app (2).py +62 -0

app (2).py ADDED Viewed

	@@ -0,0 +1,62 @@

+import streamlit as st
+from transformers import pipeline
+from textblob import TextBlob
+from transformers import BertForSequenceClassification, AdamW, BertConfig
+st.set_page_config(layout='wide', initial_sidebar_state='expanded')
+col1, col2= st.columns(2)
+with col2:
+    text = st.text_input("Enter the text you'd like to analyze for spam.")
+    aButton = st.button('Analyze')
+with col1:
+    st.title("Spamd: Turkish Spam Detector")
+    st.markdown("Message spam detection tool for Turkish language. Due the small size of the dataset, I decided to go with transformers technology Google BERT. Using the Turkish pre-trained model BERTurk, I imporved the accuracy of the tool by 18 percent compared to the previous model which used fastText.")
+    st.markdown("Original file is located at")
+    st.markdown("https://colab.research.google.com/drive/1QuorqAuLsmomesZHsaQHEZgzbPEM8YTH")
+import torch
+import numpy as np
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
+from transformers import AutoModel
+model = BertForSequenceClassification.from_pretrained("NimaKL/spamd_model")
+token_id = []
+attention_masks = []
+def preprocessing(input_text, tokenizer):
+    '''
+                  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
+                    - input_ids: list of token ids
+                    - token_type_ids: list of token type ids
+                    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
+    '''
+    return tokenizer.encode_plus(
+        input_text,
+        add_special_tokens = True,
+        max_length = 32,
+        pad_to_max_length = True,
+        return_attention_mask = True,
+        return_tensors = 'pt'
+            )
+device = 'cpu'
+def predict(new_sentence):
+    # We need Token IDs and Attention Mask for inference on the new sentence
+    test_ids = []
+    test_attention_mask = []
+    # Apply the tokenizer
+    encoding = preprocessing(new_sentence, tokenizer)
+    # Extract IDs and Attention Mask
+    test_ids.append(encoding['input_ids'])
+    test_attention_mask.append(encoding['attention_mask'])
+    test_ids = torch.cat(test_ids, dim = 0)
+    test_attention_mask = torch.cat(test_attention_mask, dim = 0)
+    # Forward pass, calculate logit predictions
+    with torch.no_grad():
+        output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))
+        prediction = 'Spam' if np.argmax(output.logits.cpu().numpy()).flatten().item() == 1 else 'Normal'
+        pred = 'Predicted Class: '+ prediction
+        return pred
+if text or aButton:
+    with col2:
+        with st.spinner('Wait for it...'):
+            st.success(predict(text))