SMS-spam-English-sklearn

Runtime error

File size: 1,933 Bytes

a8dbb61
 
 
 
 
 
8678eea
a8dbb61
 
 
 
 
2f60bb1
 
 
 
 
 
a8dbb61
 
 
 
 
8678eea
a8dbb61
 
 
 
 
8678eea
a8dbb61
 
 
 
 
 
 
 
 
 
8678eea
 
a8dbb61
 
8678eea
a8dbb61

from sklearn.feature_extraction.text import TfidfVectorizer
import string
# import for loading python objects (scikit-learn models)
import pickle
import nltk
from nltk.data import load
from nltk.stem import PorterStemmer
import streamlit as st
import sklearn

# Fetch NLTK's 'punkt' tokenizer data when the script starts; the tokenizer
# function below calls nltk.word_tokenize, which needs that data installed.
# NOTE(review): recent NLTK releases look up a 'punkt_tab' resource instead --
# confirm the resource name against the NLTK version this app is deployed with.
nltk.download('punkt')

def custom_tokenizer_with_English_stemmer(text):
    """Split *text* into NLTK word tokens and return their Porter stems.

    Each token is lower-cased before it is passed to the module-level
    ``stemmerEN`` stemmer. No punctuation stripping or other filtering is
    applied, so every token produced by ``nltk.word_tokenize`` yields exactly
    one stem.

    NOTE(review): nothing in this file calls this function directly;
    presumably it is the ``tokenizer`` the pickled TfidfVectorizer was fitted
    with, in which case unpickling needs it importable under this exact
    name -- confirm before renaming or moving it.

    Args:
        text: The raw message text to tokenize.

    Returns:
        A list of stemmed, lower-cased tokens in their original order.
    """
    return [stemmerEN.stem(token.lower()) for token in nltk.word_tokenize(text)]

def _load_pickle(path):
    """Unpickle and return the object stored in the file at *path*."""
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file, so these model files must only ever come from a trusted source.
    # The context manager closes the handle even when unpickling raises.
    with open(path, "rb") as file_handle:
        return pickle.load(file_handle)


def predictSMSdata(test_text):
    """Classify a single SMS message as ``"legitimate"`` or ``"spam"``.

    Loads the pickled classifier and the pickled TF-IDF vectorizer from the
    current working directory, vectorizes *test_text*, and maps the
    classifier's prediction onto a label name. The label is also printed to
    stdout.

    Args:
        test_text: The raw message text to classify.

    Returns:
        ``"legitimate"`` or ``"spam"``.

    Raises:
        OSError: If either model file cannot be opened.
        pickle.UnpicklingError: If a model file is not a valid pickle.
    """
    # The prediction is used as an index into the alphabetically sorted labels.
    # NOTE(review): assumes the classifier emits 0 for "legitimate" and 1 for
    # "spam" -- confirm against the training code.
    categories = sorted(["legitimate", "spam"])

    # Load the classifier first, then the vectorizer used to transform the text.
    classifier = _load_pickle("LinearSVC_SMS_spam_EN.pickle")
    tfidf_vectorizer = _load_pickle("tfidf_vectorizer_EN.pickle")

    # The vectorizer works on a collection of documents, so wrap the message.
    features = tfidf_vectorizer.transform([test_text])
    predicted = classifier.predict(features)

    label = categories[predicted[0]]
    print(label)
    return label

# Shared Porter stemmer instance for English, read by
# custom_tokenizer_with_English_stemmer.
stemmerEN = PorterStemmer()

# Sample message pre-filled in the text area, so a prediction is shown as soon
# as the page loads.
default_value = "ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE MINS. INDIA CUST SERVs SED YES. L8ER GOT MEGA BILL. 3 DONT GIV A SHIT. BAILIFF DUE IN DAYS. I O £250 3 WANT £800"
text = st.text_area("enter some text!", default_value)

# Classify only non-empty input; clearing the text area shows no result.
if text:
    out = predictSMSdata(text)
    st.write(f"The category of SMS = {out.upper()}")