DataAIDemo

Sleeping

File size: 4,508 Bytes

6060e42

# Third-party and standard-library imports are interleaved with start-up
# statements below; the statements run once per execution of this script.
import numpy as np
import torch
import transformers
import streamlit as st
from streamlit import session_state
import json
import torch.nn.functional as F
import boto3
import pandas as pd
# Presumably an S3 bucket name (boto3 is imported above); it is not referenced
# anywhere in the visible part of this file -- TODO confirm it is still needed.
bucket = 'data-ai-dev2'
from transformers import BertTokenizer, BertModel
from torch import cuda
# 'cuda' when torch reports an available CUDA device, otherwise 'cpu'.
# NOTE(review): the visible pipeline(...) call does not receive this value.
device = 'cuda' if cuda.is_available() else 'cpu'
import numpy
from numpy.random import seed
# Seed NumPy's global random generator with a fixed value.
seed(1)
import emoji
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer  # PorterStemmer LancasterStemmer
from nltk.stem import WordNetLemmatizer
import re
# Not referenced in the visible part of this file.
stemmer = PorterStemmer()

# NLTK data sets requested at start-up. These calls are active, so they run on
# every execution of the script; comment them out once the data is installed
# locally if the repeated download check is unwanted.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

# Not referenced in the visible part of this file.
lemmatizer = WordNetLemmatizer()

from transformers import pipeline
# Rebinds the module-level name `stopwords` (imported from nltk.corpus above)
# to the English stop-word collection; the text-cleaning helpers test tokens
# for membership in it. The corpus reader stays reachable as
# nltk.corpus.stopwords.
stopwords = nltk.corpus.stopwords.words('english')


import os

# Directories holding the model and tokenizer files handed to the pipeline.
# The defaults are the original absolute, per-user Windows paths; set the
# CORE_RISK_MODEL_DIR / CORE_RISK_TOKENIZER_DIR environment variables to run
# the app on another machine without editing this file.
model = os.environ.get('CORE_RISK_MODEL_DIR', 'C:/Users/Meet/Downloads/core_risk/models/')
tokenizer = os.environ.get('CORE_RISK_TOKENIZER_DIR', 'C:/Users/Meet/Downloads/core_risk/tokenizer/')


from transformers import pipeline

# Text-classification pipeline called by process(). truncation=True together
# with max_length=512 caps the length of the tokenised input.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, truncation=True, max_length=512)
def pre_processing_str_esg(df_col, stop_words=None):
    """Normalise a text to lower-case ASCII letters separated by single spaces.

    Steps, in order:

    1. lower-case the text;
    2. delete every character in ``string.punctuation``;
    3. replace tokens starting with ``http`` by a space (URLs have already
       lost their punctuation in step 2, but the pattern still matches the
       remaining ``http...`` token);
    4. drop stop words (whitespace-delimited tokens);
    5. delete the pilcrow and section signs, so ``"a¶b"`` becomes ``"ab"``;
    6. replace every run of characters other than ``a-z`` (digits, non-ASCII
       letters, curly quotes, emoji, whitespace, ...) by one space;
    7. trim leading/trailing spaces.

    Compared with the previous implementation the result differs only in
    whitespace: runs of spaces are now fully collapsed and the ends trimmed
    (the old single ``'  ' -> ' '`` pass left longer runs behind). Steps that
    could never match after step 2 (a second punctuation pass, the bracket /
    hyphen replacements and the emoji pattern) were removed.

    Args:
        df_col: The text to clean (a single string, despite the name).
        stop_words: Optional iterable of lower-case tokens to drop. Defaults
            to the module-level ``stopwords`` collection.

    Returns:
        The cleaned string; ``""`` when nothing but removable content was
        present.
    """
    if stop_words is None:
        stop_words = stopwords
    # Set membership keeps the per-token check cheap for long inputs.
    stop_set = frozenset(stop_words)

    text = df_col.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"http\S+", " ", text)

    # NOTE(review): punctuation is stripped before this filter, so stop-word
    # entries that contain an apostrophe can never match a token here.
    text = " ".join(word for word in text.split() if word not in stop_set)

    # These two symbols are deleted (not spaced out) so neighbouring
    # characters are joined, exactly as before.
    text = text.replace("¶", "").replace("§", "")

    # Everything that is not a lower-case ASCII letter becomes a separator.
    text = re.sub(r"[^a-z]+", " ", text)
    return text.strip()

def pre_processing_str(df_col):
    """Clean a raw text before it is passed to the classifier.

    Inputs of 70 or more whitespace-separated tokens are delegated to
    ``pre_processing_str_esg``. Shorter inputs keep their original letter
    case and are cleaned as follows:

    * ``#`` and ``!`` are deleted;
    * tokens starting with ``http`` and runs of digits become a space;
    * emoji are stripped with ``emoji.replace_emoji`` (library default
      replacement);
    * ``@mentions`` (and any ``http(s)://`` token still present) are deleted;
    * every whitespace run becomes one space, then characters outside
      printable ASCII are deleted;
    * remaining space runs are collapsed and the ends trimmed.

    Whitespace is normalised *before* the non-printable filter because that
    filter also matches newlines and tabs: previously ``"a\\nb"`` came out as
    ``"ab"``; it now yields ``"a b"``. Space runs left behind by the removals
    are also fully collapsed now.

    Args:
        df_col: The text to clean (a single string, despite the name).

    Returns:
        The cleaned string, possibly empty.
    """
    if len(df_col.split()) >= 70:
        return pre_processing_str_esg(df_col)

    text = df_col.replace('#', '').replace('!', '')
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"[0-9]+", " ", text)
    text = emoji.replace_emoji(text)
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)

    # Newlines, tabs and other whitespace must become spaces first; the next
    # pattern would otherwise delete them and glue adjacent words together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\x20-\x7E]+", "", text)

    # Deleting characters can leave neighbouring spaces; collapse and trim.
    return " ".join(text.split())

    
# Start of the API steps. Make sure the name matches the file name and `application = Flask(__name__)`, i.e. 'application.py' and `application`.

def process(text):
    """Clean *text* and classify it.

    Args:
        text: Raw input string; it is cleaned with ``pre_processing_str``
            before classification.

    Returns:
        A dict with the single key ``'output_16'`` whose value is one of:

        * the classifier's result for the cleaned text, requested with
          ``top_k=2``;
        * the string ``'No Text'`` when nothing is left after cleaning;
        * the string ``'something went wrong'`` when the classifier raised.
    """
    # Local import so this function does not depend on the file's import block.
    import logging

    text = pre_processing_str(text)

    if not text:
        return {'output_16': 'No Text'}

    try:
        results = classifier(text, top_k=2)
    except Exception:
        # Keep the best-effort fallback for the UI, but record the traceback
        # instead of discarding it. Only Exception is caught, so
        # KeyboardInterrupt / SystemExit and other BaseException-based
        # control-flow signals are no longer swallowed.
        logging.getLogger(__name__).exception("Text classification failed")
        return {'output_16': 'something went wrong'}

    return {'output_16': results}
    
# --- Streamlit page -------------------------------------------------------
# The statements below render the widgets in the order they appear.
st.set_page_config(page_title="core_risk", page_icon="📈")
# Make sure the result slot exists so the result box below can read it before
# any classification has been run.
if 'topic_class' not in session_state:
    session_state['topic_class']= ""
    
st.title("Topic Classifier")
# Input box; `text` holds its current contents for this script run.
# NOTE(review): the label contains the typo "bellow" (user-facing string, left
# unchanged here).
text= st.text_area(label= "Please write the text bellow", 
              placeholder="What does the tweet say?")
def classify(text):
    """Run process() on *text* and store its result dict in session_state['topic_class']."""
    session_state['topic_class'] = process(text)


# Result box: shows whatever is stored in session_state['topic_class'] -- the
# empty string initially, afterwards the dict returned by process().
st.text_area("result", value=session_state['topic_class'])

# classify is registered as the click callback and is passed `text`.
# NOTE(review): `text` is the value read when this line executes -- confirm the
# callback always sees what the user typed just before clicking.
st.button("Classify", on_click=classify, args=[text])