|
from transformers import pipeline |
|
import numpy as np |
|
import torch |
|
import transformers |
|
import json |
|
import pandas as pd |
|
from numpy.random import seed |
|
seed(1) |
|
import emoji |
|
import string |
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.stem import PorterStemmer |
|
from nltk.stem import WordNetLemmatizer |
|
import re |
|
# Module-level NLP helpers. Neither ``stemmer`` nor ``lemmatizer`` is used by
# the functions visible in this file; they are kept as module globals.
stemmer = PorterStemmer()

# Download the NLTK resources needed by the lemmatizer and stop-word list.
for _resource in ('wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(_resource)

lemmatizer = WordNetLemmatizer()

# NOTE: this rebinds ``stopwords`` from the imported corpus module to a plain
# list of English stop words; pre_processing_str_esg reads this list.
stopwords = nltk.corpus.stopwords.words('english')
|
|
|
import gradio as gr |
|
# Patterns and tables for pre_processing_str_esg, built once at import time
# instead of on every call.
_ESG_STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)
_ESG_URL_RE = re.compile(r"http\S+")
_ESG_BAD_SYMBOLS_RE = re.compile(r"[^0-9a-z #+_]")
_ESG_DIGITS_RE = re.compile(r"[0-9]+")
_ESG_SPACE_RUN_RE = re.compile(r" {2,}")


def pre_processing_str_esg(df_col):
    """Normalise a long text into lowercase ASCII words separated by spaces.

    Despite its name, ``df_col`` is a single string: it is lower-cased and
    split directly.

    Steps, in order:

    1. Lower-case the text and delete every ASCII punctuation character.
    2. Replace each ``http...`` run (up to the next whitespace) by a space.
       URLs have already lost their punctuation at this point, so a URL is
       one contiguous run.
    3. Drop every word found in the module-level ``stopwords`` list.
    4. Delete pilcrow and section signs, then blank every character outside
       ``[0-9a-z #+_]`` and every run of digits.
    5. Collapse runs of spaces into a single space.

    Args:
        df_col: The text to clean.

    Returns:
        The cleaned string. It is not stripped, so a single leading or
        trailing space may remain.
    """
    text = df_col.lower().translate(_ESG_STRIP_PUNCTUATION)
    text = _ESG_URL_RE.sub(" ", text)

    # Set membership is O(1); the module-level list is scanned only once.
    stop_words = set(stopwords)
    text = " ".join(word for word in text.split() if word not in stop_words)

    # These two signs are deleted (not blanked), so neighbouring text is joined.
    text = text.replace("¶", "").replace("§", "")

    # ASCII punctuation (hyphens, brackets, '@', ',' ...) is already gone, and
    # this pass blanks every remaining non-ASCII character, which covers curly
    # quotes and emoji. No separate punctuation/bracket/quote/emoji passes are
    # needed.
    text = _ESG_BAD_SYMBOLS_RE.sub(" ", text)
    text = _ESG_DIGITS_RE.sub(" ", text)

    # The blanking above leaves runs of spaces behind; squeeze them.
    return _ESG_SPACE_RUN_RE.sub(" ", text)
|
|
|
# Inputs with at least this many whitespace-separated words are treated as
# long documents and routed to pre_processing_str_esg.
_LONG_TEXT_MIN_WORDS = 70

# Patterns for the short-text path, compiled once at import time.
_SHORT_URL_RE = re.compile(r"http\S+")
_SHORT_DIGITS_RE = re.compile(r"[0-9]+")
_SHORT_MENTION_OR_URL_RE = re.compile(r"(?:\@|https?\://)\S+")
_SHORT_WHITESPACE_RE = re.compile(r"\s+")
_SHORT_NON_PRINTABLE_ASCII_RE = re.compile(r"[^\x20-\x7E]+")


def pre_processing_str(df_col):
    """Clean a piece of text before it is passed to the classifier.

    Despite its name, ``df_col`` is a single string. Texts of 70 or more
    whitespace-separated words are delegated to ``pre_processing_str_esg``.
    Shorter texts keep their case and most punctuation, and are cleaned as
    follows:

    1. Delete ``#`` and ``!``.
    2. Replace ``http...`` runs and digit runs by a space.
    3. Remove emoji (``emoji.replace_emoji``) and ``@mention`` / URL tokens.
    4. Turn every whitespace run (including newlines and tabs) into a single
       space.
    5. Delete any remaining character outside printable ASCII and strip the
       ends.

    Args:
        df_col: The text to clean.

    Returns:
        The cleaned string.
    """
    if len(df_col.split()) >= _LONG_TEXT_MIN_WORDS:
        return pre_processing_str_esg(df_col)

    text = df_col.replace('#', '').replace('!', '')
    text = _SHORT_URL_RE.sub(" ", text)
    text = _SHORT_DIGITS_RE.sub(' ', text)
    text = emoji.replace_emoji(text)
    text = _SHORT_MENTION_OR_URL_RE.sub("", text)

    # Whitespace must be normalised BEFORE the ASCII filter below: that filter
    # deletes newlines and tabs outright, which would glue the words on either
    # side of a line break together.
    text = _SHORT_WHITESPACE_RE.sub(" ", text)
    text = _SHORT_NON_PRINTABLE_ASCII_RE.sub("", text)
    return text.strip()
|
# Hugging Face Hub id of the fine-tuned checkpoint served by this app.
MODEL_NAME = "dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch"

# Built once at import time so every request reuses the same loaded model.
pipe = pipeline("text-classification", model=MODEL_NAME)


def classify(text):
    """Clean ``text`` and return the classifier's two best predictions.

    Args:
        text: Raw input string; it is cleaned with ``pre_processing_str``
            before inference.

    Returns:
        A dict ``{"class": output}`` where ``output`` is whatever ``pipe``
        returns for ``top_k=2`` (presumably a list of label/score entries;
        see the transformers pipeline documentation).
    """
    cleaned = pre_processing_str(text)
    # truncation=True is forwarded to the tokenizer, so documents longer than
    # the model's maximum sequence length are cut rather than failing inside
    # the model.
    predictions = pipe(cleaned, top_k=2, truncation=True)
    return {"class": predictions}
|
# Gradio UI: one text box in, one text box out, wired to classify().
# ``gr.Textbox`` replaces the deprecated ``gr.inputs.Textbox`` /
# ``gr.outputs.Textbox`` namespaces, which were removed in Gradio 4.
# The labels describe what classify() actually takes and returns: the input
# is the text to classify and the output is the prediction dict shown as text.
inputs = gr.Textbox(label="Text to classify")

outputs = gr.Textbox(label="Predicted classes")

demo = gr.Interface(fn=classify, inputs=inputs, outputs=outputs)

# Starts the web server when the script runs.
demo.launch()