"""Gradio demo: clean a piece of text and classify it with a Transformers pipeline.

Inputs of 70 or more whitespace-separated words go through the heavier
``pre_processing_str_esg`` cleaner; shorter inputs get a lighter clean-up.
The two highest-scoring labels are returned.
"""

import json
import re
import string

import emoji
import gradio as gr
import nltk
import numpy as np
import pandas as pd
import torch
import transformers
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer  # PorterStemmer LancasterStemmer
from nltk.stem import WordNetLemmatizer
from numpy.random import seed
from transformers import pipeline

seed(1)

stemmer = PorterStemmer()

# Fetch the NLTK data used below. These calls can be commented out once the
# corpora are already available locally.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()

# NOTE: this rebinds the module-level name ``stopwords`` from the NLTK corpus
# reader imported above to a plain list of English stop words. The name is
# kept for backward compatibility.
stopwords = nltk.corpus.stopwords.words('english')

# Set view of the stop-word list so each membership test is O(1).
_STOPWORDS = frozenset(stopwords)

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Patterns are compiled once at import time and written as raw strings so
# that backslashes are never interpreted as (invalid) string escapes.
_URL_RE = re.compile(r"http\S+")
_DIGITS_RE = re.compile(r'[0-9]+')
_MULTI_SPACE_RE = re.compile(r' {2,}')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
_MENTION_OR_URL_RE = re.compile(r"(?:\@|https?\://)\S+")
_NON_PRINTABLE_ASCII_RE = re.compile(r"[^\x20-\x7E]+")

# Inputs with at least this many whitespace-separated words use the
# heavier cleaner.
_LONG_TEXT_MIN_WORDS = 70


def pre_processing_str_esg(df_col):
    """Aggressively normalise a long text.

    Steps, in order: lowercase; delete ASCII punctuation; blank out tokens
    starting with ``http``; drop English stop words; delete the pilcrow and
    section signs; replace curly quotes and every character outside
    ``[0-9a-z #+_]`` with a space; replace digit runs with a space; collapse
    runs of spaces into one.

    Args:
        df_col: The text to clean (a single ``str``).

    Returns:
        The cleaned text. Leading/trailing spaces are not stripped.
    """
    text = df_col.lower()
    text = text.translate(_PUNCTUATION_TABLE)
    # Punctuation is already gone here, so a URL has become one glued token
    # that still starts with "http".
    text = _URL_RE.sub(" ", text)
    text = " ".join(word for word in text.split() if word not in _STOPWORDS)

    # The original code ran a second punctuation pass, a '-' replacement and
    # a bracket/separator regex at this point. All of those characters were
    # already deleted by the translate() call above, so those steps could
    # never match and have been dropped.
    text = text.replace("¶", "").replace("§", "")
    text = text.replace('“', ' ').replace('”', ' ')

    # Only [0-9a-z #+_] survives this substitution, which also removes every
    # emoji; the separate emoji regex that used to follow was unreachable.
    text = _BAD_SYMBOLS_RE.sub(' ', text)
    text = _DIGITS_RE.sub(' ', text)

    # The original substitution replaced ' ' with ' ' (a no-op); collapse
    # the runs of spaces left behind by the replacements above instead.
    text = _MULTI_SPACE_RE.sub(' ', text)
    return text


def pre_processing_str(df_col):
    """Clean a text before classification.

    Texts of 70 or more whitespace-separated words are delegated to
    ``pre_processing_str_esg``. Shorter texts keep their case and get a
    lighter clean-up: ``#`` and ``!`` are deleted, tokens starting with
    ``http`` and digit runs are replaced with a space, emoji and
    ``@mentions`` are deleted, everything outside printable ASCII is deleted,
    runs of spaces are collapsed and the result is stripped.

    Args:
        df_col: The text to clean (a single ``str``).

    Returns:
        The cleaned text.
    """
    if len(df_col.split()) >= _LONG_TEXT_MIN_WORDS:
        return pre_processing_str_esg(df_col)

    text = df_col.replace('#', '').replace('!', '')
    text = _URL_RE.sub(" ", text)
    text = _DIGITS_RE.sub(' ', text)
    text = emoji.replace_emoji(text)
    text = _MENTION_OR_URL_RE.sub("", text)
    # NOTE(review): this also deletes tabs and newlines without inserting a
    # space, so words separated only by a line break end up joined. Left
    # as-is to keep the cleaned text unchanged; confirm whether intended.
    text = _NON_PRINTABLE_ASCII_RE.sub("", text)

    # The original substitution replaced ' ' with ' ' (a no-op); collapse
    # runs of spaces once, after every step that can create them.
    text = _MULTI_SPACE_RE.sub(' ', text)
    return text.strip()


# Text-classification pipeline built from the checkpoint identified below.
pipe = pipeline(
    "text-classification",
    model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch",
)


def classify(text):
    """Clean ``text`` and return the two highest-scoring labels.

    Args:
        text: Raw input text. ``None`` is treated as an empty string.

    Returns:
        A dict ``{"class": output}`` where ``output`` is whatever the
        pipeline returns for ``top_k=2``.
    """
    cleaned = pre_processing_str(text or "")
    # truncation=True is forwarded to the tokenizer so inputs longer than
    # the model's maximum sequence length are cut instead of raising.
    output = pipe(cleaned, top_k=2, truncation=True)
    return {"class": output}


# gr.Textbox replaces the gr.inputs / gr.outputs namespaces, which newer
# Gradio releases no longer provide.
# NOTE(review): the labels below do not describe what classify() does (it
# takes free text and returns labels) -- confirm and rename if appropriate.
inputs = gr.Textbox(label="pdf link")
outputs = gr.Textbox(label="OCR Text")
demo = gr.Interface(fn=classify, inputs=inputs, outputs=outputs)
demo.launch()