File size: 3,463 Bytes
4d29f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c63c7bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d29f91
 
c63c7bb
274543c
 
4d29f91
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from transformers import pipeline
import numpy as np
import torch
import transformers
import json
import pandas as pd
from numpy.random import seed
seed(1)
import emoji
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer  # PorterStemmer LancasterStemmer
from nltk.stem import WordNetLemmatizer
import re
# Module-level NLP resources, built once at import time.
stemmer = PorterStemmer()

# One-time NLTK corpus downloads; no-ops (cache hit) on later runs.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
# NOTE: this rebinding shadows the `stopwords` module imported from
# nltk.corpus above — from here on, `stopwords` is a plain list of
# English stopword strings used by the preprocessing functions.
stopwords = nltk.corpus.stopwords.words('english')

import gradio as gr
def pre_processing_str_esg(df_col):
    """Heavily normalise a long (ESG-style) text for classification.

    Steps: lowercase, strip ASCII punctuation, drop URLs, remove
    English stopwords, replace remaining symbols/digits with spaces,
    collapse whitespace and strip emoji codepoints.

    Parameters
    ----------
    df_col : str
        Raw input text.

    Returns
    -------
    str
        Cleaned, lowercased text.
    """
    df_col = df_col.lower()

    # Strip ASCII punctuation character-by-character.
    def remove_punctuation(text):
        return "".join(ch for ch in text if ch not in string.punctuation)

    df_col = remove_punctuation(df_col)
    # Remove whatever still looks like a URL remnant.
    df_col = re.sub(r"http\S+", " ", df_col)

    # Drop English stopwords (module-level `stopwords` list from NLTK).
    def remove_stopwords(text):
        return " ".join(word for word in str(text).split() if word not in stopwords)

    df_col = remove_stopwords(df_col)

    # Second punctuation pass replaces with spaces (preserves word boundaries).
    df_col = re.sub('[%s]' % re.escape(string.punctuation), ' ', df_col)
    df_col = df_col.replace("¶", "")
    df_col = df_col.replace("§", "")
    df_col = df_col.replace('“', ' ')
    df_col = df_col.replace('”', ' ')
    df_col = df_col.replace('-', ' ')

    REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    df_col = REPLACE_BY_SPACE_RE.sub(' ', df_col)
    df_col = BAD_SYMBOLS_RE.sub(' ', df_col)

    # Replace digit runs, then collapse any run of spaces to a single one.
    # (The original pattern '  ' only halved space runs in a single pass.)
    df_col = re.sub('[0-9]+', ' ', df_col)
    df_col = re.sub(' +', ' ', df_col)

    # Strip emoji / pictograph codepoint ranges.
    # Parameter renamed from `string`, which shadowed the string module.
    def remove_emoji(text):
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE,
        )
        return emoji_pattern.sub(r'', text)

    df_col = remove_emoji(df_col)

    return df_col

def pre_processing_str(df_col):
    """Route text to the appropriate cleaning pipeline by length.

    Texts of 70 or more whitespace-separated words get the heavier ESG
    cleaning (``pre_processing_str_esg``); shorter texts get a light
    cleaning that preserves case and most punctuation.

    Parameters
    ----------
    df_col : str
        Raw input text.

    Returns
    -------
    str
        Cleaned text.
    """
    if len(df_col.split()) >= 70:
        return pre_processing_str_esg(df_col)

    # Light cleaning for short texts (e.g. tweet-sized inputs).
    df_col = df_col.replace('#', '')
    df_col = df_col.replace('!', '')
    df_col = re.sub(r"http\S+", " ", df_col)   # URLs

    df_col = re.sub('[0-9]+', ' ', df_col)     # digit runs
    # Collapse space runs to a single space.
    # (The original pattern '  ' only halved runs in a single pass.)
    df_col = re.sub(' +', ' ', df_col)

    df_col = emoji.replace_emoji(df_col)       # drop emoji codepoints
    df_col = re.sub(r"(?:\@|https?\://)\S+", "", df_col)  # @mentions / residual URLs
    df_col = re.sub(r"[^\x20-\x7E]+", "", df_col)         # non-printable-ASCII
    return df_col.strip()
# Load the fine-tuned 16-class XLM-RoBERTa classifier once at import time.
pipe = pipeline("text-classification", model="dsmsb/16class_12k_newtest1618_xlm_roberta_base_27nov_v2_8epoch")


def classify(text):
    """Clean *text* and return the pipeline's top-2 predictions.

    Returns a dict with a single ``"class"`` key holding the raw
    pipeline output (list of label/score dicts).
    """
    cleaned = pre_processing_str(text)
    predictions = pipe(cleaned, top_k=2)
    return {"class": predictions}
# gr.inputs.Textbox / gr.outputs.Textbox were deprecated in Gradio 2.x and
# removed in 3.x (AttributeError on current releases); components are now
# constructed directly.
# NOTE(review): the labels "pdf link" / "OCR Text" look copy-pasted from an
# OCR demo — this app takes raw text and returns classifier output; confirm
# intended labels with the author. Kept byte-identical here.
inputs = gr.Textbox(label="pdf link")
outputs = gr.Textbox(label="OCR Text")
demo = gr.Interface(fn=classify, inputs=inputs, outputs=outputs)
demo.launch()