Update app.py
Browse files
app.py
CHANGED
@@ -4,42 +4,20 @@ import json
|
|
4 |
import gradio as gr
|
5 |
import spacy
|
6 |
from spacy import displacy
|
7 |
-
from transformers import
|
8 |
import torch
|
9 |
import nltk
|
10 |
from nltk.tokenize import sent_tokenize
|
11 |
from fin_readability_sustainability import BERTClass, do_predict
|
12 |
import pandas as pd
|
13 |
import en_core_web_sm
|
14 |
-
#from fincat_utils import extract_context_words
|
15 |
-
#from fincat_utils import bert_embedding_extract
|
16 |
from score_fincat import score_fincat
|
17 |
-
import
|
18 |
-
#lr_clf = pickle.load(open("lr_clf_FiNCAT.pickle",'rb'))
|
19 |
-
|
20 |
nlp = en_core_web_sm.load()
|
21 |
nltk.download('punkt')
|
22 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
23 |
|
24 |
-
#SUSTAINABILITY STARTS
|
25 |
-
tokenizer_sus = RobertaTokenizer.from_pretrained('roberta-base')
|
26 |
-
model_sustain = BERTClass(2, "sustanability")
|
27 |
-
model_sustain.to(device)
|
28 |
-
model_sustain.load_state_dict(torch.load('sustainability_model.bin', map_location=device)['model_state_dict'])
|
29 |
|
30 |
-
def get_sustainability(text):
|
31 |
-
df = pd.DataFrame({'sentence':sent_tokenize(text)})
|
32 |
-
actual_predictions_sustainability = do_predict(model_sustain, tokenizer_sus, df)
|
33 |
-
highlight = []
|
34 |
-
for sent, prob in zip(df['sentence'].values, actual_predictions_sustainability[1]):
|
35 |
-
if prob>=4.384316:
|
36 |
-
highlight.append((sent, 'non-sustainable'))
|
37 |
-
elif prob<=1.423736:
|
38 |
-
highlight.append((sent, 'sustainable'))
|
39 |
-
else:
|
40 |
-
highlight.append((sent, '-'))
|
41 |
-
return highlight
|
42 |
-
#SUSTAINABILITY ENDS
|
43 |
|
44 |
|
45 |
##Summarization
|
@@ -49,22 +27,6 @@ def summarize_text(text):
|
|
49 |
stext = resp[0]['summary_text']
|
50 |
return stext
|
51 |
|
52 |
-
##Forward Looking Statement
|
53 |
-
def split_in_sentences(text):
|
54 |
-
doc = nlp(text)
|
55 |
-
return [str(sent).strip() for sent in doc.sents]
|
56 |
-
def make_spans(text,results):
|
57 |
-
results_list = []
|
58 |
-
for i in range(len(results)):
|
59 |
-
results_list.append(results[i]['label'])
|
60 |
-
facts_spans = []
|
61 |
-
facts_spans = list(zip(split_in_sentences(text),results_list))
|
62 |
-
return facts_spans
|
63 |
-
|
64 |
-
fls_model = pipeline("text-classification", model="yiyanghkust/finbert-fls", tokenizer="yiyanghkust/finbert-fls")
|
65 |
-
def fls(text):
|
66 |
-
results = fls_model(split_in_sentences(text))
|
67 |
-
return make_spans(text,results)
|
68 |
|
69 |
##Company Extraction
|
70 |
ner=pipeline('ner',model='Jean-Baptiste/camembert-ner-with-dates',tokenizer='Jean-Baptiste/camembert-ner-with-dates', aggregation_strategy="simple")
|
|
|
4 |
import gradio as gr
|
5 |
import spacy
|
6 |
from spacy import displacy
|
7 |
+
from transformers import RobertaTokenizer,pipeline
|
8 |
import torch
|
9 |
import nltk
|
10 |
from nltk.tokenize import sent_tokenize
|
11 |
from fin_readability_sustainability import BERTClass, do_predict
|
12 |
import pandas as pd
|
13 |
import en_core_web_sm
|
|
|
|
|
14 |
from score_fincat import score_fincat
|
15 |
+
from sus_fls import get_sustainability,fls
|
|
|
|
|
16 |
nlp = en_core_web_sm.load()
|
17 |
nltk.download('punkt')
|
18 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
19 |
|
|
|
|
|
|
|
|
|
|
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
|
23 |
##Summarization
|
|
|
27 |
stext = resp[0]['summary_text']
|
28 |
return stext
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
##Company Extraction
|
32 |
ner=pipeline('ner',model='Jean-Baptiste/camembert-ner-with-dates',tokenizer='Jean-Baptiste/camembert-ner-with-dates', aggregation_strategy="simple")
|