prashant
committed on
Commit
·
4a6159c
1
Parent(s):
1e18f9c
haystack SDG classification
Browse files- app.py +1 -1
- appStore/sdg_analysis.py +13 -13
- requirements.txt +1 -1
- udfPreprocess/paramconfig.cfg +4 -1
- udfPreprocess/preprocessing.py +26 -5
- udfPreprocess/sdg_classifier.py +89 -0
- udfPreprocess/uploadAndExample.py +13 -9
app.py
CHANGED
@@ -12,6 +12,6 @@ app = MultiApp()
|
|
12 |
|
13 |
app.add_app("About","house", info.app)
|
14 |
app.add_app("SDG Analysis","gear",sdg_analysis.app)
|
15 |
-
app.add_app("Search","search", keyword_search.app)
|
16 |
|
17 |
app.run()
|
|
|
12 |
|
13 |
app.add_app("About","house", info.app)
|
14 |
app.add_app("SDG Analysis","gear",sdg_analysis.app)
|
15 |
+
# app.add_app("Search","search", keyword_search.app)
|
16 |
|
17 |
app.run()
|
appStore/sdg_analysis.py
CHANGED
@@ -19,8 +19,9 @@ import docx
|
|
19 |
from docx.shared import Inches
|
20 |
from docx.shared import Pt
|
21 |
from docx.enum.style import WD_STYLE_TYPE
|
22 |
-
from udfPreprocess.
|
23 |
-
|
|
|
24 |
import tempfile
|
25 |
import sqlite3
|
26 |
import logging
|
@@ -28,14 +29,14 @@ logger = logging.getLogger(__name__)
|
|
28 |
|
29 |
|
30 |
|
31 |
-
@st.cache(allow_output_mutation=True)
|
32 |
-
def load_keyBert():
|
33 |
-
|
34 |
|
35 |
-
@st.cache(allow_output_mutation=True)
|
36 |
-
def load_sdgClassifier():
|
37 |
-
|
38 |
-
|
39 |
|
40 |
|
41 |
|
@@ -59,12 +60,11 @@ def app():
|
|
59 |
|
60 |
|
61 |
|
62 |
-
if '
|
63 |
-
|
64 |
-
docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
|
65 |
with st.spinner("Running SDG"):
|
66 |
|
67 |
-
df, x = sdg_classification(
|
68 |
|
69 |
|
70 |
# classifier = load_sdgClassifier()
|
|
|
19 |
from docx.shared import Inches
|
20 |
from docx.shared import Pt
|
21 |
from docx.enum.style import WD_STYLE_TYPE
|
22 |
+
from udfPreprocess.sdg_classifier import sdg_classification
|
23 |
+
from udfPreprocess.sdg_classifier import runSDGPreprocessingPipeline
|
24 |
+
import configparser
|
25 |
import tempfile
|
26 |
import sqlite3
|
27 |
import logging
|
|
|
29 |
|
30 |
|
31 |
|
32 |
+
# @st.cache(allow_output_mutation=True)
|
33 |
+
# def load_keyBert():
|
34 |
+
# return KeyBERT()
|
35 |
|
36 |
+
# @st.cache(allow_output_mutation=True)
|
37 |
+
# def load_sdgClassifier():
|
38 |
+
# classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
|
39 |
+
# return classifier
|
40 |
|
41 |
|
42 |
|
|
|
60 |
|
61 |
|
62 |
|
63 |
+
if 'filepath' in st.session_state:
|
64 |
+
paraList = runSDGPreprocessingPipeline()
|
|
|
65 |
with st.spinner("Running SDG"):
|
66 |
|
67 |
+
df, x = sdg_classification(paraList)
|
68 |
|
69 |
|
70 |
# classifier = load_sdgClassifier()
|
requirements.txt
CHANGED
@@ -10,7 +10,7 @@ pandas==1.4.0
|
|
10 |
pdfplumber==0.6.2
|
11 |
Pillow==9.1.1
|
12 |
seaborn==0.11.2
|
13 |
-
transformers==4.
|
14 |
rank_bm25
|
15 |
python-docx
|
16 |
streamlit_option_menu
|
|
|
10 |
pdfplumber==0.6.2
|
11 |
Pillow==9.1.1
|
12 |
seaborn==0.11.2
|
13 |
+
transformers==4.21.2
|
14 |
rank_bm25
|
15 |
python-docx
|
16 |
streamlit_option_menu
|
udfPreprocess/paramconfig.cfg
CHANGED
@@ -10,7 +10,10 @@ THRESHOLD = 0.1
|
|
10 |
|
11 |
[sdg]
|
12 |
THRESHOLD = 0.85
|
|
|
|
|
|
|
13 |
|
14 |
[preprocessor]
|
15 |
-
SPLIT_OVERLAP_WORD =
|
16 |
SPLIT_OVERLAP_SENTENCE = 1
|
|
|
10 |
|
11 |
[sdg]
|
12 |
THRESHOLD = 0.85
|
13 |
+
MODEL = 'jonas/sdg_classifier_osdg'
|
14 |
+
SPLIT_BY = 'word'
|
15 |
+
SPLIT_LENGTH = 110
|
16 |
|
17 |
[preprocessor]
|
18 |
+
SPLIT_OVERLAP_WORD = 10
|
19 |
SPLIT_OVERLAP_SENTENCE = 1
|
udfPreprocess/preprocessing.py
CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
|
|
8 |
import logging
|
9 |
import re
|
10 |
import string
|
|
|
11 |
import configparser
|
12 |
config = configparser.ConfigParser()
|
13 |
config.read_file(open('udfPreprocess/paramconfig.cfg'))
|
@@ -127,6 +128,8 @@ class FileConverter(BaseComponent):
|
|
127 |
def basic(s, removePunc:bool = False):
|
128 |
|
129 |
"""
|
|
|
|
|
130 |
Params
|
131 |
----------
|
132 |
s: string to be processed
|
@@ -148,7 +151,7 @@ def basic(s, removePunc:bool = False):
|
|
148 |
s = s.translate(translator)
|
149 |
# Remove distracting single quotes and dotted pattern
|
150 |
s = re.sub("\'", " ", s)
|
151 |
-
s =
|
152 |
|
153 |
return s.strip()
|
154 |
|
@@ -165,8 +168,8 @@ class UdfPreProcessor(BaseComponent):
|
|
165 |
|
166 |
"""
|
167 |
outgoing_edges = 1
|
168 |
-
split_overlap_word = config.get('preprocessor','SPLIT_OVERLAP_WORD')
|
169 |
-
split_overlap_sentence = config.get('preprocessor','SPLIT_OVERLAP_SENTENCE')
|
170 |
|
171 |
def run(self, documents:List[Document], removePunc:bool,
|
172 |
split_by: Literal["sentence", "word"] = 'sentence',
|
@@ -210,6 +213,8 @@ class UdfPreProcessor(BaseComponent):
|
|
210 |
split_length=split_length,
|
211 |
split_respect_sentence_boundary= split_respect_sentence_boundary,
|
212 |
split_overlap=split_overlap,
|
|
|
|
|
213 |
add_page_number=True
|
214 |
)
|
215 |
|
@@ -221,7 +226,7 @@ class UdfPreProcessor(BaseComponent):
|
|
221 |
df = pd.DataFrame(docs_processed)
|
222 |
all_text = " ".join(df.content.to_list())
|
223 |
para_list = df.content.to_list()
|
224 |
-
|
225 |
output = {'documents': docs_processed,
|
226 |
'dataframe': df,
|
227 |
'text': all_text,
|
@@ -234,4 +239,20 @@ class UdfPreProcessor(BaseComponent):
|
|
234 |
therefore nothing here, however to use the custom node we need to have
|
235 |
this method for the class.
|
236 |
"""
|
237 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
import logging
|
9 |
import re
|
10 |
import string
|
11 |
+
from haystack.pipelines import Pipeline
|
12 |
import configparser
|
13 |
config = configparser.ConfigParser()
|
14 |
config.read_file(open('udfPreprocess/paramconfig.cfg'))
|
|
|
128 |
def basic(s, removePunc:bool = False):
|
129 |
|
130 |
"""
|
131 |
+
Performs basic cleaning of text.
|
132 |
+
|
133 |
Params
|
134 |
----------
|
135 |
s: string to be processed
|
|
|
151 |
s = s.translate(translator)
|
152 |
# Remove distracting single quotes and dotted pattern
|
153 |
s = re.sub("\'", " ", s)
|
154 |
+
s = s.replace("..","")
|
155 |
|
156 |
return s.strip()
|
157 |
|
|
|
168 |
|
169 |
"""
|
170 |
outgoing_edges = 1
|
171 |
+
split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
|
172 |
+
split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))
|
173 |
|
174 |
def run(self, documents:List[Document], removePunc:bool,
|
175 |
split_by: Literal["sentence", "word"] = 'sentence',
|
|
|
213 |
split_length=split_length,
|
214 |
split_respect_sentence_boundary= split_respect_sentence_boundary,
|
215 |
split_overlap=split_overlap,
|
216 |
+
|
217 |
+
# will add page number only in case of PDF not for text/docx file.
|
218 |
add_page_number=True
|
219 |
)
|
220 |
|
|
|
226 |
df = pd.DataFrame(docs_processed)
|
227 |
all_text = " ".join(df.content.to_list())
|
228 |
para_list = df.content.to_list()
|
229 |
+
logging.info('document split into {} paragraphs'.format(len(para_list)))
|
230 |
output = {'documents': docs_processed,
|
231 |
'dataframe': df,
|
232 |
'text': all_text,
|
|
|
239 |
therefore nothing here, however to use the custom node we need to have
|
240 |
this method for the class.
|
241 |
"""
|
242 |
+
return
|
243 |
+
|
244 |
+
def processingpipeline():
    """
    Build the two-stage preprocessing pipeline.

    A FileConverter node is fed from the pipeline's "File" input and its
    output is passed on to a UdfPreProcessor node.

    NOTE(review): the preprocessor node is registered under the name
    'UdfPreprocessor' (lower-case "p"), which differs from the class name;
    node-specific params passed to ``run`` have to use that exact name.

    Return: the assembled haystack Pipeline
    """
    pipe = Pipeline()
    converter_node = FileConverter()
    preprocessor_node = UdfPreProcessor()

    # (component, node name, upstream inputs) in the order they are chained
    wiring = (
        (converter_node, "FileConverter", ["File"]),
        (preprocessor_node, 'UdfPreprocessor', ["FileConverter"]),
    )
    for node, node_name, upstream in wiring:
        pipe.add_node(component=node, name=node_name, inputs=upstream)

    return pipe
|
258 |
+
|
udfPreprocess/sdg_classifier.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tkinter import Text
|
2 |
+
from haystack.nodes import TransformersDocumentClassifier
|
3 |
+
from typing import List, Tuple
|
4 |
+
import configparser
|
5 |
+
import streamlit as st
|
6 |
+
from pandas import DataFrame, Series
|
7 |
+
import logging
|
8 |
+
from udfPreprocess.preprocessing import processingpipeline
|
9 |
+
config = configparser.ConfigParser()
|
10 |
+
config.read_file(open('udfPreprocess/paramconfig.cfg'))
|
11 |
+
|
12 |
+
@st.cache(allow_output_mutation=True)
def load_sdgClassifier():
    """
    Load the document classifier using haystack.

    The name/path of the model on the HF-hub is read as a string from the
    ``MODEL`` option of the ``[sdg]`` section in paramconfig.cfg and used to
    fetch the model object.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier

    Return: document classifier model
    """
    logging.info("Loading classifier")
    # configparser hands back option values verbatim, so a value written in
    # the cfg file as 'jonas/sdg_classifier_osdg' would keep its quote
    # characters and stop being a valid model identifier. Drop them here.
    doc_classifier_model = config.get('sdg', 'MODEL').strip().strip("'\"")
    doc_classifier = TransformersDocumentClassifier(
        model_name_or_path=doc_classifier_model,
        task="text-classification")
    return doc_classifier
|
28 |
+
|
29 |
+
|
30 |
+
def sdg_classification(paraList: List[str]) -> Tuple[DataFrame, Series]:
    """
    Text-Classification on the list of texts provided. The classifier
    provides the most appropriate label for each text; the label says which
    particular Sustainable Development Goal (SDG) the text belongs to.

    Params
    ---------
    paraList: List of paragraphs/text. The output of the Preprocessing
    Pipeline contains this list of paragraphs in different formats, the
    simple List format is being used here.

    Returns
    ----------
    df: Dataframe with the columns 'SDG' and 'text', restricted to the
    paragraphs whose score is above the THRESHOLD option of the [sdg] config
    section. The index is the 1-based rank of the paragraph by score.
    x: Series object with the unique SDG covered in the document uploaded and
    the number of paragraphs (above the threshold) assigned to each of them.

    """
    logging.info("running SDG classifiication")
    threshold = float(config.get('sdg', 'THRESHOLD'))

    classifier = load_sdgClassifier()
    # NOTE(review): the classifier object is invoked directly and is expected
    # to yield one mapping with 'label' and 'score' per paragraph -- confirm
    # that the haystack document classifier supports this call style.
    labels = classifier(paraList)

    labels_ = [(l['label'], l['score']) for l in labels]
    df = DataFrame(labels_, columns=["SDG", "Relevancy"])
    df['text'] = paraList

    # Rank by score first (index starts at 1) and only then apply the
    # threshold, so the surviving rows keep their overall rank as index.
    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
    df.index += 1
    df = df[df['Relevancy'] > threshold]

    x = df['SDG'].value_counts()
    df = df.drop(['Relevancy'], axis=1)

    return df, x
|
69 |
+
|
70 |
+
def runSDGPreprocessingPipeline() -> List[str]:
    """
    Creates the preprocessing pipeline and runs it on the document whose
    path and name are stored in the streamlit session state under
    'filepath' and 'filename'. The split params for the pipeline are fetched
    from the [sdg] section of paramconfig.

    Return: the 'paraList' entry of the pipeline output
    """
    file_path = st.session_state['filepath']
    file_name = st.session_state['filename']
    sdg_processing_pipeline = processingpipeline()
    # configparser keeps quote characters that are written around a value in
    # the cfg file (e.g. SPLIT_BY = 'word'); remove them before use.
    split_by = config.get('sdg', 'SPLIT_BY').strip().strip("'\"")
    split_length = int(config.get('sdg', 'SPLIT_LENGTH'))

    output_sdg_pre = sdg_processing_pipeline.run(
        file_paths=file_path,
        params={
            "FileConverter": {"file_path": file_path,
                              "file_name": file_name},
            # Node params are matched by node name: processingpipeline()
            # registers the preprocessor as 'UdfPreprocessor', so that exact
            # spelling (not the class name) has to be used as key.
            "UdfPreprocessor": {"removePunc": False,
                                "split_by": split_by,
                                "split_length": split_length},
        })

    return output_sdg_pre['paraList']
|
udfPreprocess/uploadAndExample.py
CHANGED
@@ -16,10 +16,12 @@ def add_upload(choice):
|
|
16 |
# st.write("Uploaded Filename: ", uploaded_file.name)
|
17 |
file_name = uploaded_file.name
|
18 |
file_path = temp.name
|
19 |
-
docs = pre.load_document(file_path, file_name)
|
20 |
-
haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
|
21 |
-
st.session_state['
|
22 |
-
st.session_state['paraList'] = paraList
|
|
|
|
|
23 |
|
24 |
|
25 |
else:
|
@@ -30,6 +32,7 @@ def add_upload(choice):
|
|
30 |
if option is 'South Africa:Low Emission strategy':
|
31 |
file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
|
32 |
st.session_state['filename'] = file_name
|
|
|
33 |
# st.write("Selected document:", file_name.split('/')[1])
|
34 |
# with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
|
35 |
# file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
|
@@ -37,12 +40,13 @@ def add_upload(choice):
|
|
37 |
# with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
|
38 |
file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
|
39 |
st.session_state['filename'] = file_name
|
|
|
40 |
# st.write("Selected document:", file_name.split('/')[1])
|
41 |
|
42 |
-
if option is not None:
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
|
48 |
|
|
|
16 |
# st.write("Uploaded Filename: ", uploaded_file.name)
|
17 |
file_name = uploaded_file.name
|
18 |
file_path = temp.name
|
19 |
+
# docs = pre.load_document(file_path, file_name)
|
20 |
+
# haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
|
21 |
+
st.session_state['filename'] = file_name
|
22 |
+
# st.session_state['paraList'] = paraList
|
23 |
+
st.session_state['filepath'] = file_path
|
24 |
+
|
25 |
|
26 |
|
27 |
else:
|
|
|
32 |
if option is 'South Africa:Low Emission strategy':
|
33 |
file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
|
34 |
st.session_state['filename'] = file_name
|
35 |
+
# Keep the sample's path in session state next to its name.
st.session_state['filepath'] = file_path
|
36 |
# st.write("Selected document:", file_name.split('/')[1])
|
37 |
# with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
|
38 |
# file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
|
|
|
40 |
# with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
|
41 |
file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
|
42 |
st.session_state['filename'] = file_name
|
43 |
+
# Keep the sample's path in session state next to its name.
st.session_state['filepath'] = file_path
|
44 |
# st.write("Selected document:", file_name.split('/')[1])
|
45 |
|
46 |
+
# if option is not None:
|
47 |
+
# docs = pre.load_document(file_path,file_name)
|
48 |
+
# haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
|
49 |
+
# st.session_state['docs'] = docs
|
50 |
+
# st.session_state['paraList'] = paraList
|
51 |
|
52 |
|