prashant committed on
Commit
4a6159c
·
1 Parent(s): 1e18f9c

haystack SDG classification

Browse files
app.py CHANGED
@@ -12,6 +12,6 @@ app = MultiApp()
12
 
13
  app.add_app("About","house", info.app)
14
  app.add_app("SDG Analysis","gear",sdg_analysis.app)
15
- app.add_app("Search","search", keyword_search.app)
16
 
17
  app.run()
 
12
 
13
  app.add_app("About","house", info.app)
14
  app.add_app("SDG Analysis","gear",sdg_analysis.app)
15
+ # app.add_app("Search","search", keyword_search.app)
16
 
17
  app.run()
appStore/sdg_analysis.py CHANGED
@@ -19,8 +19,9 @@ import docx
19
  from docx.shared import Inches
20
  from docx.shared import Pt
21
  from docx.enum.style import WD_STYLE_TYPE
22
- from udfPreprocess.sdg import sdg_classification
23
-
 
24
  import tempfile
25
  import sqlite3
26
  import logging
@@ -28,14 +29,14 @@ logger = logging.getLogger(__name__)
28
 
29
 
30
 
31
- @st.cache(allow_output_mutation=True)
32
- def load_keyBert():
33
- return KeyBERT()
34
 
35
- @st.cache(allow_output_mutation=True)
36
- def load_sdgClassifier():
37
- classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
38
- return classifier
39
 
40
 
41
 
@@ -59,12 +60,11 @@ def app():
59
 
60
 
61
 
62
- if 'docs' in st.session_state:
63
- docs = st.session_state['docs']
64
- docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
65
  with st.spinner("Running SDG"):
66
 
67
- df, x = sdg_classification(par_list)
68
 
69
 
70
  # classifier = load_sdgClassifier()
 
19
  from docx.shared import Inches
20
  from docx.shared import Pt
21
  from docx.enum.style import WD_STYLE_TYPE
22
+ from udfPreprocess.sdg_classifier import sdg_classification
23
+ from udfPreprocess.sdg_classifier import runSDGPreprocessingPipeline
24
+ import configparser
25
  import tempfile
26
  import sqlite3
27
  import logging
 
29
 
30
 
31
 
32
+ # @st.cache(allow_output_mutation=True)
33
+ # def load_keyBert():
34
+ # return KeyBERT()
35
 
36
+ # @st.cache(allow_output_mutation=True)
37
+ # def load_sdgClassifier():
38
+ # classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
39
+ # return classifier
40
 
41
 
42
 
 
60
 
61
 
62
 
63
+ if 'filepath' in st.session_state:
64
+ paraList = runSDGPreprocessingPipeline()
 
65
  with st.spinner("Running SDG"):
66
 
67
+ df, x = sdg_classification(paraList)
68
 
69
 
70
  # classifier = load_sdgClassifier()
requirements.txt CHANGED
@@ -10,7 +10,7 @@ pandas==1.4.0
10
  pdfplumber==0.6.2
11
  Pillow==9.1.1
12
  seaborn==0.11.2
13
- transformers==4.13.0
14
  rank_bm25
15
  python-docx
16
  streamlit_option_menu
 
10
  pdfplumber==0.6.2
11
  Pillow==9.1.1
12
  seaborn==0.11.2
13
+ transformers==4.21.2
14
  rank_bm25
15
  python-docx
16
  streamlit_option_menu
udfPreprocess/paramconfig.cfg CHANGED
@@ -10,7 +10,10 @@ THRESHOLD = 0.1
10
 
11
  [sdg]
12
  THRESHOLD = 0.85
 
 
 
13
 
14
  [preprocessor]
15
- SPLIT_OVERLAP_WORD = 20
16
  SPLIT_OVERLAP_SENTENCE = 1
 
10
 
11
  [sdg]
12
  THRESHOLD = 0.85
13
+ MODEL = jonas/sdg_classifier_osdg
14
+ SPLIT_BY = word
15
+ SPLIT_LENGTH = 110
16
 
17
  [preprocessor]
18
+ SPLIT_OVERLAP_WORD = 10
19
  SPLIT_OVERLAP_SENTENCE = 1
udfPreprocess/preprocessing.py CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
8
  import logging
9
  import re
10
  import string
 
11
  import configparser
12
  config = configparser.ConfigParser()
13
  config.read_file(open('udfPreprocess/paramconfig.cfg'))
@@ -127,6 +128,8 @@ class FileConverter(BaseComponent):
127
  def basic(s, removePunc:bool = False):
128
 
129
  """
 
 
130
  Params
131
  ----------
132
  s: string to be processed
@@ -148,7 +151,7 @@ def basic(s, removePunc:bool = False):
148
  s = s.translate(translator)
149
  # Remove distracting single quotes and dotted pattern
150
  s = re.sub("\'", " ", s)
151
- s = re.sub("..","",s)
152
 
153
  return s.strip()
154
 
@@ -165,8 +168,8 @@ class UdfPreProcessor(BaseComponent):
165
 
166
  """
167
  outgoing_edges = 1
168
- split_overlap_word = config.get('preprocessor','SPLIT_OVERLAP_WORD')
169
- split_overlap_sentence = config.get('preprocessor','SPLIT_OVERLAP_SENTENCE')
170
 
171
  def run(self, documents:List[Document], removePunc:bool,
172
  split_by: Literal["sentence", "word"] = 'sentence',
@@ -210,6 +213,8 @@ class UdfPreProcessor(BaseComponent):
210
  split_length=split_length,
211
  split_respect_sentence_boundary= split_respect_sentence_boundary,
212
  split_overlap=split_overlap,
 
 
213
  add_page_number=True
214
  )
215
 
@@ -221,7 +226,7 @@ class UdfPreProcessor(BaseComponent):
221
  df = pd.DataFrame(docs_processed)
222
  all_text = " ".join(df.content.to_list())
223
  para_list = df.content.to_list()
224
-
225
  output = {'documents': docs_processed,
226
  'dataframe': df,
227
  'text': all_text,
@@ -234,4 +239,20 @@ class UdfPreProcessor(BaseComponent):
234
  therefore nothing here, however to use the custom node we need to have
235
  this method for the class.
236
  """
237
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import logging
9
  import re
10
  import string
11
+ from haystack.pipelines import Pipeline
12
  import configparser
13
  config = configparser.ConfigParser()
14
  config.read_file(open('udfPreprocess/paramconfig.cfg'))
 
128
  def basic(s, removePunc:bool = False):
129
 
130
  """
131
+ Performs basic cleaning of text.
132
+
133
  Params
134
  ----------
135
  s: string to be processed
 
151
  s = s.translate(translator)
152
  # Remove distracting single quotes and dotted pattern
153
  s = re.sub("\'", " ", s)
154
+ s = s.replace("..","")
155
 
156
  return s.strip()
157
 
 
168
 
169
  """
170
  outgoing_edges = 1
171
+ split_overlap_word = int(config.get('preprocessor','SPLIT_OVERLAP_WORD'))
172
+ split_overlap_sentence = int(config.get('preprocessor','SPLIT_OVERLAP_SENTENCE'))
173
 
174
  def run(self, documents:List[Document], removePunc:bool,
175
  split_by: Literal["sentence", "word"] = 'sentence',
 
213
  split_length=split_length,
214
  split_respect_sentence_boundary= split_respect_sentence_boundary,
215
  split_overlap=split_overlap,
216
+
217
+ # will add page number only in case of PDF not for text/docx file.
218
  add_page_number=True
219
  )
220
 
 
226
  df = pd.DataFrame(docs_processed)
227
  all_text = " ".join(df.content.to_list())
228
  para_list = df.content.to_list()
229
+ logging.info('document split into {} paragraphs'.format(len(para_list)))
230
  output = {'documents': docs_processed,
231
  'dataframe': df,
232
  'text': all_text,
 
239
  therefore nothing here, however to use the custom node we need to have
240
  this method for the class.
241
  """
242
+ return
243
+
244
+ def processingpipeline():
245
+ """
246
+ Returns the preprocessing pipeline
247
+
248
+ """
249
+
250
+ preprocessing_pipeline = Pipeline()
251
+ fileconverter = FileConverter()
252
+ customPreprocessor = UdfPreProcessor()
253
+
254
+ preprocessing_pipeline.add_node(component=fileconverter, name="FileConverter", inputs=["File"])
255
+ preprocessing_pipeline.add_node(component = customPreprocessor, name ='UdfPreprocessor', inputs=["FileConverter"])
256
+
257
+ return preprocessing_pipeline
258
+
udfPreprocess/sdg_classifier.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Text
2
+ from haystack.nodes import TransformersDocumentClassifier
3
+ from typing import List, Tuple
4
+ import configparser
5
+ import streamlit as st
6
+ from pandas import DataFrame, Series
7
+ import logging
8
+ from udfPreprocess.preprocessing import processingpipeline
9
+ config = configparser.ConfigParser()
10
+ config.read_file(open('udfPreprocess/paramconfig.cfg'))
11
+
12
+ @st.cache(allow_output_mutation=True)
13
+ def load_sdgClassifier():
14
+ """
15
+ loads the document classifier using haystack, where the name/path of model
16
+ in HF-hub as string is used to fetch the model object.
17
+ 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
18
+ 2. https://docs.haystack.deepset.ai/docs/document_classifier
19
+
20
+ Return: document classifier model
21
+ """
22
+ logging.info("Loading classifier")
23
+ doc_classifier_model = config.get('sdg','MODEL')
24
+ doc_classifier = TransformersDocumentClassifier(
25
+ model_name_or_path=doc_classifier_model,
26
+ task="text-classification")
27
+ return doc_classifier
28
+
29
+
30
+ def sdg_classification(paraList:List[Text])->Tuple[DataFrame,Series]:
31
+ """
32
+ Text-Classification on the list of texts provided. Classifier provides the
33
+ most appropriate label for each text. these labels are in terms of if text
34
+ belongs to which particular Sustainable Development Goal (SDG).
35
+
36
+ Params
37
+ ---------
38
+ paraList: List of paragraphs/text. The output of Preprocessing Pipeline
39
+ contains this list of paragraphs in different format, the simple List format
40
+ is being used here.
41
+
42
+ Returns
43
+ ----------
44
+ df: Dataframe with two columns['SDG:int', 'text']
45
+ x: Series object with the unique SDG covered in the document uploaded and
46
+ the number of times it is covered/discussed/count_of_paragraphs.
47
+
48
+ """
49
+ logging.info("running SDG classifiication")
50
+ threshold = float(config.get('sdg','THRESHOLD'))
51
+
52
+
53
+ classifier = load_sdgClassifier()
54
+ labels = classifier(paraList)
55
+
56
+ labels_= [(l['label'],l['score']) for l in labels]
57
+ df = DataFrame(labels_, columns=["SDG", "Relevancy"])
58
+
59
+ df['text'] = paraList
60
+ df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
61
+ df.index += 1
62
+ df =df[df['Relevancy']>threshold]
63
+ x = df['SDG'].value_counts()
64
+ # df = df.copy()
65
+ df= df.drop(['Relevancy'], axis = 1)
66
+
67
+
68
+ return df, x
69
+
70
+ def runSDGPreprocessingPipeline()->List[Text]:
71
+ """
72
+ creates the pipeline and runs the preprocessing pipeline,
73
+ the params for pipeline are fetched from paramconfig
74
+
75
+ """
76
+ file_path = st.session_state['filepath']
77
+ file_name = st.session_state['filename']
78
+ sdg_processing_pipeline = processingpipeline()
79
+ split_by = config.get('sdg','SPLIT_BY')
80
+ split_length = int(config.get('sdg','SPLIT_LENGTH'))
81
+
82
+ output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
83
+ params= {"FileConverter": {"file_path": file_path, \
84
+ "file_name": file_name},
85
+ "UdfPreprocessor": {"removePunc": False, \
86
+ "split_by": split_by, \
87
+ "split_length":split_length}})
88
+
89
+ return output_sdg_pre['paraList']
udfPreprocess/uploadAndExample.py CHANGED
@@ -16,10 +16,12 @@ def add_upload(choice):
16
  # st.write("Uploaded Filename: ", uploaded_file.name)
17
  file_name = uploaded_file.name
18
  file_path = temp.name
19
- docs = pre.load_document(file_path, file_name)
20
- haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
21
- st.session_state['docs'] = docs
22
- st.session_state['paraList'] = paraList
 
 
23
 
24
 
25
  else:
@@ -30,6 +32,7 @@ def add_upload(choice):
30
  if option is 'South Africa:Low Emission strategy':
31
  file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
32
  st.session_state['filename'] = file_name
 
33
  # st.write("Selected document:", file_name.split('/')[1])
34
  # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
35
  # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
@@ -37,12 +40,13 @@ def add_upload(choice):
37
  # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
38
  file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
39
  st.session_state['filename'] = file_name
 
40
  # st.write("Selected document:", file_name.split('/')[1])
41
 
42
- if option is not None:
43
- docs = pre.load_document(file_path,file_name)
44
- haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
45
- st.session_state['docs'] = docs
46
- st.session_state['paraList'] = paraList
47
 
48
 
 
16
  # st.write("Uploaded Filename: ", uploaded_file.name)
17
  file_name = uploaded_file.name
18
  file_path = temp.name
19
+ # docs = pre.load_document(file_path, file_name)
20
+ # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
21
+ st.session_state['filename'] = file_name
22
+ # st.session_state['paraList'] = paraList
23
+ st.session_state['filepath'] = file_path
24
+
25
 
26
 
27
  else:
 
32
  if option is 'South Africa:Low Emission strategy':
33
  file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
34
  st.session_state['filename'] = file_name
35
+ st.session_state['filepath'] = file_path
36
  # st.write("Selected document:", file_name.split('/')[1])
37
  # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
38
  # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
 
40
  # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
41
  file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
42
  st.session_state['filename'] = file_name
43
+ st.session_state['filepath'] = file_path
44
  # st.write("Selected document:", file_name.split('/')[1])
45
 
46
+ # if option is not None:
47
+ # docs = pre.load_document(file_path,file_name)
48
+ # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
49
+ # st.session_state['docs'] = docs
50
+ # st.session_state['paraList'] = paraList
51
 
52