prashant committed on
Commit
685552c
1 Parent(s): 9119fa1

update sdg

Browse files
appStore/sdg_analysis.py CHANGED
@@ -2,9 +2,6 @@
2
  import glob, os, sys;
3
  sys.path.append('../utils')
4
 
5
- #import helper
6
-
7
-
8
  #import needed libraries
9
  import seaborn as sns
10
  import matplotlib.pyplot as plt
@@ -16,9 +13,6 @@ from docx.shared import Pt
16
  from docx.enum.style import WD_STYLE_TYPE
17
  from utils.sdg_classifier import sdg_classification
18
  from utils.sdg_classifier import runSDGPreprocessingPipeline
19
- # from utils.streamlitcheck import check_streamlit
20
- import tempfile
21
- import sqlite3
22
  import logging
23
  logger = logging.getLogger(__name__)
24
 
@@ -47,15 +41,16 @@ def app():
47
 
48
 
49
  if 'filepath' in st.session_state:
50
- paraList = runSDGPreprocessingPipeline()
51
- if len(paraList) > 150:
 
52
  warning_msg = ": This might take sometime, please sit back and relax."
53
  else:
54
  warning_msg = ""
55
 
56
  with st.spinner("Running SDG Classification{}".format(warning_msg)):
57
 
58
- df, x = sdg_classification(paraList)
59
 
60
  plt.rcParams['font.size'] = 25
61
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
 
2
  import glob, os, sys;
3
  sys.path.append('../utils')
4
 
 
 
 
5
  #import needed libraries
6
  import seaborn as sns
7
  import matplotlib.pyplot as plt
 
13
  from docx.enum.style import WD_STYLE_TYPE
14
  from utils.sdg_classifier import sdg_classification
15
  from utils.sdg_classifier import runSDGPreprocessingPipeline
 
 
 
16
  import logging
17
  logger = logging.getLogger(__name__)
18
 
 
41
 
42
 
43
  if 'filepath' in st.session_state:
44
+ allDocuments = runSDGPreprocessingPipeline(st.session_state['filepath'],
45
+ st.session_state['filename'])
46
+ if len(allDocuments['documents']) > 100:
47
  warning_msg = ": This might take sometime, please sit back and relax."
48
  else:
49
  warning_msg = ""
50
 
51
  with st.spinner("Running SDG Classification{}".format(warning_msg)):
52
 
53
+ df, x = sdg_classification(allDocuments['documents'])
54
 
55
  plt.rcParams['font.size'] = 25
56
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
paramconfig.cfg CHANGED
@@ -22,6 +22,7 @@ SPLIT_OVERLAP = 0
22
  THRESHOLD = 0.85
23
  MODEL = jonas/sdg_classifier_osdg
24
  SPLIT_BY = word
 
25
  SPLIT_LENGTH = 110
26
  SPLIT_OVERLAP = 10
27
 
 
22
  THRESHOLD = 0.85
23
  MODEL = jonas/sdg_classifier_osdg
24
  SPLIT_BY = word
25
+ REMOVE_PUNC = 0
26
  SPLIT_LENGTH = 110
27
  SPLIT_OVERLAP = 10
28
 
utils/sdg_classifier.py CHANGED
@@ -2,21 +2,28 @@ from haystack.nodes import TransformersDocumentClassifier
2
  from haystack.schema import Document
3
  from typing import List, Tuple
4
  import configparser
5
- import streamlit as st
6
- from utils.streamlitcheck import check_streamlit
7
- from pandas import DataFrame, Series
8
  import logging
 
9
  from utils.preprocessing import processingpipeline
 
 
 
 
10
  config = configparser.ConfigParser()
11
- config.read_file(open('paramconfig.cfg'))
 
 
 
 
12
 
13
 
 
14
  def load_sdgClassifier():
15
  """
16
  loads the document classifier using haystack, where the name/path of model
17
  in HF-hub as string is used to fetch the model object.
18
- 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
19
- 2. https://docs.haystack.deepset.ai/docs/document_classifier
20
 
21
  Return: document classifier model
22
  """
@@ -28,6 +35,8 @@ def load_sdgClassifier():
28
  return doc_classifier
29
 
30
 
 
 
31
  def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
32
  """
33
  Text-Classification on the list of texts provided. Classifier provides the
@@ -50,16 +59,13 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
50
  logging.info("running SDG classifiication")
51
  threshold = float(config.get('sdg','THRESHOLD'))
52
 
53
- if check_streamlit():
54
- st.write("caching model")
55
- classifier = st.cache(load_sdgClassifier, allow_output_mutation=True)
56
- else:
57
- classifier = load_sdgClassifier()
58
  results = classifier.predict(haystackdoc)
59
 
60
 
61
  labels_= [(l.meta['classification']['label'],
62
- l.meta['classification']['score'],l.content,) for l in results]
63
 
64
  df = DataFrame(labels_, columns=["SDG","Relevancy","text"])
65
 
@@ -72,7 +78,7 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
72
 
73
  return df, x
74
 
75
- def runSDGPreprocessingPipeline(file_path = None, file_name = None)->List[Document]:
76
  """
77
  creates the pipeline and runs the preprocessing pipeline,
78
  the params for pipeline are fetched from paramconfig
@@ -80,12 +86,12 @@ def runSDGPreprocessingPipeline(file_path = None, file_name = None)->List[Docume
80
  Param
81
  ------------
82
 
83
- file_path: filepath, if not given will check for file_path in streamlit
84
- session_state, else will return
 
 
 
85
 
86
- file_name: filename, if not given will check for file_name in streamlit
87
- session_state
88
-
89
  Return
90
  --------------
91
  List[Document]: When preprocessing pipeline is run, the output dictionary
@@ -94,21 +100,20 @@ def runSDGPreprocessingPipeline(file_path = None, file_name = None)->List[Docume
94
  key = 'documents' on output.
95
 
96
  """
97
- # if file_path:
98
- file_path = st.session_state['filepath']
99
- file_name = st.session_state['filename']
100
  sdg_processing_pipeline = processingpipeline()
101
  split_by = config.get('sdg','SPLIT_BY')
102
  split_length = int(config.get('sdg','SPLIT_LENGTH'))
103
  split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
 
104
 
105
 
106
  output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
107
  params= {"FileConverter": {"file_path": file_path, \
108
  "file_name": file_name},
109
- "UdfPreProcessor": {"removePunc": False, \
110
  "split_by": split_by, \
111
  "split_length":split_length,\
112
  "split_overlap": split_overlap}})
113
 
114
- return output_sdg_pre['documents']
 
2
  from haystack.schema import Document
3
  from typing import List, Tuple
4
  import configparser
 
 
 
5
  import logging
6
+ from pandas import DataFrame, Series
7
  from utils.preprocessing import processingpipeline
8
+ try:
9
+ import streamlit as st
10
+ except ImportError:
11
+ logging.info("Streamlit not installed")
12
  config = configparser.ConfigParser()
13
+ try:
14
+ config.read_file(open('paramconfig.cfg'))
15
+ except Exception:
16
+ logging.info("paramconfig file not found")
17
+ st.info("Please place the paramconfig file in the same directory as app.py")
18
 
19
 
20
+ @st.cache
21
  def load_sdgClassifier():
22
  """
23
  loads the document classifier using haystack, where the name/path of model
24
  in HF-hub as string is used to fetch the model object.
25
+ 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
26
+ 2. https://docs.haystack.deepset.ai/docs/document_classifier
27
 
28
  Return: document classifier model
29
  """
 
35
  return doc_classifier
36
 
37
 
38
+
39
+ @st.cache
40
  def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
41
  """
42
  Text-Classification on the list of texts provided. Classifier provides the
 
59
  logging.info("running SDG classifiication")
60
  threshold = float(config.get('sdg','THRESHOLD'))
61
 
62
+
63
+ classifier = load_sdgClassifier()
 
 
 
64
  results = classifier.predict(haystackdoc)
65
 
66
 
67
  labels_= [(l.meta['classification']['label'],
68
+ l.meta['classification']['score'],l.content,) for l in results]
69
 
70
  df = DataFrame(labels_, columns=["SDG","Relevancy","text"])
71
 
 
78
 
79
  return df, x
80
 
81
+ def runSDGPreprocessingPipeline(file_path, file_name)->List[Document]:
82
  """
83
  creates the pipeline and runs the preprocessing pipeline,
84
  the params for pipeline are fetched from paramconfig
 
86
  Param
87
  ------------
88
 
89
+ file_name: filename, in case of streamlit application use
90
+ st.session_state['filename']
91
+ file_path: filepath, in case of streamlit application use
92
+ st.session_state['filepath']
93
+
94
 
 
 
 
95
  Return
96
  --------------
97
  List[Document]: When preprocessing pipeline is run, the output dictionary
 
100
  key = 'documents' on output.
101
 
102
  """
103
+
 
 
104
  sdg_processing_pipeline = processingpipeline()
105
  split_by = config.get('sdg','SPLIT_BY')
106
  split_length = int(config.get('sdg','SPLIT_LENGTH'))
107
  split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
108
+ remove_punc = bool(int(config.get('sdg','REMOVE_PUNC')))
109
 
110
 
111
  output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
112
  params= {"FileConverter": {"file_path": file_path, \
113
  "file_name": file_name},
114
+ "UdfPreProcessor": {"removePunc": remove_punc, \
115
  "split_by": split_by, \
116
  "split_length":split_length,\
117
  "split_overlap": split_overlap}})
118
 
119
+ return output_sdg_pre
utils/streamlitcheck.py DELETED
@@ -1,19 +0,0 @@
1
- def check_streamlit():
2
- """
3
- Function to check whether python code is run within streamlit
4
-
5
- Returns
6
- -------
7
- use_streamlit : boolean
8
- True if code is run within streamlit, else False
9
- """
10
- try:
11
- from streamlit.scriptrunner.script_run_context import get_script_run_ctx
12
- if not get_script_run_ctx():
13
- use_streamlit = False
14
- else:
15
- use_streamlit = True
16
- except ModuleNotFoundError:
17
- use_streamlit = False
18
- return use_streamlit
19
-