prashant committed on
Commit
72e4dad
·
1 Parent(s): 49a314a

ver0.2 appstore update

Browse files
appStore/info.py CHANGED
@@ -2,6 +2,13 @@ import streamlit as st
2
 
3
 
4
  def app():
 
 
 
 
 
 
 
5
  with open('style.css') as f:
6
  st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
7
  footer = """
@@ -33,7 +40,7 @@ The collaboration aims to determine the potential of NLP methods for tracking po
33
  """
34
  st.markdown(intro, unsafe_allow_html=True)
35
  st.image("appStore/img/pic1.png", caption="NDC Coherence")
36
- st.subheader("Methodology")
37
  #st.write("Each sentence in the generated answer ends with a coloured tooltip; the colour ranges from red to green. "
38
  # "The tooltip contains a value representing answer sentence similarity to a specific sentence in the "
39
  # "Wikipedia context passages retrieved. Mouseover on the tooltip will show the sentence from the "
 
2
 
3
 
4
  def app():
5
+ # if 'file' in st.session_state:
6
+ # file = st.session_state['file']
7
+ # else:
8
+ # st.sidebar.markdown(" :cloud: Upload document ")
9
+ # uploaded_file = st.sidebar.file_uploader('', type=['pdf', 'docx', 'txt']) #Upload PDF File
10
+ # st.session_state['file'] = uploaded_file
11
+
12
  with open('style.css') as f:
13
  st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
14
  footer = """
 
40
  """
41
  st.markdown(intro, unsafe_allow_html=True)
42
  st.image("appStore/img/pic1.png", caption="NDC Coherence")
43
+ #st.subheader("Methodology")
44
  #st.write("Each sentence in the generated answer ends with a coloured tooltip; the colour ranges from red to green. "
45
  # "The tooltip contains a value representing answer sentence similarity to a specific sentence in the "
46
  # "Wikipedia context passages retrieved. Mouseover on the tooltip will show the sentence from the "
appStore/keyword_search.py CHANGED
@@ -1,10 +1,12 @@
1
  # set path
2
- import glob, os, sys; sys.path.append('../udfPreprocess')
 
 
3
 
4
  #import helper
5
  import udfPreprocess.docPreprocessing as pre
6
  import udfPreprocess.cleaning as clean
7
-
8
  #import needed libraries
9
  import seaborn as sns
10
  from pandas import DataFrame
@@ -24,20 +26,24 @@ import docx
24
  from docx.shared import Inches
25
  from docx.shared import Pt
26
  from docx.enum.style import WD_STYLE_TYPE
27
-
 
28
  import tempfile
29
  import sqlite3
 
 
 
30
 
31
  def app():
32
 
33
  with st.container():
34
  st.markdown("<h1 style='text-align: center; \
35
- color: black;'> Keyword Search</h1>",
36
  unsafe_allow_html=True)
37
  st.write(' ')
38
  st.write(' ')
39
 
40
- with st.expander("ℹ️ - About this app", expanded=True):
41
 
42
  st.write(
43
  """
@@ -45,498 +51,116 @@ def app():
45
  built in Streamlit for doing keyword search in \
46
  policy document - developed by GIZ Data and the \
47
  Sustainable Development Solution Network.
48
- """
49
- )
50
 
51
  st.markdown("")
52
-
53
- st.markdown("")
54
- st.markdown("### 📌 Step One: Upload document ### ")
55
-
56
- with st.container():
57
- def bm25_tokenizer(text):
58
- tokenized_doc = []
59
- for token in text.lower().split():
60
- token = token.strip(string.punctuation)
61
-
62
- if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
63
- tokenized_doc.append(token)
64
- return tokenized_doc
65
-
66
- def bm25TokenizeDoc(paraList):
67
- tokenized_corpus = []
68
- for passage in tqdm(paraList):
69
- if len(passage.split()) >256:
70
- temp = " ".join(passage.split()[:256])
71
- tokenized_corpus.append(bm25_tokenizer(temp))
72
- temp = " ".join(passage.split()[256:])
73
- tokenized_corpus.append(bm25_tokenizer(temp))
74
- else:
75
- tokenized_corpus.append(bm25_tokenizer(passage))
76
-
77
- return tokenized_corpus
78
- def search(keyword):
79
- ##### BM25 search (lexical search) #####
80
- bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
81
- top_n = np.argpartition(bm25_scores, -10)[-10:]
82
- bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
83
- bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
84
-
85
- ##### Sematic Search #####
86
- # Encode the query using the bi-encoder and find potentially relevant passages
87
- #query = "Does document contain {} issues ?".format(keyword)
88
- question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
89
-
90
- hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
91
- hits = hits[0] # Get the hits for the first query
92
-
93
-
94
- ##### Re-Ranking #####
95
- # Now, score all retrieved passages with the cross_encoder
96
- #cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
97
- #cross_scores = cross_encoder.predict(cross_inp)
98
-
99
- # Sort results by the cross-encoder scores
100
- #for idx in range(len(cross_scores)):
101
- # hits[idx]['cross-score'] = cross_scores[idx]
102
 
103
-
104
- return bm25_hits, hits
105
-
106
- def show_results(keywordList):
107
- document = docx.Document()
108
- document.add_heading('Document name:{}'.format(file_name), 2)
109
- section = document.sections[0]
110
-
111
- # Calling the footer
112
- footer = section.footer
113
-
114
- # Calling the paragraph already present in
115
- # the footer section
116
- footer_para = footer.paragraphs[0]
117
 
118
- font_styles = document.styles
119
- font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
120
- font_object = font_charstyle.font
121
- font_object.size = Pt(7)
122
- # Adding the centered zoned footer
123
- footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
124
- document.add_heading('Your Seacrhed for {}'.format(keywordList), level=1)
125
- for keyword in keywordList:
126
-
127
- st.write("Results for Query: {}".format(keyword))
128
- para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
129
- para.font.size = Pt(12)
130
- bm25_hits, hits = search(keyword)
131
-
132
- st.markdown("""
133
- We will provide with 2 kind of results. The 'lexical search' and the semantic search.
134
- """)
135
- # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
136
- st.markdown("Top few lexical search (BM25) hits")
137
- document.add_paragraph("Top few lexical search (BM25) hits")
138
-
139
- for hit in bm25_hits[0:5]:
140
- if hit['score'] > 0.00:
141
- st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
142
- document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
143
-
144
-
145
-
146
- # st.table(bm25_hits[0:3])
147
-
148
- st.markdown("\n-------------------------\n")
149
- st.markdown("Top few Bi-Encoder Retrieval hits")
150
- document.add_paragraph("\n-------------------------\n")
151
- document.add_paragraph("Top few Bi-Encoder Retrieval hits")
152
-
153
- hits = sorted(hits, key=lambda x: x['score'], reverse=True)
154
- for hit in hits[0:5]:
155
- # if hit['score'] > 0.45:
156
- st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
157
- document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
158
- #st.table(hits[0:3]
159
- document.save('demo.docx')
160
- with open("demo.docx", "rb") as file:
161
- btn = st.download_button(
162
- label="Download file",
163
- data=file,
164
- file_name="demo.docx",
165
- mime="txt/docx"
166
- )
167
-
168
-
169
- @st.cache(allow_output_mutation=True)
170
- def load_sentenceTransformer(name):
171
- return SentenceTransformer(name)
172
-
173
-
174
-
175
- docs = None
176
- # asking user for either upload or select existing doc
177
- choice = st.radio(label = 'Select the Document',
178
- help = 'You can upload the document \
179
- or else you can try a example document',
180
- options = ('Upload Document', 'Try Example'),
181
- horizontal = True)
182
-
183
- if choice == 'Upload Document':
184
- uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
185
- if uploaded_file is not None:
186
- with tempfile.NamedTemporaryFile(mode="wb") as temp:
187
- bytes_data = uploaded_file.getvalue()
188
- temp.write(bytes_data)
189
-
190
- st.write("Uploaded Filename: ", uploaded_file.name)
191
- file_name = uploaded_file.name
192
- file_path = temp.name
193
- docs = pre.load_document(file_path, file_name)
194
- haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
195
-
196
- else:
197
- # listing the options
198
- option = st.selectbox('Select the example document',
199
- ('South Africa:Low Emission strategy',
200
- 'Ethiopia: 10 Year Development Plan'))
201
- if option is 'South Africa:Low Emission strategy':
202
- file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
203
- st.write("Selected document:", file_name.split('/')[1])
204
- # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
205
- # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
206
  else:
207
- # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
208
- file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
209
- st.write("Selected document:", file_name.split('/')[1])
210
 
211
- if option is not None:
212
- docs = pre.load_document(file_path,file_name)
213
- haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
214
-
215
- if docs is not None:
216
-
217
- bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5') # multi-qa-MiniLM-L6-cos-v1
218
- bi_encoder.max_seq_length = 64 #Truncate long passages to 256 tokens
219
- top_k = 32
220
-
221
- document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
222
- tokenized_corpus = bm25TokenizeDoc(paraList)
223
- document_bm25 = BM25Okapi(tokenized_corpus)
224
- keywordList = None
225
-
226
- col1, col2 = st.columns(2)
227
- with col1:
228
- if st.button('Climate Change Keyword Search'):
229
- keywordList = ['extreme weather', 'floods', 'droughts']
230
-
231
- # show_results(keywordList)
232
- with col2:
233
- if st.button('Gender Keywords Search'):
234
- keywordList = ['Gender', 'Women empowernment']
235
-
236
- # show_results(keywordList)
237
-
238
- keyword = st.text_input("Please enter here \
239
- what you want to search, \
240
- we will look for similar context \
241
- in the document.",
242
- value="",)
243
- if st.button("Find them."):
244
- keywordList = [keyword]
245
- if keywordList is not None:
246
 
247
- show_results(keywordList)
248
-
249
-
250
-
251
-
252
- # @st.cache(allow_output_mutation=True)
253
- # def load_sentenceTransformer(name):
254
- # return SentenceTransformer(name)
255
-
256
- # bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5') # multi-qa-MiniLM-L6-cos-v1
257
- # bi_encoder.max_seq_length = 64 #Truncate long passages to 256 tokens
258
- # top_k = 32
259
-
260
- # #@st.cache(allow_output_mutation=True)
261
- # #def load_crossEncoder(name):
262
- # # return CrossEncoder(name)
263
 
264
- # # cross_encoder = load_crossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
265
- # document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
266
-
267
- # def bm25_tokenizer(text):
268
- # tokenized_doc = []
269
- # for token in text.lower().split():
270
- # token = token.strip(string.punctuation)
271
-
272
- # if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
273
- # tokenized_doc.append(token)
274
- # return tokenized_doc
275
-
276
- # def bm25TokenizeDoc(paraList):
277
- # tokenized_corpus = []
278
- # for passage in tqdm(paraList):
279
- # if len(passage.split()) >256:
280
- # temp = " ".join(passage.split()[:256])
281
- # tokenized_corpus.append(bm25_tokenizer(temp))
282
- # temp = " ".join(passage.split()[256:])
283
- # tokenized_corpus.append(bm25_tokenizer(temp))
284
- # else:
285
- # tokenized_corpus.append(bm25_tokenizer(passage))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
- # return tokenized_corpus
288
-
289
- # tokenized_corpus = bm25TokenizeDoc(paraList)
290
-
291
-
292
- # document_bm25 = BM25Okapi(tokenized_corpus)
293
-
294
- # # def search(keyword):
295
- # # ##### BM25 search (lexical search) #####
296
- # # bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
297
- # top_n = np.argpartition(bm25_scores, -10)[-10:]
298
- # bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
299
- # bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
300
-
301
- # ##### Sematic Search #####
302
- # # Encode the query using the bi-encoder and find potentially relevant passages
303
- # #query = "Does document contain {} issues ?".format(keyword)
304
- # question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
305
-
306
- # hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
307
- # hits = hits[0] # Get the hits for the first query
308
-
309
-
310
- # ##### Re-Ranking #####
311
- # # Now, score all retrieved passages with the cross_encoder
312
- # #cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
313
- # #cross_scores = cross_encoder.predict(cross_inp)
314
-
315
- # # Sort results by the cross-encoder scores
316
- # #for idx in range(len(cross_scores)):
317
- # # hits[idx]['cross-score'] = cross_scores[idx]
318
-
319
-
320
- # return bm25_hits, hits
321
-
322
- # def show_results(keywordList):
323
- # for keyword in keywordList:
324
- # bm25_hits, hits = search(keyword)
325
-
326
- # st.markdown("""
327
- # We will provide with 2 kind of results. The 'lexical search' and the semantic search.
328
- # """)
329
- # # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
330
- # st.markdown("Top few lexical search (BM25) hits")
331
- # for hit in bm25_hits[0:5]:
332
- # if hit['score'] > 0.00:
333
- # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
334
-
335
-
336
-
337
-
338
-
339
- # # st.table(bm25_hits[0:3])
340
-
341
- # st.markdown("\n-------------------------\n")
342
- # st.markdown("Top few Bi-Encoder Retrieval hits")
343
-
344
- # hits = sorted(hits, key=lambda x: x['score'], reverse=True)
345
- # for hit in hits[0:5]:
346
- # # if hit['score'] > 0.45:
347
- # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
348
- # #st.table(hits[0:3]
349
-
350
-
351
- # # if docs is not None:
352
- # # col1, col2 = st.columns(2)
353
- # # with col1:
354
- # # if st.button('Gender Keywords Search'):
355
- # # keywordList = ['Gender Equality', 'Women empowernment']
356
- # # show_results(keywordList)
357
- # # with col2:
358
- # # if st.button('Climate Change Keyword Search'):
359
- # # keywordList = ['extreme weather', 'floods', 'droughts']
360
- # # show_results(keywordList)
361
-
362
- # # keyword = st.text_input("Please enter here \
363
- # # what you want to search, \
364
- # # we will look for similar context \
365
- # # in the document.",
366
- # # value="",)
367
- # # if st.button("Find them."):
368
- # # show_results([keyword])
369
-
370
-
371
- # choice1 = st.radio(label = 'Keyword Search',
372
- # help = 'Search \
373
- # or else you can try a example document',
374
- # options = ('Enter your own Query', 'Try Example'),
375
- # horizontal = True)
376
-
377
- # if choice1 == 'Enter your own Query':
378
- # keyword = st.text_input("Please enter here \
379
- # what you want to search, \
380
- # we will look for similar context \
381
- # in the document.",
382
- # value="",)
383
- # else:
384
- # option1 = st.selectbox('Select the Predefined word cluster',
385
- # ('Gender:[Gender Equality, Women empowernment]',
386
- # 'Climate change:[extreme weather, floods, droughts]',
387
- # ))
388
- # if option1 == 'Gender:[Gender Equality, Women empowernment]':
389
- # keywordList = ['Gender Equality', 'Women empowernment']
390
- # else:
391
- # keywordList = ['extreme weather', 'floods', 'droughts']
392
-
393
- # option1 = st.selectbox('Select the Predefined word cluster',
394
- # ('Gender:[Gender Equality, Women empowernment]',
395
- # 'Climate change:[extreme weather, floods, droughts]',
396
- # # 'Enter your Own Keyword Query'))
397
- # if option1 == 'Enter your Own Keyword Query':
398
- # keyword = st.text_input("Please enter here \
399
- # what you want to search, \
400
- # we will look for similar context \
401
- # in the document.",
402
- # value="",)
403
- # elif option1 == 'Gender:[Gender Equality, Women empowernment]':
404
- # keywordList = ['Gender Equality', 'Women empowernment']
405
- # elif option1 == 'Climate change:[extreme weather, floods, droughts]':
406
- # keywordList = ['extreme weather', 'floods', 'droughts']
407
-
408
-
409
- # st.markdown("### 📌 Step Two: Search Keyword in Document ### ")
410
-
411
-
412
- # @st.cache(allow_output_mutation=True)
413
- # def load_sentenceTransformer(name):
414
- # return SentenceTransformer(name)
415
-
416
- # bi_encoder = load_sentenceTransformer('msmarco-distilbert-cos-v5') # multi-qa-MiniLM-L6-cos-v1
417
- # bi_encoder.max_seq_length = 64 #Truncate long passages to 256 tokens
418
- # top_k = 32
419
-
420
- # #@st.cache(allow_output_mutation=True)
421
- # #def load_crossEncoder(name):
422
- # # return CrossEncoder(name)
423
-
424
- # # cross_encoder = load_crossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
425
- # document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
426
-
427
- # def bm25_tokenizer(text):
428
- # tokenized_doc = []
429
- # for token in text.lower().split():
430
- # token = token.strip(string.punctuation)
431
-
432
- # if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
433
- # tokenized_doc.append(token)
434
- # return tokenized_doc
435
-
436
- # def bm25TokenizeDoc(paraList):
437
- # tokenized_corpus = []
438
- # for passage in tqdm(paraList):
439
- # if len(passage.split()) >256:
440
- # temp = " ".join(passage.split()[:256])
441
- # tokenized_corpus.append(bm25_tokenizer(temp))
442
- # temp = " ".join(passage.split()[256:])
443
- # tokenized_corpus.append(bm25_tokenizer(temp))
444
- # else:
445
- # tokenized_corpus.append(bm25_tokenizer(passage))
446
-
447
- # return tokenized_corpus
448
-
449
- # tokenized_corpus = bm25TokenizeDoc(paraList)
450
-
451
-
452
- # document_bm25 = BM25Okapi(tokenized_corpus)
453
-
454
-
455
- # def search(keyword):
456
- # ##### BM25 search (lexical search) #####
457
- # bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
458
- # top_n = np.argpartition(bm25_scores, -10)[-10:]
459
- # bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
460
- # bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
461
 
462
- # ##### Sematic Search #####
463
- # # Encode the query using the bi-encoder and find potentially relevant passages
464
- # #query = "Does document contain {} issues ?".format(keyword)
465
- # question_embedding = bi_encoder.encode(keyword, convert_to_tensor=True)
466
-
467
- # hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
468
- # hits = hits[0] # Get the hits for the first query
469
-
470
-
471
- # ##### Re-Ranking #####
472
- # # Now, score all retrieved passages with the cross_encoder
473
- # #cross_inp = [[query, paraList[hit['corpus_id']]] for hit in hits]
474
- # #cross_scores = cross_encoder.predict(cross_inp)
475
-
476
- # # Sort results by the cross-encoder scores
477
- # #for idx in range(len(cross_scores)):
478
- # # hits[idx]['cross-score'] = cross_scores[idx]
479
-
480
-
481
- # return bm25_hits, hits
482
-
483
- # def show_results(keywordList):
484
- # for keyword in keywordList:
485
- # bm25_hits, hits = search(keyword)
486
-
487
- # st.markdown("""
488
- # We will provide with 2 kind of results. The 'lexical search' and the semantic search.
489
- # """)
490
- # # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
491
- # st.markdown("Top few lexical search (BM25) hits")
492
- # for hit in bm25_hits[0:5]:
493
- # if hit['score'] > 0.00:
494
- # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
495
-
496
-
497
-
498
-
499
-
500
- # # st.table(bm25_hits[0:3])
501
-
502
- # st.markdown("\n-------------------------\n")
503
- # st.markdown("Top few Bi-Encoder Retrieval hits")
504
-
505
- # hits = sorted(hits, key=lambda x: x['score'], reverse=True)
506
- # for hit in hits[0:5]:
507
- # # if hit['score'] > 0.45:
508
- # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
509
- # #st.table(hits[0:3]
510
-
511
-
512
-
513
-
514
- # # if st.button("Find them."):
515
- # # bm25_hits, hits = search(keyword)
516
-
517
- # # st.markdown("""
518
- # # We will provide with 2 kind of results. The 'lexical search' and the semantic search.
519
- # # """)
520
- # # # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
521
- # # st.markdown("Top few lexical search (BM25) hits")
522
- # # for hit in bm25_hits[0:5]:
523
- # # if hit['score'] > 0.00:
524
- # # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
525
-
526
-
527
-
528
-
529
-
530
- # # # st.table(bm25_hits[0:3])
531
-
532
- # # st.markdown("\n-------------------------\n")
533
- # # st.markdown("Top few Bi-Encoder Retrieval hits")
534
-
535
- # # hits = sorted(hits, key=lambda x: x['score'], reverse=True)
536
- # # for hit in hits[0:5]:
537
- # # # if hit['score'] > 0.45:
538
- # # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
539
- # # #st.table(hits[0:3]
540
-
541
-
542
-
 
1
  # set path
2
+ import glob, os, sys
3
+ from udfPreprocess.search import semantic_search
4
+ sys.path.append('../udfPreprocess')
5
 
6
  #import helper
7
  import udfPreprocess.docPreprocessing as pre
8
  import udfPreprocess.cleaning as clean
9
+ from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search
10
  #import needed libraries
11
  import seaborn as sns
12
  from pandas import DataFrame
 
26
  from docx.shared import Inches
27
  from docx.shared import Pt
28
  from docx.enum.style import WD_STYLE_TYPE
29
+ import logging
30
+ logger = logging.getLogger(__name__)
31
  import tempfile
32
  import sqlite3
33
+ import json
34
+ import configparser
35
+
36
 
37
  def app():
38
 
39
  with st.container():
40
  st.markdown("<h1 style='text-align: center; \
41
+ color: black;'> Search</h1>",
42
  unsafe_allow_html=True)
43
  st.write(' ')
44
  st.write(' ')
45
 
46
+ with st.expander("ℹ️ - About this app", expanded=False):
47
 
48
  st.write(
49
  """
 
51
  built in Streamlit for doing keyword search in \
52
  policy document - developed by GIZ Data and the \
53
  Sustainable Development Solution Network.
54
+ """)
 
55
 
56
  st.markdown("")
57
+
58
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
+ with st.sidebar:
61
+ with open('sample/keywordexample.json','r') as json_file:
62
+ keywordexample = json.load(json_file)
 
 
 
 
 
 
 
 
 
 
 
63
 
64
+ genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
65
+ if genre == 'Food':
66
+ keywordList = keywordexample['Food']
67
+ elif genre == 'Climate':
68
+ keywordList = keywordexample['Climate']
69
+ elif genre == 'Social':
70
+ keywordList = keywordexample['Social']
71
+ elif genre == 'Nature':
72
+ keywordList = keywordexample['Nature']
73
+ elif genre == 'Implementation':
74
+ keywordList = keywordexample['Implementation']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  else:
76
+ keywordList = None
 
 
77
 
78
+ searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
+ with st.container():
82
+ if keywordList is not None:
83
+ queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
84
+ value="{}".format(keywordList))
85
+ else:
86
+ queryList = st.text_input("Please enter here your question and we will look \
87
+ for an answer in the document OR enter the keyword you \
88
+ are looking for and we will \
89
+ we will look for similar context \
90
+ in the document.",
91
+ placeholder="Enter keyword here")
92
+
93
+ if st.button("Find them"):
94
+
95
+ if queryList == "":
96
+ st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
97
+ logging.warning("Terminated as no keyword provided")
98
+ else:
99
+
100
+ if 'docs' in st.session_state:
101
+ docs = st.session_state['docs']
102
+ paraList = st.session_state['paraList']
103
+
104
+ if searchtype == 'Exact Matches':
105
+ queryList = list(queryList.split(","))
106
+ logging.info("performing lexical search")
107
+ tokenized_corpus = bm25TokenizeDoc(paraList)
108
+ # st.write(len(tokenized_corpus))
109
+ document_bm25 = BM25Okapi(tokenized_corpus)
110
+
111
+ with st.spinner("Performing Exact matching search (Lexical search) for you"):
112
+ st.markdown("##### Top few lexical search (BM25) hits #####")
113
+
114
+ for keyword in queryList:
115
+
116
+ bm25_hits = lexical_search(keyword,document_bm25)
117
+
118
+
119
+ counter = 0
120
+ for hit in bm25_hits:
121
+ if hit['score'] > 0.00:
122
+ counter += 1
123
+ if counter == 1:
124
+ st.markdown("###### Results for keyword: **{}** ######".format(keyword))
125
+ # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
126
+ st.write("\t {}: {}\t".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
127
+
128
+ st.markdown("---")
129
+ if counter == 0:
130
+ st.write("No results found for '**{}**' ".format(keyword))
131
+ else:
132
+ logging.info("starting semantic search")
133
+ with st.spinner("Performing Similar/Contextual search"):
134
+ query = "Find {} related issues ?".format(queryList)
135
+ config = configparser.ConfigParser()
136
+ config.read_file(open('udfPreprocess/paramconfig.cfg'))
137
+ threshold = float(config.get('semantic_search','THRESHOLD'))
138
+ st.write(query)
139
+ semantic_hits = semantic_search(query,paraList)
140
+ st.markdown("##### Semantic search hits for {} related topics #####".format(queryList))
141
+
142
+ for i,queryhit in enumerate(semantic_hits):
143
+
144
+ # st.markdown("###### Results for query: **{}** ######".format(queryList[i]))
145
+ counter = 0
146
+ for hit in queryhit:
147
+ counter += 1
148
+
149
+
150
+ if hit['score'] > threshold:
151
+ # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
152
+ st.write("\t {}: \t {}".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
153
+
154
+ # document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
155
+ st.markdown("---")
156
+ # st.write(semantic_hits)
157
+
158
+
159
+
160
+
161
+ else:
162
+ st.info("🤔 No document found, please try to upload it at the sidebar!")
163
+ logging.warning("Terminated as no keyword provided")
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
appStore/multiapp.py CHANGED
@@ -2,6 +2,8 @@
2
  """
3
  import streamlit as st
4
  from PIL import Image
 
 
5
 
6
  class MultiApp:
7
  """Framework for combining multiple streamlit applications.
@@ -25,7 +27,7 @@ class MultiApp:
25
  def __init__(self):
26
  self.apps = []
27
 
28
- def add_app(self, title, func):
29
  """Adds a new application.
30
  Parameters
31
  ----------
@@ -36,16 +38,39 @@ class MultiApp:
36
  """
37
  self.apps.append({
38
  "title": title,
 
39
  "function": func
40
  })
41
 
42
  def run(self):
 
43
  st.sidebar.write(format_func=lambda app: app['title'])
44
- image = Image.open('appStore/img/giz_sdsn_small.jpg')
45
  st.sidebar.image(image)
46
- app = st.sidebar.radio(
47
- 'Pages',
48
- self.apps,
49
- format_func=lambda app: app['title'])
50
-
51
- app['function']()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  """
3
  import streamlit as st
4
  from PIL import Image
5
+ from streamlit_option_menu import option_menu
6
+ from udfPreprocess.uploadAndExample import add_upload
7
 
8
  class MultiApp:
9
  """Framework for combining multiple streamlit applications.
 
27
  def __init__(self):
28
  self.apps = []
29
 
30
+ def add_app(self,title,icon, func):
31
  """Adds a new application.
32
  Parameters
33
  ----------
 
38
  """
39
  self.apps.append({
40
  "title": title,
41
+ "icon": icon,
42
  "function": func
43
  })
44
 
45
  def run(self):
46
+
47
  st.sidebar.write(format_func=lambda app: app['title'])
48
+ image = Image.open('appStore/img/giz_sdsn.jpg')
49
  st.sidebar.image(image)
50
+ #st.sidebar.markdown("## 📌 Pages ")
51
+ #app = st.sidebar.radio(
52
+ # 'Pages',
53
+ # self.apps,
54
+ # from streamlit_option_menu import option_menu
55
+ with st.sidebar:
56
+ selected = option_menu(None, [page["title"] for page in self.apps],
57
+ icons=[page["icon"] for page in self.apps],
58
+ menu_icon="cast", default_index=0)
59
+
60
+
61
+ for index, item in enumerate(self.apps):
62
+ if item["title"] == selected:
63
+ self.apps[index]["function"]()
64
+ break
65
+
66
+ # app['function']()
67
+ choice = st.sidebar.radio(label = 'Select the Document',
68
+ help = 'You can upload the document \
69
+ or else you can try a example document',
70
+ options = ('Upload Document', 'Try Example'),
71
+ horizontal = True)
72
+ add_upload(choice)
73
+ # st.sidebar.markdown('')
74
+ # st.sidebar.markdown(" :cloud: Upload document ")
75
+ # uploaded_file = st.sidebar.file_uploader('', type=['pdf', 'docx', 'txt']) #Upload PDF File
76
+ # st.session_state['file'] = uploaded_file
appStore/sdg_analysis.py CHANGED
@@ -1,5 +1,6 @@
1
  # set path
2
- import glob, os, sys; sys.path.append('../udfPreprocess')
 
3
 
4
  #import helper
5
  import udfPreprocess.docPreprocessing as pre
@@ -17,10 +18,26 @@ import pandas as pd
17
  import docx
18
  from docx.shared import Inches
19
  from docx.shared import Pt
20
- from docx.enum.style import WD_STYLE_TYPE
 
21
 
22
  import tempfile
23
  import sqlite3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def app():
26
 
@@ -29,154 +46,38 @@ def app():
29
  st.write(' ')
30
  st.write(' ')
31
 
32
- with st.expander("ℹ️ - About this app", expanded=True):
33
 
34
  st.write(
35
  """
36
- The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analyzing policy documents - developed by GIZ Data and the Sustainable Development Solution Network. \n
37
- 1. Keyword heatmap \n
38
- 2. SDG Classification for the paragraphs/texts in the document
39
- """
40
- )
41
-
42
  st.markdown("")
43
 
44
- st.markdown("")
45
- st.markdown("## 📌 Step One: Upload document ")
46
-
47
- with st.container():
48
-
49
 
50
- docs = None
51
- # asking user for either upload or select existing doc
52
- choice = st.radio(label = 'Select the Document',
53
- help = 'You can upload the document \
54
- or else you can try a example document',
55
- options = ('Upload Document', 'Try Example'),
56
- horizontal = True)
57
-
58
- if choice == 'Upload Document':
59
- uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
60
- if uploaded_file is not None:
61
- with tempfile.NamedTemporaryFile(mode="wb") as temp:
62
- bytes_data = uploaded_file.getvalue()
63
- temp.write(bytes_data)
64
-
65
- st.write("Uploaded Filename: ", uploaded_file.name)
66
- file_name = uploaded_file.name
67
- file_path = temp.name
68
- docs = pre.load_document(file_path, file_name)
69
- docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
70
- #haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
71
-
72
- else:
73
- # listing the options
74
- option = st.selectbox('Select the example document',
75
- ('Ethiopia: 10 Year Development Plan',
76
- 'South Africa:Low Emission strategy'))
77
- if option is 'South Africa:Low Emission strategy':
78
- file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
79
- st.write("Selected document:", file_name.split('/')[1])
80
- # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
81
- # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
82
- else:
83
- # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
84
- file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
85
- st.write("Selected document:", file_name.split('/')[1])
86
-
87
- if option is not None:
88
- docs = pre.load_document(file_path,file_name)
89
- # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
90
- docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
91
-
92
-
93
 
94
- if docs is not None:
95
 
96
- @st.cache(allow_output_mutation=True)
97
- def load_keyBert():
98
- return KeyBERT()
 
 
99
 
100
- kw_model = load_keyBert()
101
 
102
- keywords = kw_model.extract_keywords(
103
- all_text,
104
- keyphrase_ngram_range=(1, 3),
105
- use_mmr=True,
106
- stop_words="english",
107
- top_n=10,
108
- diversity=0.7,
109
- )
110
 
111
- st.markdown("## 🎈 What is my document about?")
112
-
113
- df = (
114
- DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
115
- .sort_values(by="Relevancy", ascending=False)
116
- .reset_index(drop=True)
117
- )
118
- df1 = (
119
- DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
120
- .sort_values(by="Relevancy", ascending=False)
121
- .reset_index(drop=True)
122
- )
123
- df.index += 1
124
 
125
- # Add styling
126
- cmGreen = sns.light_palette("green", as_cmap=True)
127
- cmRed = sns.light_palette("red", as_cmap=True)
128
- df = df.style.background_gradient(
129
- cmap=cmGreen,
130
- subset=[
131
- "Relevancy",
132
- ],
133
- )
134
- c1, c2, c3 = st.columns([1, 3, 1])
135
-
136
- format_dictionary = {
137
- "Relevancy": "{:.1%}",
138
- }
139
-
140
- df = df.format(format_dictionary)
141
-
142
- with c2:
143
- st.table(df)
144
-
145
- ######## SDG classiciation
146
- # @st.cache(allow_output_mutation=True)
147
- # def load_sdgClassifier():
148
- # classifier = pipeline("text-classification", model= "../models/osdg_sdg/")
149
-
150
- # return classifier
151
-
152
- # load from disc (github repo) for performance boost
153
- @st.cache(allow_output_mutation=True)
154
- def load_sdgClassifier():
155
- classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
156
-
157
- return classifier
158
-
159
- classifier = load_sdgClassifier()
160
-
161
- # # not needed, par list comes from pre_processing function already
162
-
163
- # word_list = all_text.split()
164
- # len_word_list = len(word_list)
165
- # par_list = []
166
- # par_len = 130
167
- # for i in range(0,len_word_list // par_len):
168
- # string_part = ' '.join(word_list[i*par_len:(i+1)*par_len])
169
- # par_list.append(string_part)
170
-
171
- labels = classifier(par_list)
172
- labels_= [(l['label'],l['score']) for l in labels]
173
- df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
174
- df2['text'] = par_list
175
- df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
176
- df2.index += 1
177
- df2 =df2[df2['Relevancy']>.85]
178
- x = df2['SDG'].value_counts()
179
- df3 = df2.copy()
180
 
181
  plt.rcParams['font.size'] = 25
182
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
@@ -184,110 +85,92 @@ def app():
184
  fig, ax = plt.subplots()
185
  ax.pie(x, colors=colors, radius=2, center=(4, 4),
186
  wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False,labels =list(x.index))
187
- fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
188
- st.markdown("## 🎈 Anything related to SDGs?")
 
 
189
 
190
  c4, c5, c6 = st.columns([2, 2, 2])
191
 
192
  # Add styling
193
  cmGreen = sns.light_palette("green", as_cmap=True)
194
  cmRed = sns.light_palette("red", as_cmap=True)
195
- df2 = df2.style.background_gradient(
196
- cmap=cmGreen,
197
- subset=[
198
- "Relevancy",
199
- ],
200
- )
201
 
202
- format_dictionary = {
203
- "Relevancy": "{:.1%}",
204
- }
205
 
206
- df2 = df2.format(format_dictionary)
207
 
208
  with c5:
209
  st.pyplot(fig)
210
 
211
  c7, c8, c9 = st.columns([1, 10, 1])
212
  with c8:
213
- st.table(df2)
214
-
215
- document = docx.Document()
216
- document.add_heading('Document name:{}'.format(file_name), 2)
217
- # Choosing the top most section of the page
218
- section = document.sections[0]
219
-
220
- # Calling the footer
221
- footer = section.footer
222
-
223
- # Calling the paragraph already present in
224
- # the footer section
225
- footer_para = footer.paragraphs[0]
226
-
227
- font_styles = document.styles
228
- font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
229
- font_object = font_charstyle.font
230
- font_object.size = Pt(7)
231
- # Adding the centered zoned footer
232
- footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
233
-
234
- #footer_para.text = "\tPowered by GIZ Data and the Sustainable Development Solution Network\
235
- # hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev"
236
- #footer_para.font.size = docx.shared.Pt(6)
237
-
238
- document.add_heading('What is the document about', level=1)
239
- t = document.add_table(df1.shape[0]+1, df1.shape[1])
240
-
241
-
242
- # add the header rows.
243
- for j in range(df1.shape[-1]):
244
- t.cell(0,j).text = df1.columns[j]
245
-
246
 
247
- # add the rest of the data frame
248
- for i in range(df1.shape[0]):
249
- for j in range(df1.shape[-1]):
250
- t.cell(i+1,j).text = str(df1.values[i,j])
251
-
252
-
253
-
254
- document.add_heading('Anything Related to SDG', level=1)
255
- document.add_picture('temp.png', width=Inches(3), height=Inches(3))
256
- t = document.add_table(df3.shape[0]+1, df3.shape[1])
257
-
258
- widths = [Inches(0.4), Inches(0.4), Inches(4.5)]
259
- # add the header rows.
260
- for j in range(df3.shape[-1]):
261
- t.cell(0,j).text = df3.columns[j]
262
- t.cell(0,j).width = widths[j]
263
-
264
- # add the rest of the data frame
265
- for i in range(df3.shape[0]):
266
- for j in range(df3.shape[-1]):
267
- t.cell(i+1,j).width = widths[j]
268
- t.cell(i+1,j).text = str(df3.values[i,j])
269
-
270
-
271
- document.save('demo.docx')
272
-
273
- #with open('summary.txt', 'w') as f:
274
- # f.write(df1.to_string())
275
- # f.write(fig)
276
- #f.write(df2)
277
- # f.write(df3.to_string())
278
-
279
- with open("demo.docx", "rb") as file:
280
- btn = st.download_button(
281
- label="Download file",
282
- data=file,
283
- file_name="demo.docx",
284
- mime="txt/docx"
285
- )
286
- #with document st.download_button(
287
- # label="Download data as docx",
288
- # data=document,
289
- #file_name='test.docx',
290
- #mime='text/docx',
291
- # )
292
-
293
-
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../udfPreprocess')
4
 
5
  #import helper
6
  import udfPreprocess.docPreprocessing as pre
 
18
  import docx
19
  from docx.shared import Inches
20
  from docx.shared import Pt
21
+ from docx.enum.style import WD_STYLE_TYPE
22
+ from udfPreprocess.sdg import sdg_classification
23
 
24
  import tempfile
25
  import sqlite3
26
+ import logging
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+
31
+ @st.cache(allow_output_mutation=True)
32
+ def load_keyBert():
33
+ return KeyBERT()
34
+
35
+ @st.cache(allow_output_mutation=True)
36
+ def load_sdgClassifier():
37
+ classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
38
+ return classifier
39
+
40
+
41
 
42
  def app():
43
 
 
46
  st.write(' ')
47
  st.write(' ')
48
 
49
+ with st.expander("ℹ️ - About this app", expanded=False):
50
 
51
  st.write(
52
  """
53
+ The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analyzing policy documents with respect to SDG Classification for the paragraphs/texts in the document - developed by GIZ Data and the Sustainable Development Solution Network. \n
54
+ """)
 
 
 
 
55
  st.markdown("")
56
 
 
 
 
 
 
57
 
58
+ with st.container():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
 
60
 
61
+
62
+ if 'docs' in st.session_state:
63
+ docs = st.session_state['docs']
64
+ docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
65
+ with st.spinner("Running SDG"):
66
 
67
+ df, x = sdg_classification(par_list)
68
 
 
 
 
 
 
 
 
 
69
 
70
+ # classifier = load_sdgClassifier()
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
+ # labels = classifier(par_list)
73
+ # labels_= [(l['label'],l['score']) for l in labels]
74
+ # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
75
+ # df2['text'] = par_list
76
+ # df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
77
+ # df2.index += 1
78
+ # df2 =df2[df2['Relevancy']>.85]
79
+ # x = df2['SDG'].value_counts()
80
+ # df3 = df2.copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  plt.rcParams['font.size'] = 25
83
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
 
85
  fig, ax = plt.subplots()
86
  ax.pie(x, colors=colors, radius=2, center=(4, 4),
87
  wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False,labels =list(x.index))
88
+ # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
89
+ st.markdown("#### Anything related to SDGs? ####")
90
+
91
+ # st.markdown("#### 🎈 Anything related to SDGs? ####")
92
 
93
  c4, c5, c6 = st.columns([2, 2, 2])
94
 
95
  # Add styling
96
  cmGreen = sns.light_palette("green", as_cmap=True)
97
  cmRed = sns.light_palette("red", as_cmap=True)
98
+ # df2 = df2.style.background_gradient(
99
+ # cmap=cmGreen,
100
+ # subset=[
101
+ # "Relevancy",
102
+ # ],
103
+ # )
104
 
105
+ # format_dictionary = {
106
+ # "Relevancy": "{:.1%}",
107
+ # }
108
 
109
+ # df2 = df2.format(format_dictionary)
110
 
111
  with c5:
112
  st.pyplot(fig)
113
 
114
  c7, c8, c9 = st.columns([1, 10, 1])
115
  with c8:
116
+ st.table(df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
+
119
+ # 1. Keyword heatmap \n
120
+ # 2. SDG Classification for the paragraphs/texts in the document
121
+ #
122
+
123
+ # with st.container():
124
+ # if 'docs' in st.session_state:
125
+ # docs = st.session_state['docs']
126
+ # docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
127
+ # # paraList = st.session_state['paraList']
128
+ # logging.info("keybert")
129
+ # with st.spinner("Running Key bert"):
130
+
131
+ # kw_model = load_keyBert()
132
+
133
+ # keywords = kw_model.extract_keywords(
134
+ # all_text,
135
+ # keyphrase_ngram_range=(1, 3),
136
+ # use_mmr=True,
137
+ # stop_words="english",
138
+ # top_n=10,
139
+ # diversity=0.7,
140
+ # )
141
+
142
+ # st.markdown("## 🎈 What is my document about?")
143
+
144
+ # df = (
145
+ # DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
146
+ # .sort_values(by="Relevancy", ascending=False)
147
+ # .reset_index(drop=True)
148
+ # )
149
+ # df1 = (
150
+ # DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
151
+ # .sort_values(by="Relevancy", ascending=False)
152
+ # .reset_index(drop=True)
153
+ # )
154
+ # df.index += 1
155
+
156
+ # # Add styling
157
+ # cmGreen = sns.light_palette("green", as_cmap=True)
158
+ # cmRed = sns.light_palette("red", as_cmap=True)
159
+ # df = df.style.background_gradient(
160
+ # cmap=cmGreen,
161
+ # subset=[
162
+ # "Relevancy",
163
+ # ],
164
+ # )
165
+
166
+ # c1, c2, c3 = st.columns([1, 3, 1])
167
+
168
+ # format_dictionary = {
169
+ # "Relevancy": "{:.1%}",
170
+ # }
171
+
172
+ # df = df.format(format_dictionary)
173
+
174
+ # with c2:
175
+ #
176
+ # st.table(df)
sample/keywordexample.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"I will enter my own keyword":[],
2
+ "Food":"Food security,Nutrition,Diets,Food loss",
3
+ "Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
4
+ "Social":"Indigenous,Local community(ies),Gender,Rural livelihoods,Minority",
5
+ "Nature":"Nature,Nature-based solutions,Biodiversity,Degradation",
6
+ "Implementation":"Implementation,transformation,reform,integration,strategy,policy"
7
+ }