Upload 38 files

- .DS_Store +0 -0
- .gitignore +1 -0
- README.md +13 -0
- app.py +148 -0
- appStore/.DS_Store +0 -0
- appStore/__init__.py +1 -0
- appStore/__pycache__/__init__.cpython-310.pyc +0 -0
- appStore/__pycache__/__init__.cpython-38.pyc +0 -0
- appStore/__pycache__/doc_processing.cpython-310.pyc +0 -0
- appStore/__pycache__/vulnerability_analysis.cpython-310.pyc +0 -0
- appStore/__pycache__/vulnerability_analysis.cpython-38.pyc +0 -0
- appStore/doc_processing.py +87 -0
- appStore/rag.py +80 -0
- appStore/vulnerability_analysis.py +69 -0
- docStore/.DS_Store +0 -0
- docStore/sample/KE_First_NDC.pdf +0 -0
- docStore/sample/PH_First_NDC.pdf +0 -0
- docStore/sample/files.json +3 -0
- packages.txt +4 -0
- paramconfig.cfg +19 -0
- requirements.txt +24 -0
- style.css +179 -0
- utils/.DS_Store +0 -0
- utils/__init__.py +0 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/__init__.cpython-38.pyc +0 -0
- utils/__pycache__/config.cpython-310.pyc +0 -0
- utils/__pycache__/config.cpython-38.pyc +0 -0
- utils/__pycache__/preprocessing.cpython-310.pyc +0 -0
- utils/__pycache__/preprocessing.cpython-38.pyc +0 -0
- utils/__pycache__/uploadAndExample.cpython-310.pyc +0 -0
- utils/__pycache__/vulnerability_classifier.cpython-310.pyc +0 -0
- utils/__pycache__/vulnerability_classifier.cpython-38.pyc +0 -0
- utils/config.py +31 -0
- utils/preprocessing.py +291 -0
- utils/uploadAndExample.py +44 -0
- utils/vulnerability_classifier.py +156 -0
.DS_Store
ADDED
Binary file (6.15 kB)

.gitignore
ADDED
@@ -0,0 +1 @@
cpv_v2/

README.md
ADDED
@@ -0,0 +1,13 @@
---
title: Vulnerable Groups
emoji: 🦀
colorFrom: blue
colorTo: pink
sdk: streamlit
sdk_version: 1.21.0
app_file: app.py
pinned: false
license: openrail
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py
ADDED
@@ -0,0 +1,148 @@
# hacky fix for HF environment issues
import os
os.system("pip uninstall -y spaces")
os.system('pip install spaces==0.17.0')
os.system("pip uninstall -y gradio")
os.system("pip uninstall -y pydantic")
os.system("pip uninstall -y typer")
os.system('pip install typer==0.4.0')
os.system('pip install pydantic==1.8.2 --use-deprecated=legacy-resolver')

import appStore.vulnerability_analysis as vulnerability_analysis
import appStore.doc_processing as processing
from appStore.rag import run_query
from utils.uploadAndExample import add_upload, get_tabs
from utils.vulnerability_classifier import label_dict
import streamlit as st
import pandas as pd
import plotly.express as px


st.set_page_config(page_title='Vulnerability Analysis',
                   initial_sidebar_state='expanded', layout="wide")

with st.sidebar:
    # upload and example doc
    choice = st.sidebar.radio(label='Select the Document',
                              help='You can upload your own documents or use the example document',
                              options=('Upload Document', 'Try Example'),
                              horizontal=True)
    add_upload(choice)

with st.container():
    st.markdown("<h2 style='text-align: center;'> Vulnerability Analysis </h2>", unsafe_allow_html=True)
    st.write(' ')

with st.expander("ℹ️ - About this app", expanded=False):
    st.write(
        """
        The Vulnerability Analysis App is an open-source digital tool which aims
        to assist policy analysts and other users in extracting and filtering
        references to different vulnerable groups from public documents.
        """)

    st.write("""
    What happens in the background?

    - Step 1: Once the document is provided to the app, it undergoes *pre-processing*.
      In this step the document is broken into smaller paragraphs
      (based on word/sentence count).
    - Step 2: The paragraphs are then fed to the **Vulnerability Classifier**, which detects
      whether a paragraph contains any references to vulnerable groups.
    """)

    st.write("")


# Define the apps used
apps = [processing.app, vulnerability_analysis.app]

multiplier_val = 1 / len(apps)
if st.button("Analyze Documents"):
    prg = st.progress(0.0)
    for i, func in enumerate(apps):
        func()
        prg.progress((i + 1) * multiplier_val)

if 'combined_files_df' in st.session_state:  # check for existence of processed documents
    # get the filenames from the processed docs dataframe so we can use them for tab names
    uploaded_docs = [value for key, value in st.session_state.items() if key.startswith('filename_')]
    tab_titles = get_tabs(uploaded_docs)

    if tab_titles:
        tabs = st.tabs(tab_titles)

        # Render the results (pie chart, summary and table) in individual tabs for each doc
        for tab, doc in zip(tabs, uploaded_docs):
            with tab:
                # Main app code
                with st.container():
                    st.write(' ')

                    # Assign dataframe a name
                    df_vul = st.session_state['combined_files_df']
                    df_vul = df_vul[df_vul['filename'] == doc]

                    col1, col2 = st.columns([1, 1])

                    with col1:
                        # Header
                        st.subheader("Explore references to vulnerable groups:")

                        # Text
                        num_paragraphs = len(df_vul['Vulnerability Label'])
                        num_references = len(df_vul[df_vul['Vulnerability Label'] != 'Other'])

                        st.markdown(f"""<div style="text-align: justify;"> The document contains a
                            total of <span style="color: red;">{num_paragraphs}</span> paragraphs.
                            We identified <span style="color: red;">{num_references}</span>
                            references to vulnerable groups.</div>
                            <br>
                            In the pie chart on the right you can see the distribution of the different
                            groups defined. For a more detailed view in the text, see the paragraphs and
                            their respective labels in the table below.</div>""", unsafe_allow_html=True)

                    with col2:
                        ### Pie chart

                        # Create a df that stores all the labels
                        df_labels = pd.DataFrame(list(label_dict.items()), columns=['Label ID', 'Label'])

                        # Count how often each label appears in the "Vulnerability Label" column
                        label_counts = df_vul['Vulnerability Label'].value_counts().reset_index()
                        label_counts.columns = ['Label', 'Count']

                        # Merge the label counts with the df_labels DataFrame
                        df_labels = df_labels.merge(label_counts, on='Label', how='left')

                        # Configure graph
                        fig = px.pie(df_labels,
                                     names="Label",
                                     values="Count",
                                     title='Label Counts',
                                     hover_name="Count",
                                     color_discrete_sequence=px.colors.qualitative.Plotly
                                     )

                        # Show plot
                        st.plotly_chart(fig, use_container_width=True)

                    ### Document Summary
                    st.markdown("----")
                    st.markdown('**DOCUMENT FINDINGS SUMMARY:**')

                    # filter out 'Other' because we don't want it in the table (and it's way too big for the summary)
                    df_docs = df_vul[df_vul['Vulnerability Label'] != 'Other']
                    # construct RAG query, send to openai and process response
                    run_query(df_docs)

                    st.markdown("----")

                    with st.expander("ℹ️ - Document Text Classifications", expanded=False):
                        ### Table
                        st.table(df_docs)

appStore/.DS_Store
ADDED
Binary file (6.15 kB)

appStore/__init__.py
ADDED
@@ -0,0 +1 @@
# adding for package implementation

appStore/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (138 Bytes)

appStore/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (240 Bytes)

appStore/__pycache__/doc_processing.cpython-310.pyc
ADDED
Binary file (3.42 kB)

appStore/__pycache__/vulnerability_analysis.cpython-310.pyc
ADDED
Binary file (2.01 kB)

appStore/__pycache__/vulnerability_analysis.cpython-38.pyc
ADDED
Binary file (2.05 kB)

appStore/doc_processing.py
ADDED
@@ -0,0 +1,87 @@
# set path
import glob, os, sys
sys.path.append('../utils')
from typing import List, Tuple
from typing_extensions import Literal
from haystack.schema import Document
from utils.config import get_classifier_params
from utils.preprocessing import processingpipeline, paraLengthCheck
import streamlit as st
import logging
import pandas as pd
params = get_classifier_params("preprocessing")

@st.cache_data
def runPreprocessingPipeline(file_name: str, file_path: str,
                             split_by: Literal["sentence", "word"] = 'sentence',
                             split_length: int = 2, split_respect_sentence_boundary: bool = False,
                             split_overlap: int = 0, remove_punc: bool = False) -> List[Document]:
    """
    Creates and runs the preprocessing pipeline; the pipeline params are
    fetched from paramconfig.

    Params
    ------------
    file_name: filename; in a Streamlit application use st.session_state['filename']
    file_path: filepath; in a Streamlit application use st.session_state['filepath']
    split_by: document splitting strategy, either 'word' or 'sentence'
    split_length: when synthetically creating the paragraphs from the document,
        it defines the length of a paragraph.
    split_respect_sentence_boundary: used with the 'word' strategy for
        splitting of text.
    split_overlap: number of words or sentences that overlap when creating
        the paragraphs. This is done as one sentence or 'some words' make sense
        when read together with others; therefore the overlap is used.
    remove_punc: whether to remove all punctuation, including ',' and '.'

    Return
    --------------
    List[Document]: when the preprocessing pipeline is run, the output dictionary
    has four objects. For the Haystack implementation of the classification we
    need the list of Haystack Documents, which can be fetched with
    key = 'documents' on the output.
    """

    processing_pipeline = processingpipeline()

    output_pre = processing_pipeline.run(file_paths=file_path,
                                         params={"FileConverter": {"file_path": file_path,
                                                                   "file_name": file_name},
                                                 "UdfPreProcessor": {"remove_punc": remove_punc,
                                                                     "split_by": split_by,
                                                                     "split_length": split_length,
                                                                     "split_overlap": split_overlap,
                                                                     "split_respect_sentence_boundary": split_respect_sentence_boundary}})

    return output_pre


def app():
    with st.container():
        all_files_df = pd.DataFrame()  # Initialize an empty DataFrame to store data from all files

        for key in st.session_state:
            if key.startswith('filepath_'):
                file_path = st.session_state[key]
                file_name = st.session_state['filename' + key[-2:]]

                all_documents = runPreprocessingPipeline(file_name=file_name,
                                                         file_path=file_path, split_by=params['split_by'],
                                                         split_length=params['split_length'],
                                                         split_respect_sentence_boundary=params['split_respect_sentence_boundary'],
                                                         split_overlap=params['split_overlap'], remove_punc=params['remove_punc'])
                paralist = paraLengthCheck(all_documents['documents'], 100)
                file_df = pd.DataFrame(paralist, columns=['text', 'page'])
                file_df['filename'] = file_name  # Add a column for the file name

                all_files_df = pd.concat([all_files_df, file_df], ignore_index=True)

        if not all_files_df.empty:
            st.session_state['combined_files_df'] = all_files_df
        else:
            st.info("🤔 No document found, please try to upload it at the sidebar!")
            logging.warning("Terminated as no document provided")

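Note on the loop above: looking up st.session_state['filename' + key[-2:]] pairs each file path with its display name and assumes session-state keys of the form filepath_0 / filename_0 (single-digit indices), which is how utils/uploadAndExample.py stores them. A minimal, self-contained illustration of that pairing; the path and values here are hypothetical:

# Hypothetical session-state contents, mirroring the keys set by add_upload()
session_state = {
    'filepath_0': '/tmp/tmpabc123',    # temp file written by the uploader (made-up path)
    'filename_0': 'KE_First_NDC.pdf',
}
for key, path in session_state.items():
    if key.startswith('filepath_'):
        name = session_state['filename' + key[-2:]]   # 'filename' + '_0' -> 'filename_0'
        print(name, '->', path)
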
appStore/rag.py
ADDED
@@ -0,0 +1,80 @@
import os
# import json
import numpy as np
import pandas as pd
import openai
from haystack.schema import Document
import streamlit as st
from tenacity import retry, stop_after_attempt, wait_random_exponential


# Get openai API key
openai.api_key = os.environ["OPENAI_API_KEY"]
model_select = "gpt-3.5-turbo-1106"


# define a special function for putting the prompt together (as we can't use haystack)
def get_prompt(docs):
    base_prompt = "Provide a single paragraph summary of the documents provided below. \
        Formulate your answer in the style of an academic report."
    # Add the meta data for references
    context = ' - '.join([d.content for d in docs])
    prompt = base_prompt + "; Context: " + context + "; Answer:"

    return prompt


# convert df rows to Document objects so we can feed them into the summarizer easily
def get_document(df):
    # we take a list of each extract
    ls_dict = []
    for index, row in df.iterrows():
        # Create a Document object for each row (we only need the text)
        doc = Document(
            row['text'],
            meta={
                'filename': row['filename']}
        )
        # Append the Document object to the documents list
        ls_dict.append(doc)

    return ls_dict


# exception handling for issuing multiple API calls to openai (exponential backoff)
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(**kwargs):
    return openai.ChatCompletion.create(**kwargs)


# construct RAG query, send to openai and process response
def run_query(df):
    docs = get_document(df)

    # For a non-streamed completion, enable the following 2 lines and comment out the code below:
    # res = openai.ChatCompletion.create(model=model_select, messages=[{"role": "user", "content": get_prompt(docs)}])
    # result = res.choices[0].message.content

    # instantiate ChatCompletion as a generator object (stream is set to True)
    response = completion_with_backoff(model=model_select, messages=[{"role": "user", "content": get_prompt(docs)}], stream=True)
    # iterate through the streamed output
    report = []
    res_box = st.empty()
    for chunk in response:
        # extract the object containing the text (totally different structure when streaming)
        chunk_message = chunk['choices'][0]['delta']
        # test to make sure there is text in the object (some chunks don't have any)
        if 'content' in chunk_message:
            report.append(chunk_message.content)  # extract the message
            # add the latest text and merge it with all previous chunks
            result = "".join(report).strip()
            res_box.success(result)  # output to the response text box

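To make the prompt construction concrete, the sketch below reproduces what get_document() and get_prompt() assemble before run_query() sends it to the API; no OpenAI call is made, and the sample rows are invented for illustration only (they mimic the shape of combined_files_df produced by doc_processing.app()):

import pandas as pd
from haystack.schema import Document

# Toy rows shaped like the 'combined_files_df' DataFrame (text + filename columns)
df_docs = pd.DataFrame({
    'text': ['Support programmes for smallholder farmers.',
             'Measures targeting displaced persons.'],
    'filename': ['KE_First_NDC.pdf', 'KE_First_NDC.pdf'],
})

# mirror get_document(): one Haystack Document per row
docs = [Document(row['text'], meta={'filename': row['filename']}) for _, row in df_docs.iterrows()]

# mirror get_prompt(): base instruction + concatenated paragraph context
base_prompt = ("Provide a single paragraph summary of the documents provided below. "
               "Formulate your answer in the style of an academic report.")
prompt = base_prompt + "; Context: " + ' - '.join(d.content for d in docs) + "; Answer:"
print(prompt)
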
appStore/vulnerability_analysis.py
ADDED
@@ -0,0 +1,69 @@
# set path
import glob, os, sys
sys.path.append('../utils')

# import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from utils.vulnerability_classifier import load_vulnerabilityClassifier, vulnerability_classification
import logging
logger = logging.getLogger(__name__)
from utils.config import get_classifier_params
from utils.preprocessing import paraLengthCheck
from io import BytesIO
import xlsxwriter
import plotly.express as px


# Declare all the necessary variables
classifier_identifier = 'vulnerability'
params = get_classifier_params(classifier_identifier)

@st.cache_data
def to_excel(df, sectorlist):
    len_df = len(df)
    output = BytesIO()
    writer = pd.ExcelWriter(output, engine='xlsxwriter')
    df.to_excel(writer, index=False, sheet_name='Sheet1')
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    worksheet.data_validation('S2:S{}'.format(len_df),
                              {'validate': 'list',
                               'source': ['No', 'Yes', 'Discard']})
    worksheet.data_validation('X2:X{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('T2:T{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('U2:U{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('V2:V{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('W2:U{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    writer.save()
    processed_data = output.getvalue()
    return processed_data


def app():
    with st.container():
        if 'combined_files_df' in st.session_state:
            combined_files_df = st.session_state['combined_files_df']
            classifier = load_vulnerabilityClassifier(classifier_name=params['model_name'])
            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier

            combined_files_df = vulnerability_classification(haystack_doc=combined_files_df,
                                                             threshold=params['threshold'])

            st.session_state['combined_files_df'] = combined_files_df

docStore/.DS_Store
ADDED
Binary file (6.15 kB)

docStore/sample/KE_First_NDC.pdf
ADDED
Binary file (214 kB)

docStore/sample/PH_First_NDC.pdf
ADDED
Binary file (136 kB)

docStore/sample/files.json
ADDED
@@ -0,0 +1,3 @@
{"Kenya: First NDC": "docStore/sample/KE_First_NDC.pdf",
 "Philippines: First NDC": "docStore/sample/PH_First_NDC.pdf"
}

packages.txt
ADDED
@@ -0,0 +1,4 @@
poppler-utils
xpdf
tesseract-ocr
libtesseract-dev

paramconfig.cfg
ADDED
@@ -0,0 +1,19 @@
[preprocessing]
THRESHOLD = 0.50
MODEL = garbage
SPLIT_BY = word
REMOVE_PUNC = 0
SPLIT_LENGTH = 60
SPLIT_OVERLAP = 5
RESPECT_SENTENCE_BOUNDARY = 1
TOP_KEY = 10

[vulnerability]
THRESHOLD = 0.50
MODEL = leavoigt/vulnerable_groups
SPLIT_BY = word
REMOVE_PUNC = 0
SPLIT_LENGTH = 60
SPLIT_OVERLAP = 5
RESPECT_SENTENCE_BOUNDARY = 1
TOP_KEY = 10

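These keys are consumed by get_classifier_params() in utils/config.py (added further down in this commit). A short sketch of that mapping, with the values that the [vulnerability] section above should produce shown as comments:

from utils.config import get_classifier_params

params = get_classifier_params('vulnerability')
# Expected shape, given paramconfig.cfg above:
# {'model_name': 'leavoigt/vulnerable_groups', 'split_by': 'word',
#  'split_length': 60, 'split_overlap': 5, 'remove_punc': False,
#  'split_respect_sentence_boundary': True, 'threshold': 0.5, 'top_n': 10}
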
requirements.txt
ADDED
@@ -0,0 +1,24 @@
farm-haystack == 1.16
farm-haystack[ocr,pdf]==1.16.0
spacy==3.2.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
matplotlib==3.5.1
nltk==3.7
numpy==1.22.1
pandas==1.4.0
pdfplumber==0.6.2
Pillow==9.1.1
seaborn==0.11.2
transformers==4.25.1
st-annotated-text==3.0.0
markdown==3.4.1
summa==1.2.0
plotly
xlsxwriter
altair==4.0
streamlit-aggrid
python-docx
setfit
plotly.express
openai==0.27.9
pydantic==1.8.2

style.css
ADDED
@@ -0,0 +1,179 @@
.row-widget.stTextInput > div:first-of-type {
  background: #fff;
  display: flex;
  border: 1px solid #dfe1e5;
  box-shadow: none;
  border-radius: 24px;
  height: 50px;
  width: auto;
  margin: 10px auto 30px;
}

.row-widget.stTextInput > div:first-of-type:hover,
.row-widget.stTextInput > div:first-of-type:focus {
  box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
}

.row-widget.stTextInput .st-bq {
  background-color: #fff;
}

.row-widget.stTextInput > label {
  color: #b3b3b3;
}

.row-widget.stButton > button {
  border-radius: 24px;
  background-color: #B6C9B1;
  color: #fff;
  border: none;
  padding: 6px 20px;
  float: right;
  background-image: none;
}

.row-widget.stButton > button:hover {
  box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
}

.row-widget.stButton > button:focus {
  border: none;
  color: #fff;
}

.footer-custom {
  position: fixed;
  bottom: 0;
  width: 100%;
  color: var(--text-color);
  max-width: 698px;
  font-size: 14px;
  height: 50px;
  padding: 10px 0;
  z-index: 50;
}

.main {
  padding: 20px;
}

footer {
  display: none !important;
}

.footer-custom a {
  color: var(--text-color);
}

#wikipedia-assistant {
  font-size: 36px;
}

.generated-answer p {
  font-size: 16px;
  font-weight: bold;
}

.react-json-view {
  margin: 40px 0 80px;
}

.tooltip {
  text-align: center;
  line-height: 20px;
  display: table-caption;
  font-size: 10px;
  border-radius: 50%;
  height: 20px;
  width: 20px;
  position: relative;
  cursor: pointer;
  color: #000;
}

.tooltip .tooltiptext {
  visibility: hidden;
  width: 280px;
  text-align: center;
  border-radius: 6px;
  padding: 10px;
  position: absolute;
  z-index: 1;
  top: 25px;
  left: 50%;
  margin-left: -140px;
  font-size: 14px;
  background-color: #fff;
  border: 1px solid #ccc;
  box-shadow: 0px 0px 3px 1px rgba(0, 0, 0, 0.16);
  color: #000;
}

.tooltip:hover .tooltiptext {
  visibility: visible;
}

.sentence-wrapper {
  border-left: 4px solid #ffc423;
  padding-left: 20px;
  margin-bottom: 40px;
}

#context {
  padding: 2rem 0 1rem;
}

hr {
  margin: 2em 0 1em;
}

.technical-details-info {
  margin-bottom: 100px;
}

.loader-wrapper {
  display: flex;
  align-items: center;
  background-color: rgba(250, 202, 43, 0.2);
  padding: 15px 20px;
  border-radius: 6px;
}

.loader-wrapper p {
  margin-bottom: 0;
  margin-left: 20px;
}

.loader {
  width: 30px;
  height: 30px;
  border: dotted 5px #868686;
  border-radius: 100%;
  animation: spin 1s linear infinite;
}

.loader-note {
  font-size: 14px;
  color: #b3b3b3;
  margin-left: 5px;
}

@keyframes spin {
  0% {
    transform: rotate(0deg) scale(0.8);
    border-top-color: transparent;
    border-right-color: transparent;
  }
  50% {
    transform: rotate(180deg) scale(1.2);
    border-color: #949494;
    border-top-color: transparent;
    border-right-color: transparent;
  }
  100% {
    transform: rotate(360deg) scale(0.8);
    border-color: #bbbbbb;
    border-top-color: transparent;
    border-right-color: transparent;
  }
}

utils/.DS_Store
ADDED
Binary file (6.15 kB)

utils/__init__.py
ADDED
File without changes

utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (135 Bytes)

utils/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (237 Bytes)

utils/__pycache__/config.cpython-310.pyc
ADDED
Binary file (1.08 kB)

utils/__pycache__/config.cpython-38.pyc
ADDED
Binary file (1.19 kB)

utils/__pycache__/preprocessing.cpython-310.pyc
ADDED
Binary file (9.05 kB)

utils/__pycache__/preprocessing.cpython-38.pyc
ADDED
Binary file (9.13 kB)

utils/__pycache__/uploadAndExample.cpython-310.pyc
ADDED
Binary file (1.27 kB)

utils/__pycache__/vulnerability_classifier.cpython-310.pyc
ADDED
Binary file (4.2 kB)

utils/__pycache__/vulnerability_classifier.cpython-38.pyc
ADDED
Binary file (4.2 kB)

utils/config.py
ADDED
@@ -0,0 +1,31 @@
import configparser
import logging

def getconfig(configfile_path: str):
    """
    configfile_path: file path of the .cfg file
    """

    config = configparser.ConfigParser()

    try:
        config.read_file(open(configfile_path))
        return config
    except:
        logging.warning("config file not found")


# Declare all the necessary variables
def get_classifier_params(model_name):
    config = getconfig('paramconfig.cfg')
    params = {}
    params['model_name'] = config.get(model_name, 'MODEL')
    params['split_by'] = config.get(model_name, 'SPLIT_BY')
    params['split_length'] = int(config.get(model_name, 'SPLIT_LENGTH'))
    params['split_overlap'] = int(config.get(model_name, 'SPLIT_OVERLAP'))
    params['remove_punc'] = bool(int(config.get(model_name, 'REMOVE_PUNC')))
    params['split_respect_sentence_boundary'] = bool(int(config.get(model_name, 'RESPECT_SENTENCE_BOUNDARY')))
    params['threshold'] = float(config.get(model_name, 'THRESHOLD'))
    params['top_n'] = int(config.get(model_name, 'TOP_KEY'))

    return params

utils/preprocessing.py
ADDED
@@ -0,0 +1,291 @@
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
from typing import Callable, Dict, List, Optional, Text, Tuple, Union
from typing_extensions import Literal
import pandas as pd
import logging
import re
import string
from haystack.pipelines import Pipeline

def useOCR(file_path: str) -> Text:
    """
    Converts image PDFs into text, using farm-haystack[OCR].

    Params
    ----------
    file_path: file path of the uploaded file, returned by the add_upload function in
        uploadAndExample.py

    Returns the text of the file as a string.
    """

    converter = PDFToTextOCRConverter(remove_numeric_tables=True,
                                      valid_languages=["eng"])
    docs = converter.convert(file_path=file_path, meta=None)
    return docs[0].content


class FileConverter(BaseComponent):
    """
    Wrapper class to convert an uploaded document into text by calling the appropriate
    converter class; it will internally use the haystack PDFToTextOCR converter in case
    of an image PDF. We cannot use the FileClassifier from haystack as it doesn't have
    any label/output class for images.

    1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
    2. https://docs.haystack.deepset.ai/docs/file_converters
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
    4. https://docs.haystack.deepset.ai/reference/file-converters-api
    """

    outgoing_edges = 1

    def run(self, file_name: str, file_path: str, encoding: Optional[str] = None,
            id_hash_keys: Optional[List[str]] = None,
            ) -> Tuple[dict, str]:
        """
        This method is required to invoke the component in the pipeline implementation.

        Params
        ----------
        file_name: name of the file
        file_path: file path of the uploaded file, returned by the add_upload function in
            uploadAndExample.py

        See the links provided in the class docstring/description for the other params.

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
            we need to return. In this case it is the list of Haystack Documents.

        output_1: as there is only one outgoing edge, we pass the 'output_1' string
        """
        try:
            if file_name.endswith('.pdf'):
                converter = PDFToTextConverter(remove_numeric_tables=True)
            if file_name.endswith('.txt'):
                converter = TextConverter(remove_numeric_tables=True)
            if file_name.endswith('.docx'):
                converter = DocxToTextConverter()
        except Exception as e:
            logging.error(e)
            return

        documents = []

        # encoding is empty, probably should be utf-8
        document = converter.convert(
            file_path=file_path, meta=None,
            encoding=encoding, id_hash_keys=id_hash_keys
        )[0]

        text = document.content

        # in case of a scanned/image-only PDF the content might contain only
        # the page separator (\f or \x0c). We check if that is the case and
        # use OCR to get the text.
        filtered = re.sub(r'\x0c', '', text)

        if filtered == "":
            logging.info("Using OCR")
            text = useOCR(file_path)

        documents.append(Document(content=text,
                                  meta={"name": file_name},
                                  id_hash_keys=id_hash_keys))

        logging.info('file conversion successful')
        output = {'documents': documents}
        return output, 'output_1'

    def run_batch(self):
        """
        We don't have a requirement to process multiple files in one go,
        therefore nothing happens here; however, to use the custom node we need
        this method on the class.
        """
        return


def basic(s: str, remove_punc: bool = False):
    """
    Performs basic cleaning of text.

    Params
    ----------
    s: string to be processed
    remove_punc: whether to remove all punctuation, including ',' and '.'

    Returns: processed string; see comments in the source code for more info
    """

    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)

    # Remove new line characters
    s = re.sub('\n', ' ', s)

    # Remove punctuation
    if remove_punc == True:
        translator = str.maketrans(' ', ' ', string.punctuation)
        s = s.translate(translator)
    # Remove distracting single quotes and dotted pattern
    s = re.sub("\'", " ", s)
    s = s.replace("..", "")

    return s.strip()


def paraLengthCheck(paraList, max_len=100):
    """
    There are cases where the preprocessor cannot respect the word limit when using
    the respect-sentence-boundary flag, due to missing sentence boundaries.
    Therefore we run one more round of splitting here for those paragraphs.

    Params
    ---------------
    paraList: list of paragraphs/text
    max_len: max length to be respected by sentences which bypassed the
        preprocessor strategy
    """
    new_para_list = []
    for passage in paraList:
        # check if the para exceeds the word limit
        if len(passage.content.split()) > max_len:
            # we might need a few iterations; for example if para = 512 tokens
            # we need to iterate 5 times to reduce the para to the size limit of 100
            iterations = int(len(passage.content.split()) / max_len)
            for i in range(iterations):
                temp = " ".join(passage.content.split()[max_len * i:max_len * (i + 1)])
                new_para_list.append((temp, passage.meta['page']))
            temp = " ".join(passage.content.split()[max_len * (i + 1):])
            new_para_list.append((temp, passage.meta['page']))
        else:
            # paragraphs which don't need any splitting
            new_para_list.append((passage.content, passage.meta['page']))

    logging.info("New paragraphs length {}".format(len(new_para_list)))
    return new_para_list


class UdfPreProcessor(BaseComponent):
    """
    Class to preprocess the document returned by FileConverter. It checks the
    splitting strategy, splits the document by word or sentence, and then
    synthetically creates the paragraphs.

    1. https://docs.haystack.deepset.ai/docs/preprocessor
    2. https://docs.haystack.deepset.ai/reference/preprocessor-api
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
    """
    outgoing_edges = 1

    def run(self, documents: List[Document], remove_punc: bool = False,
            split_by: Literal["sentence", "word"] = 'sentence',
            split_length: int = 2, split_respect_sentence_boundary: bool = False,
            split_overlap: int = 0):
        """
        This method is required to invoke the component in the pipeline implementation.

        Params
        ----------
        documents: documents from the output dictionary returned by FileConverter
        remove_punc: whether to remove all punctuation, including ',' and '.'
        split_by: document splitting strategy, either 'word' or 'sentence'
        split_length: when synthetically creating the paragraphs from the document,
            it defines the length of a paragraph.
        split_respect_sentence_boundary: used with the 'word' strategy for
            splitting of text.
        split_overlap: number of words or sentences that overlap when creating
            the paragraphs. This is done as one sentence or 'some words' make sense
            when read together with others; therefore the overlap is used.

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
            we need to return. In this case the output will contain 4 objects:
            the paragraph text list as a List, the Haystack Documents, a DataFrame and
            one raw text file.

        output_1: as there is only one outgoing edge, we pass the 'output_1' string
        """

        if split_by == 'sentence':
            split_respect_sentence_boundary = False
        else:
            split_respect_sentence_boundary = split_respect_sentence_boundary

        preprocessor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by=split_by,
            split_length=split_length,
            split_respect_sentence_boundary=split_respect_sentence_boundary,
            split_overlap=split_overlap,

            # will add the page number only in case of PDF, not for text/docx files.
            add_page_number=True
        )

        for i in documents:
            # # basic cleaning before passing it to the preprocessor.
            # i = basic(i)
            docs_processed = preprocessor.process([i])
            for item in docs_processed:
                item.content = basic(item.content, remove_punc=remove_punc)

        df = pd.DataFrame(docs_processed)
        all_text = " ".join(df.content.to_list())
        para_list = df.content.to_list()
        logging.info('document split into {} paragraphs'.format(len(para_list)))
        output = {'documents': docs_processed,
                  'dataframe': df,
                  'text': all_text,
                  'paraList': para_list
                  }
        return output, "output_1"

    def run_batch(self):
        """
        We don't have a requirement to process multiple files in one go,
        therefore nothing happens here; however, to use the custom node we need
        this method on the class.
        """
        return


def processingpipeline():
    """
    Returns the preprocessing pipeline, using FileConverter and UdfPreProcessor
    from utils.preprocessing.
    """

    preprocessing_pipeline = Pipeline()
    file_converter = FileConverter()
    custom_preprocessor = UdfPreProcessor()

    preprocessing_pipeline.add_node(component=file_converter,
                                    name="FileConverter", inputs=["File"])
    preprocessing_pipeline.add_node(component=custom_preprocessor,
                                    name='UdfPreProcessor', inputs=["FileConverter"])

    return preprocessing_pipeline

ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import tempfile
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
def add_upload(choice):
|
| 6 |
+
if choice == 'Upload Document':
|
| 7 |
+
uploaded_files = st.sidebar.file_uploader('Upload Files',
|
| 8 |
+
type=['pdf', 'docx', 'txt'],
|
| 9 |
+
accept_multiple_files=True)
|
| 10 |
+
|
| 11 |
+
if uploaded_files is not None:
|
| 12 |
+
# Clear previous uploaded files from session state
|
| 13 |
+
for key in list(st.session_state.keys()):
|
| 14 |
+
if key.startswith('filename') or key.startswith('filepath'):
|
| 15 |
+
del st.session_state[key]
|
| 16 |
+
|
| 17 |
+
# Process and store each uploaded file
|
| 18 |
+
for index, uploaded_file in enumerate(uploaded_files):
|
| 19 |
+
with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp:
|
| 20 |
+
bytes_data = uploaded_file.getvalue()
|
| 21 |
+
temp.write(bytes_data)
|
| 22 |
+
st.session_state[f'filename_{index}'] = uploaded_file.name
|
| 23 |
+
st.session_state[f'filepath_{index}'] = temp.name
|
| 24 |
+
|
| 25 |
+
else: # Handle example document selection
|
| 26 |
+
# listing the options
|
| 27 |
+
with open('docStore/sample/files.json', 'r') as json_file:
|
| 28 |
+
files = json.load(json_file)
|
| 29 |
+
|
| 30 |
+
option = st.sidebar.selectbox('Select the example document',
|
| 31 |
+
list(files.keys()))
|
| 32 |
+
file_path = files[option]
|
| 33 |
+
st.session_state['filename_0'] = file_path # Use 'filename_0' to align with the upload naming convention
|
| 34 |
+
st.session_state['filepath_0'] = file_path # Use 'filepath_0' for consistency
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# get the filenames from the processed docs dataframe so we can use for tab names
|
| 38 |
+
def get_tabs(uploaded_docs):
|
| 39 |
+
tabs = []
|
| 40 |
+
for doc_name in uploaded_docs:
|
| 41 |
+
tab_title = doc_name # Assuming doc_name is a string with the file name
|
| 42 |
+
tabs.append(tab_title)
|
| 43 |
+
return tabs
|
| 44 |
+
|
utils/vulnerability_classifier.py
ADDED
@@ -0,0 +1,156 @@
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from transformers import pipeline
from setfit import SetFitModel

label_dict = {0: 'Agricultural communities',
              1: 'Children',
              2: 'Coastal communities',
              3: 'Ethnic, racial or other minorities',
              4: 'Fishery communities',
              5: 'Informal sector workers',
              6: 'Members of indigenous and local communities',
              7: 'Migrants and displaced persons',
              8: 'Older persons',
              9: 'Other',
              10: 'Persons living in poverty',
              11: 'Persons with disabilities',
              12: 'Persons with pre-existing health conditions',
              13: 'Residents of drought-prone regions',
              14: 'Rural populations',
              15: 'Sexual minorities (LGBTQI+)',
              16: 'Urban populations',
              17: 'Women and other genders'}

def getlabels(preds):
    # Get label names
    preds_list = preds.tolist()

    predictions_names = []

    # loop through each prediction
    for ele in preds_list:

        # see if there is a value 1 and retrieve its index
        try:
            index_of_one = ele.index(1)
        except ValueError:
            index_of_one = "NA"

        # Retrieve the name of the label (if no prediction was made = NA)
        if index_of_one != "NA":
            name = label_dict[index_of_one]
        else:
            name = "Other"

        # Append name to list
        predictions_names.append(name)

    return predictions_names

@st.cache_resource
def load_vulnerabilityClassifier(config_file: str = None, classifier_name: str = None):
    """
    Loads the document classifier using haystack, where the name/path of the model
    on the HF hub is used as a string to fetch the model object. Either a config file
    or a model name should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier

    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if a model name is passed, it takes priority; if not
        found, the config file is used, otherwise an error is raised.

    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('vulnerability', 'MODEL')

    logging.info("Loading vulnerability classifier")
    # we are using the pipeline as the model is multilabel and DocumentClassifier
    # from Haystack doesn't support multilabel.
    # in the pipeline we use 'sigmoid' to explicitly tell the pipeline to make it multilabel;
    # if not, it will automatically use softmax, which is not desired.
    # doc_classifier = TransformersDocumentClassifier(
    #                     model_name_or_path=classifier_name,
    #                     task="text-classification",
    #                     top_k = None)

    # # Download model from HF Hub
    doc_classifier = SetFitModel.from_pretrained("leavoigt/vulnerable_groups")

    # doc_classifier = pipeline("text-classification",
    #                           model=classifier_name,
    #                           return_all_scores=True,
    #                           function_to_apply= "sigmoid")

    return doc_classifier


@st.cache_data
def vulnerability_classification(haystack_doc: pd.DataFrame,
                                 threshold: float = 0.5,
                                 classifier_model: pipeline = None
                                 ) -> Tuple[DataFrame, Series]:
    """
    Text classification on the list of texts provided. The classifier provides the
    most appropriate label for each text; the labels indicate which vulnerable
    group (if any) a paragraph refers to.

    Params
    ---------
    haystack_doc: the output of the preprocessing pipeline contains the list of
        paragraphs in different formats; here the DataFrame of paragraphs is used.
    threshold: threshold value for the model to keep the results from the classifier
    classifier_model: you can pass the classifier model directly, which takes priority;
        if not, the model is looked up in the Streamlit session state.
        In case of Streamlit, avoid passing the model directly.

    Returns
    ----------
    haystack_doc: DataFrame with a 'Vulnerability Label' column added alongside 'text'.
    """
    logging.info("Working on vulnerability identification")
    haystack_doc['Vulnerability Label'] = 'NA'
    # haystack_doc['PA_check'] = haystack_doc['Policy-Action Label'].apply(lambda x: True if len(x) != 0 else False)

    # df1 = haystack_doc[haystack_doc['PA_check'] == True]
    # df = haystack_doc[haystack_doc['PA_check'] == False]
    if not classifier_model:
        classifier_model = st.session_state['vulnerability_classifier']

    predictions = classifier_model(list(haystack_doc.text))

    pred_labels = getlabels(predictions)

    haystack_doc['Vulnerability Label'] = pred_labels
    # placeholder = {}
    # for j in range(len(temp)):
    #     placeholder[temp[j]['label']] = temp[j]['score']
    # list_.append(placeholder)
    # labels_ = [{**list_[l]} for l in range(len(predictions))]
    # truth_df = DataFrame.from_dict(labels_)
    # truth_df = truth_df.round(2)
    # truth_df = truth_df.astype(float) >= threshold
    # truth_df = truth_df.astype(str)
    # categories = list(truth_df.columns)
    # truth_df['Vulnerability Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
    #     None for i in categories}, axis=1)
    # truth_df['Vulnerability Label'] = truth_df.apply(lambda x: list(x['Vulnerability Label']
    #     -{None}), axis=1)
    # haystack_doc['Vulnerability Label'] = list(truth_df['Vulnerability Label'])
    return haystack_doc