Upload 38 files

- .DS_Store +0 -0
- .gitignore +1 -0
- README.md +13 -0
- app.py +148 -0
- appStore/.DS_Store +0 -0
- appStore/__init__.py +1 -0
- appStore/__pycache__/__init__.cpython-310.pyc +0 -0
- appStore/__pycache__/__init__.cpython-38.pyc +0 -0
- appStore/__pycache__/doc_processing.cpython-310.pyc +0 -0
- appStore/__pycache__/vulnerability_analysis.cpython-310.pyc +0 -0
- appStore/__pycache__/vulnerability_analysis.cpython-38.pyc +0 -0
- appStore/doc_processing.py +87 -0
- appStore/rag.py +80 -0
- appStore/vulnerability_analysis.py +69 -0
- docStore/.DS_Store +0 -0
- docStore/sample/KE_First_NDC.pdf +0 -0
- docStore/sample/PH_First_NDC.pdf +0 -0
- docStore/sample/files.json +3 -0
- packages.txt +4 -0
- paramconfig.cfg +19 -0
- requirements.txt +24 -0
- style.css +179 -0
- utils/.DS_Store +0 -0
- utils/__init__.py +0 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/__init__.cpython-38.pyc +0 -0
- utils/__pycache__/config.cpython-310.pyc +0 -0
- utils/__pycache__/config.cpython-38.pyc +0 -0
- utils/__pycache__/preprocessing.cpython-310.pyc +0 -0
- utils/__pycache__/preprocessing.cpython-38.pyc +0 -0
- utils/__pycache__/uploadAndExample.cpython-310.pyc +0 -0
- utils/__pycache__/vulnerability_classifier.cpython-310.pyc +0 -0
- utils/__pycache__/vulnerability_classifier.cpython-38.pyc +0 -0
- utils/config.py +31 -0
- utils/preprocessing.py +291 -0
- utils/uploadAndExample.py +44 -0
- utils/vulnerability_classifier.py +156 -0
.DS_Store
ADDED
Binary file (6.15 kB)

.gitignore
ADDED
@@ -0,0 +1 @@
cpv_v2/

README.md
ADDED
@@ -0,0 +1,13 @@
---
title: Vulnerable Groups
emoji: 🦀
colorFrom: blue
colorTo: pink
sdk: streamlit
sdk_version: 1.21.0
app_file: app.py
pinned: false
license: openrail
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py
ADDED
@@ -0,0 +1,148 @@
# hacky fix for HF environment issues
import os
os.system("pip uninstall -y spaces")
os.system('pip install spaces==0.17.0')
os.system("pip uninstall -y gradio")
os.system("pip uninstall -y pydantic")
os.system("pip uninstall -y typer")
os.system('pip install typer==0.4.0')
os.system('pip install pydantic==1.8.2 --use-deprecated=legacy-resolver')

import appStore.vulnerability_analysis as vulnerability_analysis
import appStore.doc_processing as processing
from appStore.rag import run_query
from utils.uploadAndExample import add_upload, get_tabs
from utils.vulnerability_classifier import label_dict
import streamlit as st
import pandas as pd
import plotly.express as px


st.set_page_config(page_title='Vulnerability Analysis',
                   initial_sidebar_state='expanded', layout="wide")

with st.sidebar:
    # upload and example doc
    choice = st.sidebar.radio(label='Select the Document',
                              help='You can upload your own documents or use the example document',
                              options=('Upload Document', 'Try Example'),
                              horizontal=True)
    add_upload(choice)

with st.container():
    st.markdown("<h2 style='text-align: center;'> Vulnerability Analysis </h2>", unsafe_allow_html=True)
    st.write(' ')

with st.expander("ℹ️ - About this app", expanded=False):
    st.write(
        """
        The Vulnerability Analysis App is an open-source digital tool which aims
        to assist policy analysts and other users in extracting and filtering
        references to different vulnerable groups from public documents.
        """)

    st.write("""
    What happens in the background?

    - Step 1: Once the document is provided to the app, it undergoes *pre-processing*.
      In this step the document is broken into smaller paragraphs
      (based on word/sentence count).
    - Step 2: The paragraphs are then fed to the **Vulnerability Classifier**, which detects
      whether a paragraph contains any references to vulnerable groups.
    """)

    st.write("")


# Define the apps used
apps = [processing.app, vulnerability_analysis.app]

multiplier_val = 1 / len(apps)
if st.button("Analyze Documents"):
    prg = st.progress(0.0)
    for i, func in enumerate(apps):
        func()
        prg.progress((i + 1) * multiplier_val)

if 'combined_files_df' in st.session_state:  # check for existence of processed documents
    # get the filenames from the processed docs dataframe so we can use them for tab names
    uploaded_docs = [value for key, value in st.session_state.items() if key.startswith('filename_')]
    tab_titles = get_tabs(uploaded_docs)

    if tab_titles:
        tabs = st.tabs(tab_titles)

        # Render the results (pie chart, summary and table) in individual tabs for each doc
        for tab, doc in zip(tabs, uploaded_docs):
            with tab:
                # Main app code
                with st.container():
                    st.write(' ')

                    # Assign dataframe a name
                    df_vul = st.session_state['combined_files_df']
                    df_vul = df_vul[df_vul['filename'] == doc]

                    col1, col2 = st.columns([1, 1])

                    with col1:
                        # Header
                        st.subheader("Explore references to vulnerable groups:")

                        # Text
                        num_paragraphs = len(df_vul['Vulnerability Label'])
                        num_references = len(df_vul[df_vul['Vulnerability Label'] != 'Other'])

                        st.markdown(f"""<div style="text-align: justify;"> The document contains a
                            total of <span style="color: red;">{num_paragraphs}</span> paragraphs.
                            We identified <span style="color: red;">{num_references}</span>
                            references to vulnerable groups.</div>
                            <br>
                            In the pie chart on the right you can see the distribution of the different
                            groups defined. For a more detailed view in the text, see the paragraphs and
                            their respective labels in the table below.</div>""", unsafe_allow_html=True)

                    with col2:
                        ### Pie chart

                        # Create a df that stores all the labels
                        df_labels = pd.DataFrame(list(label_dict.items()), columns=['Label ID', 'Label'])

                        # Count how often each label appears in the "Vulnerability Label" column
                        label_counts = df_vul['Vulnerability Label'].value_counts().reset_index()
                        label_counts.columns = ['Label', 'Count']

                        # Merge the label counts with the df_labels DataFrame
                        df_labels = df_labels.merge(label_counts, on='Label', how='left')

                        # Configure graph
                        fig = px.pie(df_labels,
                                     names="Label",
                                     values="Count",
                                     title='Label Counts',
                                     hover_name="Count",
                                     color_discrete_sequence=px.colors.qualitative.Plotly
                                     )

                        # Show plot
                        st.plotly_chart(fig, use_container_width=True)

                    ### Document Summary
                    st.markdown("----")
                    st.markdown('**DOCUMENT FINDINGS SUMMARY:**')

                    # filter out 'Other' because we don't want it in the table (and it's way too big for the summary)
                    df_docs = df_vul[df_vul['Vulnerability Label'] != 'Other']
                    # construct RAG query, send to openai and process response
                    run_query(df_docs)

                    st.markdown("----")

                    with st.expander("ℹ️ - Document Text Classifications", expanded=False):
                        ### Table
                        st.table(df_docs)

appStore/.DS_Store
ADDED
Binary file (6.15 kB)

appStore/__init__.py
ADDED
@@ -0,0 +1 @@
# adding for package implementation

appStore/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (138 Bytes)

appStore/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (240 Bytes)

appStore/__pycache__/doc_processing.cpython-310.pyc
ADDED
Binary file (3.42 kB)

appStore/__pycache__/vulnerability_analysis.cpython-310.pyc
ADDED
Binary file (2.01 kB)

appStore/__pycache__/vulnerability_analysis.cpython-38.pyc
ADDED
Binary file (2.05 kB)

appStore/doc_processing.py
ADDED
@@ -0,0 +1,87 @@
# set path
import glob, os, sys
sys.path.append('../utils')
from typing import List, Tuple
from typing_extensions import Literal
from haystack.schema import Document
from utils.config import get_classifier_params
from utils.preprocessing import processingpipeline, paraLengthCheck
import streamlit as st
import logging
import pandas as pd
params = get_classifier_params("preprocessing")

@st.cache_data
def runPreprocessingPipeline(file_name: str, file_path: str,
                             split_by: Literal["sentence", "word"] = 'sentence',
                             split_length: int = 2, split_respect_sentence_boundary: bool = False,
                             split_overlap: int = 0, remove_punc: bool = False) -> List[Document]:
    """
    Creates and runs the preprocessing pipeline; the pipeline params are
    fetched from paramconfig.

    Params
    ------------
    file_name: filename; in a Streamlit application use st.session_state['filename']
    file_path: filepath; in a Streamlit application use st.session_state['filepath']
    split_by: document splitting strategy, either 'word' or 'sentence'
    split_length: when synthetically creating the paragraphs from the document,
        it defines the length of a paragraph.
    split_respect_sentence_boundary: used with the 'word' strategy for
        splitting of text.
    split_overlap: number of words or sentences that overlap when creating
        the paragraphs. This is done as one sentence or 'some words' make sense
        when read together with others; therefore the overlap is used.
    remove_punc: whether to remove all punctuation, including ',' and '.'

    Return
    --------------
    List[Document]: when the preprocessing pipeline is run, the output dictionary
    has four objects. For the Haystack implementation of the classification we
    need the list of Haystack Documents, which can be fetched with
    key = 'documents' on the output.
    """

    processing_pipeline = processingpipeline()

    output_pre = processing_pipeline.run(file_paths=file_path,
                                         params={"FileConverter": {"file_path": file_path,
                                                                   "file_name": file_name},
                                                 "UdfPreProcessor": {"remove_punc": remove_punc,
                                                                     "split_by": split_by,
                                                                     "split_length": split_length,
                                                                     "split_overlap": split_overlap,
                                                                     "split_respect_sentence_boundary": split_respect_sentence_boundary}})

    return output_pre


def app():
    with st.container():
        all_files_df = pd.DataFrame()  # Initialize an empty DataFrame to store data from all files

        for key in st.session_state:
            if key.startswith('filepath_'):
                file_path = st.session_state[key]
                file_name = st.session_state['filename' + key[-2:]]

                all_documents = runPreprocessingPipeline(file_name=file_name,
                                                         file_path=file_path, split_by=params['split_by'],
                                                         split_length=params['split_length'],
                                                         split_respect_sentence_boundary=params['split_respect_sentence_boundary'],
                                                         split_overlap=params['split_overlap'], remove_punc=params['remove_punc'])
                paralist = paraLengthCheck(all_documents['documents'], 100)
                file_df = pd.DataFrame(paralist, columns=['text', 'page'])
                file_df['filename'] = file_name  # Add a column for the file name

                all_files_df = pd.concat([all_files_df, file_df], ignore_index=True)

        if not all_files_df.empty:
            st.session_state['combined_files_df'] = all_files_df
        else:
            st.info("🤔 No document found, please try to upload it at the sidebar!")
            logging.warning("Terminated as no document provided")

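Note on the loop above: looking up st.session_state['filename' + key[-2:]] pairs each file path with its display name and assumes session-state keys of the form filepath_0 / filename_0 (single-digit indices), which is how utils/uploadAndExample.py stores them. A minimal, self-contained illustration of that pairing; the path and values here are hypothetical:

# Hypothetical session-state contents, mirroring the keys set by add_upload()
session_state = {
    'filepath_0': '/tmp/tmpabc123',    # temp file written by the uploader (made-up path)
    'filename_0': 'KE_First_NDC.pdf',
}
for key, path in session_state.items():
    if key.startswith('filepath_'):
        name = session_state['filename' + key[-2:]]   # 'filename' + '_0' -> 'filename_0'
        print(name, '->', path)
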
appStore/rag.py
ADDED
@@ -0,0 +1,80 @@
import os
# import json
import numpy as np
import pandas as pd
import openai
from haystack.schema import Document
import streamlit as st
from tenacity import retry, stop_after_attempt, wait_random_exponential


# Get openai API key
openai.api_key = os.environ["OPENAI_API_KEY"]
model_select = "gpt-3.5-turbo-1106"


# define a special function for putting the prompt together (as we can't use haystack)
def get_prompt(docs):
    base_prompt = "Provide a single paragraph summary of the documents provided below. \
        Formulate your answer in the style of an academic report."
    # Add the meta data for references
    context = ' - '.join([d.content for d in docs])
    prompt = base_prompt + "; Context: " + context + "; Answer:"

    return prompt


# convert df rows to Document objects so we can feed them into the summarizer easily
def get_document(df):
    # we take a list of each extract
    ls_dict = []
    for index, row in df.iterrows():
        # Create a Document object for each row (we only need the text)
        doc = Document(
            row['text'],
            meta={
                'filename': row['filename']}
        )
        # Append the Document object to the documents list
        ls_dict.append(doc)

    return ls_dict


# exception handling for issuing multiple API calls to openai (exponential backoff)
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(**kwargs):
    return openai.ChatCompletion.create(**kwargs)


# construct RAG query, send to openai and process response
def run_query(df):
    docs = get_document(df)

    # For a non-streamed completion, enable the following 2 lines and comment out the code below:
    # res = openai.ChatCompletion.create(model=model_select, messages=[{"role": "user", "content": get_prompt(docs)}])
    # result = res.choices[0].message.content

    # instantiate ChatCompletion as a generator object (stream is set to True)
    response = completion_with_backoff(model=model_select, messages=[{"role": "user", "content": get_prompt(docs)}], stream=True)
    # iterate through the streamed output
    report = []
    res_box = st.empty()
    for chunk in response:
        # extract the object containing the text (totally different structure when streaming)
        chunk_message = chunk['choices'][0]['delta']
        # test to make sure there is text in the object (some chunks don't have any)
        if 'content' in chunk_message:
            report.append(chunk_message.content)  # extract the message
            # add the latest text and merge it with all previous chunks
            result = "".join(report).strip()
            res_box.success(result)  # output to the response text box

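To make the prompt construction concrete, the sketch below reproduces what get_document() and get_prompt() assemble before run_query() sends it to the API; no OpenAI call is made, and the sample rows are invented for illustration only (they mimic the shape of combined_files_df produced by doc_processing.app()):

import pandas as pd
from haystack.schema import Document

# Toy rows shaped like the 'combined_files_df' DataFrame (text + filename columns)
df_docs = pd.DataFrame({
    'text': ['Support programmes for smallholder farmers.',
             'Measures targeting displaced persons.'],
    'filename': ['KE_First_NDC.pdf', 'KE_First_NDC.pdf'],
})

# mirror get_document(): one Haystack Document per row
docs = [Document(row['text'], meta={'filename': row['filename']}) for _, row in df_docs.iterrows()]

# mirror get_prompt(): base instruction + concatenated paragraph context
base_prompt = ("Provide a single paragraph summary of the documents provided below. "
               "Formulate your answer in the style of an academic report.")
prompt = base_prompt + "; Context: " + ' - '.join(d.content for d in docs) + "; Answer:"
print(prompt)
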
appStore/vulnerability_analysis.py
ADDED
@@ -0,0 +1,69 @@
# set path
import glob, os, sys
sys.path.append('../utils')

# import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from utils.vulnerability_classifier import load_vulnerabilityClassifier, vulnerability_classification
import logging
logger = logging.getLogger(__name__)
from utils.config import get_classifier_params
from utils.preprocessing import paraLengthCheck
from io import BytesIO
import xlsxwriter
import plotly.express as px


# Declare all the necessary variables
classifier_identifier = 'vulnerability'
params = get_classifier_params(classifier_identifier)

@st.cache_data
def to_excel(df, sectorlist):
    len_df = len(df)
    output = BytesIO()
    writer = pd.ExcelWriter(output, engine='xlsxwriter')
    df.to_excel(writer, index=False, sheet_name='Sheet1')
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    worksheet.data_validation('S2:S{}'.format(len_df),
                              {'validate': 'list',
                               'source': ['No', 'Yes', 'Discard']})
    worksheet.data_validation('X2:X{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('T2:T{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('U2:U{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('V2:V{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    worksheet.data_validation('W2:U{}'.format(len_df),
                              {'validate': 'list',
                               'source': sectorlist + ['Blank']})
    writer.save()
    processed_data = output.getvalue()
    return processed_data


def app():
    with st.container():
        if 'combined_files_df' in st.session_state:
            combined_files_df = st.session_state['combined_files_df']
            classifier = load_vulnerabilityClassifier(classifier_name=params['model_name'])
            st.session_state['{}_classifier'.format(classifier_identifier)] = classifier

            combined_files_df = vulnerability_classification(haystack_doc=combined_files_df,
                                                             threshold=params['threshold'])

            st.session_state['combined_files_df'] = combined_files_df

docStore/.DS_Store
ADDED
Binary file (6.15 kB)

docStore/sample/KE_First_NDC.pdf
ADDED
Binary file (214 kB)

docStore/sample/PH_First_NDC.pdf
ADDED
Binary file (136 kB)

docStore/sample/files.json
ADDED
@@ -0,0 +1,3 @@
{"Kenya: First NDC": "docStore/sample/KE_First_NDC.pdf",
 "Philippines: First NDC": "docStore/sample/PH_First_NDC.pdf"
}

packages.txt
ADDED
@@ -0,0 +1,4 @@
poppler-utils
xpdf
tesseract-ocr
libtesseract-dev

paramconfig.cfg
ADDED
@@ -0,0 +1,19 @@
[preprocessing]
THRESHOLD = 0.50
MODEL = garbage
SPLIT_BY = word
REMOVE_PUNC = 0
SPLIT_LENGTH = 60
SPLIT_OVERLAP = 5
RESPECT_SENTENCE_BOUNDARY = 1
TOP_KEY = 10

[vulnerability]
THRESHOLD = 0.50
MODEL = leavoigt/vulnerable_groups
SPLIT_BY = word
REMOVE_PUNC = 0
SPLIT_LENGTH = 60
SPLIT_OVERLAP = 5
RESPECT_SENTENCE_BOUNDARY = 1
TOP_KEY = 10

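These keys are consumed by get_classifier_params() in utils/config.py (added further down in this commit). A short sketch of that mapping, with the values that the [vulnerability] section above should produce shown as comments:

from utils.config import get_classifier_params

params = get_classifier_params('vulnerability')
# Expected shape, given paramconfig.cfg above:
# {'model_name': 'leavoigt/vulnerable_groups', 'split_by': 'word',
#  'split_length': 60, 'split_overlap': 5, 'remove_punc': False,
#  'split_respect_sentence_boundary': True, 'threshold': 0.5, 'top_n': 10}
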
requirements.txt
ADDED
@@ -0,0 +1,24 @@
farm-haystack == 1.16
farm-haystack[ocr,pdf]==1.16.0
spacy==3.2.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
matplotlib==3.5.1
nltk==3.7
numpy==1.22.1
pandas==1.4.0
pdfplumber==0.6.2
Pillow==9.1.1
seaborn==0.11.2
transformers==4.25.1
st-annotated-text==3.0.0
markdown==3.4.1
summa==1.2.0
plotly
xlsxwriter
altair==4.0
streamlit-aggrid
python-docx
setfit
plotly.express
openai==0.27.9
pydantic==1.8.2

style.css
ADDED
@@ -0,0 +1,179 @@
.row-widget.stTextInput > div:first-of-type {
  background: #fff;
  display: flex;
  border: 1px solid #dfe1e5;
  box-shadow: none;
  border-radius: 24px;
  height: 50px;
  width: auto;
  margin: 10px auto 30px;
}

.row-widget.stTextInput > div:first-of-type:hover,
.row-widget.stTextInput > div:first-of-type:focus {
  box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
}

.row-widget.stTextInput .st-bq {
  background-color: #fff;
}

.row-widget.stTextInput > label {
  color: #b3b3b3;
}

.row-widget.stButton > button {
  border-radius: 24px;
  background-color: #B6C9B1;
  color: #fff;
  border: none;
  padding: 6px 20px;
  float: right;
  background-image: none;
}

.row-widget.stButton > button:hover {
  box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
}

.row-widget.stButton > button:focus {
  border: none;
  color: #fff;
}

.footer-custom {
  position: fixed;
  bottom: 0;
  width: 100%;
  color: var(--text-color);
  max-width: 698px;
  font-size: 14px;
  height: 50px;
  padding: 10px 0;
  z-index: 50;
}

.main {
  padding: 20px;
}

footer {
  display: none !important;
}

.footer-custom a {
  color: var(--text-color);
}

#wikipedia-assistant {
  font-size: 36px;
}

.generated-answer p {
  font-size: 16px;
  font-weight: bold;
}

.react-json-view {
  margin: 40px 0 80px;
}

.tooltip {
  text-align: center;
  line-height: 20px;
  display: table-caption;
  font-size: 10px;
  border-radius: 50%;
  height: 20px;
  width: 20px;
  position: relative;
  cursor: pointer;
  color: #000;
}

.tooltip .tooltiptext {
  visibility: hidden;
  width: 280px;
  text-align: center;
  border-radius: 6px;
  padding: 10px;
  position: absolute;
  z-index: 1;
  top: 25px;
  left: 50%;
  margin-left: -140px;
  font-size: 14px;
  background-color: #fff;
  border: 1px solid #ccc;
  box-shadow: 0px 0px 3px 1px rgba(0, 0, 0, 0.16);
  color: #000;
}

.tooltip:hover .tooltiptext {
  visibility: visible;
}

.sentence-wrapper {
  border-left: 4px solid #ffc423;
  padding-left: 20px;
  margin-bottom: 40px;
}

#context {
  padding: 2rem 0 1rem;
}

hr {
  margin: 2em 0 1em;
}

.technical-details-info {
  margin-bottom: 100px;
}

.loader-wrapper {
  display: flex;
  align-items: center;
  background-color: rgba(250, 202, 43, 0.2);
  padding: 15px 20px;
  border-radius: 6px;
}

.loader-wrapper p {
  margin-bottom: 0;
  margin-left: 20px;
}

.loader {
  width: 30px;
  height: 30px;
  border: dotted 5px #868686;
  border-radius: 100%;
  animation: spin 1s linear infinite;
}

.loader-note {
  font-size: 14px;
  color: #b3b3b3;
  margin-left: 5px;
}

@keyframes spin {
  0% {
    transform: rotate(0deg) scale(0.8);
    border-top-color: transparent;
    border-right-color: transparent;
  }
  50% {
    transform: rotate(180deg) scale(1.2);
    border-color: #949494;
    border-top-color: transparent;
    border-right-color: transparent;
  }
  100% {
    transform: rotate(360deg) scale(0.8);
    border-color: #bbbbbb;
    border-top-color: transparent;
    border-right-color: transparent;
  }
}

utils/.DS_Store
ADDED
Binary file (6.15 kB)

utils/__init__.py
ADDED
File without changes

utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (135 Bytes)

utils/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (237 Bytes)

utils/__pycache__/config.cpython-310.pyc
ADDED
Binary file (1.08 kB)

utils/__pycache__/config.cpython-38.pyc
ADDED
Binary file (1.19 kB)

utils/__pycache__/preprocessing.cpython-310.pyc
ADDED
Binary file (9.05 kB)

utils/__pycache__/preprocessing.cpython-38.pyc
ADDED
Binary file (9.13 kB)

utils/__pycache__/uploadAndExample.cpython-310.pyc
ADDED
Binary file (1.27 kB)

utils/__pycache__/vulnerability_classifier.cpython-310.pyc
ADDED
Binary file (4.2 kB)

utils/__pycache__/vulnerability_classifier.cpython-38.pyc
ADDED
Binary file (4.2 kB)

utils/config.py
ADDED
@@ -0,0 +1,31 @@
import configparser
import logging

def getconfig(configfile_path: str):
    """
    configfile_path: file path of the .cfg file
    """

    config = configparser.ConfigParser()

    try:
        config.read_file(open(configfile_path))
        return config
    except:
        logging.warning("config file not found")


# Declare all the necessary variables
def get_classifier_params(model_name):
    config = getconfig('paramconfig.cfg')
    params = {}
    params['model_name'] = config.get(model_name, 'MODEL')
    params['split_by'] = config.get(model_name, 'SPLIT_BY')
    params['split_length'] = int(config.get(model_name, 'SPLIT_LENGTH'))
    params['split_overlap'] = int(config.get(model_name, 'SPLIT_OVERLAP'))
    params['remove_punc'] = bool(int(config.get(model_name, 'REMOVE_PUNC')))
    params['split_respect_sentence_boundary'] = bool(int(config.get(model_name, 'RESPECT_SENTENCE_BOUNDARY')))
    params['threshold'] = float(config.get(model_name, 'THRESHOLD'))
    params['top_n'] = int(config.get(model_name, 'TOP_KEY'))

    return params

utils/preprocessing.py
ADDED
@@ -0,0 +1,291 @@
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
from typing import Callable, Dict, List, Optional, Text, Tuple, Union
from typing_extensions import Literal
import pandas as pd
import logging
import re
import string
from haystack.pipelines import Pipeline

def useOCR(file_path: str) -> Text:
    """
    Converts image PDFs into text, using farm-haystack[OCR].

    Params
    ----------
    file_path: file path of the uploaded file, returned by the add_upload function in
        uploadAndExample.py

    Returns the text of the file as a string.
    """

    converter = PDFToTextOCRConverter(remove_numeric_tables=True,
                                      valid_languages=["eng"])
    docs = converter.convert(file_path=file_path, meta=None)
    return docs[0].content


class FileConverter(BaseComponent):
    """
    Wrapper class to convert an uploaded document into text by calling the appropriate
    converter class; it will internally use the haystack PDFToTextOCR converter in case
    of an image PDF. We cannot use the FileClassifier from haystack as it doesn't have
    any label/output class for images.

    1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
    2. https://docs.haystack.deepset.ai/docs/file_converters
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
    4. https://docs.haystack.deepset.ai/reference/file-converters-api
    """

    outgoing_edges = 1

    def run(self, file_name: str, file_path: str, encoding: Optional[str] = None,
            id_hash_keys: Optional[List[str]] = None,
            ) -> Tuple[dict, str]:
        """
        This method is required to invoke the component in the pipeline implementation.

        Params
        ----------
        file_name: name of the file
        file_path: file path of the uploaded file, returned by the add_upload function in
            uploadAndExample.py

        See the links provided in the class docstring/description for the other params.

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
            we need to return. In this case it is the list of Haystack Documents.

        output_1: as there is only one outgoing edge, we pass the 'output_1' string
        """
        try:
            if file_name.endswith('.pdf'):
                converter = PDFToTextConverter(remove_numeric_tables=True)
            if file_name.endswith('.txt'):
                converter = TextConverter(remove_numeric_tables=True)
            if file_name.endswith('.docx'):
                converter = DocxToTextConverter()
        except Exception as e:
            logging.error(e)
            return

        documents = []

        # encoding is empty, probably should be utf-8
        document = converter.convert(
            file_path=file_path, meta=None,
            encoding=encoding, id_hash_keys=id_hash_keys
        )[0]

        text = document.content

        # in case of a scanned/image-only PDF the content might contain only
        # the page separator (\f or \x0c). We check if that is the case and
        # use OCR to get the text.
        filtered = re.sub(r'\x0c', '', text)

        if filtered == "":
            logging.info("Using OCR")
            text = useOCR(file_path)

        documents.append(Document(content=text,
                                  meta={"name": file_name},
                                  id_hash_keys=id_hash_keys))

        logging.info('file conversion successful')
        output = {'documents': documents}
        return output, 'output_1'

    def run_batch(self):
        """
        We don't have a requirement to process multiple files in one go,
        therefore nothing happens here; however, to use the custom node we need
        this method on the class.
        """
        return


def basic(s: str, remove_punc: bool = False):
    """
    Performs basic cleaning of text.

    Params
    ----------
    s: string to be processed
    remove_punc: whether to remove all punctuation, including ',' and '.'

    Returns: processed string; see comments in the source code for more info
    """

    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)

    # Remove new line characters
    s = re.sub('\n', ' ', s)

    # Remove punctuation
    if remove_punc == True:
        translator = str.maketrans(' ', ' ', string.punctuation)
        s = s.translate(translator)
    # Remove distracting single quotes and dotted pattern
    s = re.sub("\'", " ", s)
    s = s.replace("..", "")

    return s.strip()


def paraLengthCheck(paraList, max_len=100):
    """
    There are cases where the preprocessor cannot respect the word limit when using
    the respect-sentence-boundary flag, due to missing sentence boundaries.
    Therefore we run one more round of splitting here for those paragraphs.

    Params
    ---------------
    paraList: list of paragraphs/text
    max_len: max length to be respected by sentences which bypassed the
        preprocessor strategy
    """
    new_para_list = []
    for passage in paraList:
        # check if the para exceeds the word limit
        if len(passage.content.split()) > max_len:
            # we might need a few iterations; for example if para = 512 tokens
            # we need to iterate 5 times to reduce the para to the size limit of 100
            iterations = int(len(passage.content.split()) / max_len)
            for i in range(iterations):
                temp = " ".join(passage.content.split()[max_len * i:max_len * (i + 1)])
                new_para_list.append((temp, passage.meta['page']))
            temp = " ".join(passage.content.split()[max_len * (i + 1):])
            new_para_list.append((temp, passage.meta['page']))
        else:
            # paragraphs which don't need any splitting
            new_para_list.append((passage.content, passage.meta['page']))

    logging.info("New paragraphs length {}".format(len(new_para_list)))
    return new_para_list


class UdfPreProcessor(BaseComponent):
    """
    Class to preprocess the document returned by FileConverter. It checks the
    splitting strategy, splits the document by word or sentence, and then
    synthetically creates the paragraphs.

    1. https://docs.haystack.deepset.ai/docs/preprocessor
    2. https://docs.haystack.deepset.ai/reference/preprocessor-api
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
    """
    outgoing_edges = 1

    def run(self, documents: List[Document], remove_punc: bool = False,
            split_by: Literal["sentence", "word"] = 'sentence',
            split_length: int = 2, split_respect_sentence_boundary: bool = False,
            split_overlap: int = 0):
        """
        This method is required to invoke the component in the pipeline implementation.

        Params
        ----------
        documents: documents from the output dictionary returned by FileConverter
        remove_punc: whether to remove all punctuation, including ',' and '.'
        split_by: document splitting strategy, either 'word' or 'sentence'
        split_length: when synthetically creating the paragraphs from the document,
            it defines the length of a paragraph.
        split_respect_sentence_boundary: used with the 'word' strategy for
            splitting of text.
        split_overlap: number of words or sentences that overlap when creating
            the paragraphs. This is done as one sentence or 'some words' make sense
            when read together with others; therefore the overlap is used.

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
            we need to return. In this case the output will contain 4 objects:
            the paragraph text list as a List, the Haystack Documents, a DataFrame and
            one raw text file.

        output_1: as there is only one outgoing edge, we pass the 'output_1' string
        """

        if split_by == 'sentence':
            split_respect_sentence_boundary = False
        else:
            split_respect_sentence_boundary = split_respect_sentence_boundary

        preprocessor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by=split_by,
            split_length=split_length,
            split_respect_sentence_boundary=split_respect_sentence_boundary,
            split_overlap=split_overlap,

            # will add the page number only in case of PDF, not for text/docx files.
            add_page_number=True
        )

        for i in documents:
            # # basic cleaning before passing it to the preprocessor.
            # i = basic(i)
            docs_processed = preprocessor.process([i])
            for item in docs_processed:
                item.content = basic(item.content, remove_punc=remove_punc)

        df = pd.DataFrame(docs_processed)
        all_text = " ".join(df.content.to_list())
        para_list = df.content.to_list()
        logging.info('document split into {} paragraphs'.format(len(para_list)))
        output = {'documents': docs_processed,
                  'dataframe': df,
                  'text': all_text,
                  'paraList': para_list
                  }
        return output, "output_1"

    def run_batch(self):
        """
        We don't have a requirement to process multiple files in one go,
        therefore nothing happens here; however, to use the custom node we need
        this method on the class.
        """
        return


def processingpipeline():
    """
    Returns the preprocessing pipeline, using FileConverter and UdfPreProcessor
    from utils.preprocessing.
    """

    preprocessing_pipeline = Pipeline()
    file_converter = FileConverter()
    custom_preprocessor = UdfPreProcessor()

    preprocessing_pipeline.add_node(component=file_converter,
                                    name="FileConverter", inputs=["File"])
    preprocessing_pipeline.add_node(component=custom_preprocessor,
                                    name='UdfPreProcessor', inputs=["FileConverter"])

    return preprocessing_pipeline

ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import tempfile
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
def add_upload(choice):
|
| 6 |
+
if choice == 'Upload Document':
|
| 7 |
+
uploaded_files = st.sidebar.file_uploader('Upload Files',
|
| 8 |
+
type=['pdf', 'docx', 'txt'],
|
| 9 |
+
accept_multiple_files=True)
|
| 10 |
+
|
| 11 |
+
if uploaded_files is not None:
|
| 12 |
+
# Clear previous uploaded files from session state
|
| 13 |
+
for key in list(st.session_state.keys()):
|
| 14 |
+
if key.startswith('filename') or key.startswith('filepath'):
|
| 15 |
+
del st.session_state[key]
|
| 16 |
+
|
| 17 |
+
# Process and store each uploaded file
|
| 18 |
+
for index, uploaded_file in enumerate(uploaded_files):
|
| 19 |
+
with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp:
|
| 20 |
+
bytes_data = uploaded_file.getvalue()
|
| 21 |
+
temp.write(bytes_data)
|
| 22 |
+
st.session_state[f'filename_{index}'] = uploaded_file.name
|
| 23 |
+
st.session_state[f'filepath_{index}'] = temp.name
|
| 24 |
+
|
| 25 |
+
else: # Handle example document selection
|
| 26 |
+
# listing the options
|
| 27 |
+
with open('docStore/sample/files.json', 'r') as json_file:
|
| 28 |
+
files = json.load(json_file)
|
| 29 |
+
|
| 30 |
+
option = st.sidebar.selectbox('Select the example document',
|
| 31 |
+
list(files.keys()))
|
| 32 |
+
file_path = files[option]
|
| 33 |
+
st.session_state['filename_0'] = file_path # Use 'filename_0' to align with the upload naming convention
|
| 34 |
+
st.session_state['filepath_0'] = file_path # Use 'filepath_0' for consistency
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# get the filenames from the processed docs dataframe so we can use for tab names
|
| 38 |
+
def get_tabs(uploaded_docs):
|
| 39 |
+
tabs = []
|
| 40 |
+
for doc_name in uploaded_docs:
|
| 41 |
+
tab_title = doc_name # Assuming doc_name is a string with the file name
|
| 42 |
+
tabs.append(tab_title)
|
| 43 |
+
return tabs
|
| 44 |
+
|
utils/vulnerability_classifier.py
ADDED
@@ -0,0 +1,156 @@
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.config import getconfig
from utils.preprocessing import processingpipeline
import streamlit as st
from transformers import pipeline
from setfit import SetFitModel

label_dict = {0: 'Agricultural communities',
              1: 'Children',
              2: 'Coastal communities',
              3: 'Ethnic, racial or other minorities',
              4: 'Fishery communities',
              5: 'Informal sector workers',
              6: 'Members of indigenous and local communities',
              7: 'Migrants and displaced persons',
              8: 'Older persons',
              9: 'Other',
              10: 'Persons living in poverty',
              11: 'Persons with disabilities',
              12: 'Persons with pre-existing health conditions',
              13: 'Residents of drought-prone regions',
              14: 'Rural populations',
              15: 'Sexual minorities (LGBTQI+)',
              16: 'Urban populations',
              17: 'Women and other genders'}

def getlabels(preds):
    # Get label names
    preds_list = preds.tolist()

    predictions_names = []

    # loop through each prediction
    for ele in preds_list:

        # see if there is a value 1 and retrieve its index
        try:
            index_of_one = ele.index(1)
        except ValueError:
            index_of_one = "NA"

        # Retrieve the name of the label (if no prediction was made = NA)
        if index_of_one != "NA":
            name = label_dict[index_of_one]
        else:
            name = "Other"

        # Append name to list
        predictions_names.append(name)

    return predictions_names

@st.cache_resource
def load_vulnerabilityClassifier(config_file: str = None, classifier_name: str = None):
    """
    Loads the document classifier using haystack, where the name/path of the model
    on the HF hub is used as a string to fetch the model object. Either a config file
    or a model name should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier

    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if a model name is passed, it takes priority; if not
        found, the config file is used, otherwise an error is raised.

    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('vulnerability', 'MODEL')

    logging.info("Loading vulnerability classifier")
    # we are using the pipeline as the model is multilabel and DocumentClassifier
    # from Haystack doesn't support multilabel.
    # in the pipeline we use 'sigmoid' to explicitly tell the pipeline to make it multilabel;
    # if not, it will automatically use softmax, which is not desired.
    # doc_classifier = TransformersDocumentClassifier(
    #                     model_name_or_path=classifier_name,
    #                     task="text-classification",
    #                     top_k = None)

    # # Download model from HF Hub
    doc_classifier = SetFitModel.from_pretrained("leavoigt/vulnerable_groups")

    # doc_classifier = pipeline("text-classification",
    #                           model=classifier_name,
    #                           return_all_scores=True,
    #                           function_to_apply= "sigmoid")

    return doc_classifier


@st.cache_data
def vulnerability_classification(haystack_doc: pd.DataFrame,
                                 threshold: float = 0.5,
                                 classifier_model: pipeline = None
                                 ) -> Tuple[DataFrame, Series]:
    """
    Text classification on the list of texts provided. The classifier provides the
    most appropriate label for each text; the labels indicate which vulnerable
    group (if any) a paragraph refers to.

    Params
    ---------
    haystack_doc: the output of the preprocessing pipeline contains the list of
        paragraphs in different formats; here the DataFrame of paragraphs is used.
    threshold: threshold value for the model to keep the results from the classifier
    classifier_model: you can pass the classifier model directly, which takes priority;
        if not, the model is looked up in the Streamlit session state.
        In case of Streamlit, avoid passing the model directly.

    Returns
    ----------
    haystack_doc: DataFrame with a 'Vulnerability Label' column added alongside 'text'.
    """
    logging.info("Working on vulnerability identification")
    haystack_doc['Vulnerability Label'] = 'NA'
    # haystack_doc['PA_check'] = haystack_doc['Policy-Action Label'].apply(lambda x: True if len(x) != 0 else False)

    # df1 = haystack_doc[haystack_doc['PA_check'] == True]
    # df = haystack_doc[haystack_doc['PA_check'] == False]
    if not classifier_model:
        classifier_model = st.session_state['vulnerability_classifier']

    predictions = classifier_model(list(haystack_doc.text))

    pred_labels = getlabels(predictions)

    haystack_doc['Vulnerability Label'] = pred_labels
    # placeholder = {}
    # for j in range(len(temp)):
    #     placeholder[temp[j]['label']] = temp[j]['score']
    # list_.append(placeholder)
    # labels_ = [{**list_[l]} for l in range(len(predictions))]
    # truth_df = DataFrame.from_dict(labels_)
    # truth_df = truth_df.round(2)
    # truth_df = truth_df.astype(float) >= threshold
    # truth_df = truth_df.astype(str)
    # categories = list(truth_df.columns)
    # truth_df['Vulnerability Label'] = truth_df.apply(lambda x: {i if x[i]=='True' else
    #     None for i in categories}, axis=1)
    # truth_df['Vulnerability Label'] = truth_df.apply(lambda x: list(x['Vulnerability Label']
    #     -{None}), axis=1)
    # haystack_doc['Vulnerability Label'] = list(truth_df['Vulnerability Label'])
    return haystack_doc