prashant committed
Commit a4bf4e8 · Parent(s): 2bccbcb

adding semantic search

Files changed:
- appStore/keyword_search.py +14 -5
- appStore/sdg_analysis.py +1 -1
- paramconfig.cfg +6 -2
- utils/search.py +120 -13
appStore/keyword_search.py  CHANGED

@@ -6,6 +6,7 @@ import streamlit as st
 import json
 import logging
 from utils.search import runLexicalPreprocessingPipeline, lexical_search
+from utils.search import runSemanticPreprocessingPipeline, semantic_search
 
 def app():
 
@@ -46,11 +47,13 @@ def app():
     else:
         keywordList = None
 
-    searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context",
+    searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context",
+                              ['Exact Matches', 'Similar context/meaning'])
 
     with st.container():
         if keywordList is not None:
-            queryList = st.text_input("You selcted the {} category we
+            queryList = st.text_input("You selcted the {} category we \
+                            will look for these keywords in document".format(genre),
                             value="{}".format(keywordList))
         else:
             queryList = st.text_input("Please enter here your question and we will look \
@@ -67,13 +70,19 @@ def app():
         logging.warning("Terminated as no keyword provided")
     else:
         if 'filepath' in st.session_state:
-
+
 
             if searchtype == 'Exact Matches':
-
+                paraList = runLexicalPreprocessingPipeline()
                 logging.info("performing lexical search")
-                # token_list = tokenize_lexical_query(queryList)
                 with st.spinner("Performing Exact matching search (Lexical search) for you"):
                     st.markdown("##### Top few lexical search (TFIDF) hits #####")
                     lexical_search(queryList,paraList)
+            else:
+                paraList = runSemanticPreprocessingPipeline()
+                logging.info("starting semantic search")
+                with st.spinner("Performing Similar/Contextual search"):
+                    st.markdown("##### Top few semantic search results #####")
+                    semantic_search(queryList,paraList,show_answers=True)
+
 
appStore/sdg_analysis.py  CHANGED

@@ -47,7 +47,7 @@ def app():
     if 'filepath' in st.session_state:
         paraList = runSDGPreprocessingPipeline()
         if len(paraList) > 150:
-            warning_msg = ": This might take
+            warning_msg = ": This might take sometime, please sit back and relax."
         else:
             warning_msg = ""
 
paramconfig.cfg  CHANGED

@@ -6,9 +6,13 @@ SPLIT_LENGTH = 3
 SPLIT_OVERLAP = 0
 
 [semantic_search]
-
+RETRIEVER_TOP_K = 10
 MAX_SEQ_LENGTH = 64
-
+RETRIEVER = msmarco-bert-base-dot-v5
+RETRIEVER_FORMAT = sentence_transformers
+RETRIEVER_EMB_LAYER = -1
+READER = deepset/tinyroberta-squad2
+READER_TOP_K = 5
 THRESHOLD = 0.1
 SPLIT_BY = sentence
 SPLIT_LENGTH = 3
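For context, the new [semantic_search] keys are consumed at runtime with Python's configparser, the same way utils/search.py below reads them. A minimal sketch (the values in the comments are the ones added in this commit):

    import configparser

    # Load the search settings from paramconfig.cfg (extended in this commit)
    config = configparser.ConfigParser()
    config.read_file(open('paramconfig.cfg'))

    retriever_model = config.get('semantic_search', 'RETRIEVER')              # msmarco-bert-base-dot-v5
    retriever_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))   # 10
    reader_model    = config.get('semantic_search', 'READER')                 # deepset/tinyroberta-squad2
    reader_top_k    = int(config.get('semantic_search', 'READER_TOP_K'))      # 5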
utils/search.py  CHANGED

@@ -1,4 +1,6 @@
-from haystack.nodes import TfidfRetriever
+from haystack.nodes import TfidfRetriever, TransformersQueryClassifier
+from haystack.nodes import EmbeddingRetriever, FARMReader
+from haystack.nodes.base import BaseComponent
 from haystack.document_stores import InMemoryDocumentStore
 import configparser
 import spacy
@@ -8,8 +10,9 @@ import streamlit as st
 from markdown import markdown
 from annotated_text import annotation
 from haystack.schema import Document
-from typing import List,
+from typing import List, Text
 from utils.preprocessing import processingpipeline
+from haystack.pipelines import Pipeline
 
 config = configparser.ConfigParser()
 config.read_file(open('paramconfig.cfg'))
@@ -142,7 +145,7 @@ def lexical_search(query:Text,documents:List[Document]):
         # if result.content != "":
         matches, doc = runSpacyMatcher(query_tokens,result.content)
         if len(matches) != 0:
-            st.write("Result {}".format(count))
+            st.write("Result {}".format(count+1))
             searchAnnotator(matches, doc)
 
 def runLexicalPreprocessingPipeline()->List[Document]:
@@ -153,19 +156,19 @@ def runLexicalPreprocessingPipeline()->List[Document]:
     Return
     --------------
     List[Document]: When preprocessing pipeline is run, the output dictionary
-                    has four objects. For the
+                    has four objects. For the lexicaal search using TFIDFRetriever we
                     need to use the List of Haystack Document, which can be fetched by
                     key = 'documents' on output.
 
     """
     file_path = st.session_state['filepath']
     file_name = st.session_state['filename']
-
+    lexical_processing_pipeline = processingpipeline()
    split_by = config.get('lexical_search','SPLIT_BY')
    split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
    split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
 
-    output_lexical_pre =
+    output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
             params= {"FileConverter": {"file_path": file_path, \
                                        "file_name": file_name},
                      "UdfPreProcessor": {"removePunc": False, \
@@ -183,19 +186,19 @@ def runSemanticPreprocessingPipeline()->List[Document]:
     Return
     --------------
     List[Document]: When preprocessing pipeline is run, the output dictionary
-                    has four objects. For the Haysatck implementation of
+                    has four objects. For the Haysatck implementation of semantic search we,
                     need to use the List of Haystack Document, which can be fetched by
                     key = 'documents' on output.
 
     """
     file_path = st.session_state['filepath']
     file_name = st.session_state['filename']
-
-    split_by = config.get('
-    split_length = int(config.get('
-    split_overlap = int(config.get('
+    semantic_processing_pipeline = processingpipeline()
+    split_by = config.get('semantic_search','SPLIT_BY')
+    split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
+    split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
 
-
+    output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
             params= {"FileConverter": {"file_path": file_path, \
                                        "file_name": file_name},
                      "UdfPreProcessor": {"removePunc": False, \
@@ -203,4 +206,108 @@ def runSemanticPreprocessingPipeline()->List[Document]:
                      "split_length":split_length,\
                      "split_overlap": split_overlap}})
 
-    return
+    return output_semantic_pre['documents']
+
+class QueryCheck(BaseComponent):
+
+    outgoing_edges = 1
+
+    def run(self, query):
+
+        query_classifier = TransformersQueryClassifier(model_name_or_path=
+                            "shahrukhx01/bert-mini-finetune-question-detection")
+
+        result = query_classifier.run(query=query)
+
+        if result[1] == "output_1":
+            output = {"query":query,
+                      "query_type": 'question/statement'}
+        else:
+            output = {"query": "find all issues related to {}".format(query),
+                      "query_type": 'statements/keyword'}
+
+        return output, "output_1"
+
+    def run_batch(self, query):
+        pass
+
+
+def semanticSearchPipeline(documents, show_answers = False):
+    document_store = InMemoryDocumentStore()
+    document_store.write_documents(documents)
+
+    embedding_model = config.get('semantic_search','RETRIEVER')
+    embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
+    embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
+    retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
+
+    querycheck = QueryCheck()
+    retriever = EmbeddingRetriever(
+        document_store=document_store,
+        embedding_model=embedding_model,top_k = retriever_top_k,
+        emb_extraction_layer=embedding_layer, scale_score =True,
+        model_format=embedding_model_format, use_gpu = True)
+    document_store.update_embeddings(retriever)
+
+    semanticsearch_pipeline = Pipeline()
+    semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
+                                     inputs = ["Query"])
+    semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
+                                     inputs = ["QueryCheck.output_1"])
+    if show_answers == True:
+        reader_model = config.get('semantic_search','READER')
+        reader_top_k = retriever_top_k
+        reader = FARMReader(model_name_or_path=reader_model,
+                            top_k = reader_top_k, use_gpu=True)
+
+        semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
+                                         inputs= ["EmbeddingRetriever"])
+
+    return semanticsearch_pipeline, document_store
+
+def semantic_search(query:Text,documents:List[Document],show_answers = False):
+    """
+    Performs the Lexical search on the List of haystack documents which is
+    returned by preprocessing Pipeline.
+    """
+    threshold = 0.4
+    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
+                                            show_answers=show_answers)
+    results = semanticsearch_pipeline.run(query = query)
+
+    if show_answers == False:
+        results = results['documents']
+        for i,queryhit in enumerate(results):
+
+            if queryhit.score > threshold:
+                st.write("\t {}: \t {}".format(i+1, queryhit.content.replace("\n", " ")))
+                st.markdown("---")
+
+    else:
+        matches = []
+        doc = []
+        for answer in results['answers']:
+            if answer.score >0.01:
+                temp = answer.to_dict()
+                start_idx = temp['offsets_in_document'][0]['start']
+                end_idx = temp['offsets_in_document'][0]['end']
+
+                matches.append([start_idx,end_idx])
+                doc.append(doc_store.get_document_by_id(temp['document_id']).content)
+        searchAnnotator(matches,doc)
+
+    return results
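Taken together, the commit wires the new semantic path as follows (mirroring the call sequence in appStore/keyword_search.py above): preprocess the uploaded file into Haystack Documents, then run the retriever/reader pipeline over them. A minimal sketch, assuming it runs inside the Streamlit app; the file path, file name and query below are hypothetical placeholders:

    import streamlit as st
    from utils.search import runSemanticPreprocessingPipeline, semantic_search

    # Hypothetical values; in the app these are set by the upload step elsewhere.
    st.session_state['filepath'] = '/tmp/sample_policy.pdf'
    st.session_state['filename'] = 'sample_policy.pdf'

    # Split the document into paragraphs (List[Document]) using the
    # [semantic_search] split settings from paramconfig.cfg.
    paraList = runSemanticPreprocessingPipeline()

    # Retrieve contextually similar passages; with show_answers=True the
    # FARMReader also extracts answer spans, which are annotated in the UI.
    semantic_search("find all issues related to climate adaptation", paraList, show_answers=True)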