prashant committed on
Commit
63da636
1 Parent(s): 40cb026

update semantic search

Browse files
utils/{search.py → lexical_search.py} RENAMED
@@ -1,5 +1,4 @@
1
- from haystack.nodes import TfidfRetriever, TransformersQueryClassifier
2
- from haystack.nodes import EmbeddingRetriever, FARMReader
3
  from haystack.nodes.base import BaseComponent
4
  from haystack.document_stores import InMemoryDocumentStore
5
  import configparser
@@ -101,7 +100,7 @@ def runRegexMatcher(token_list:List[str], document:Text):
101
 
102
  return matches, document
103
 
104
- def searchAnnotator(matches: List[List[int]], document):
105
  """
106
  Annotates the text in the document defined by list of [start index, end index]
107
  Example: "How are you today", if document type is text, matches = [[0,3]]
@@ -127,27 +126,6 @@ def searchAnnotator(matches: List[List[int]], document):
127
  unsafe_allow_html=True,
128
  )
129
 
130
- def lexical_search(query:Text,documents:List[Document]):
131
- """
132
- Performs the Lexical search on the List of haystack documents which is
133
- returned by preprocessing Pipeline.
134
- """
135
-
136
- document_store = InMemoryDocumentStore()
137
- document_store.write_documents(documents)
138
-
139
- # Haystack Retriever works with document stores only.
140
- retriever = TfidfRetriever(document_store)
141
- results = retriever.retrieve(query=query,
142
- top_k= int(config.get('lexical_search','TOP_K')))
143
- query_tokens = tokenize_lexical_query(query)
144
- for count, result in enumerate(results):
145
- # if result.content != "":
146
- matches, doc = runSpacyMatcher(query_tokens,result.content)
147
- if len(matches) != 0:
148
- st.write("Result {}".format(count+1))
149
- searchAnnotator(matches, doc)
150
-
151
  def runLexicalPreprocessingPipeline()->List[Document]:
152
  """
153
  creates the pipeline and runs the preprocessing pipeline,
@@ -177,131 +155,25 @@ def runLexicalPreprocessingPipeline()->List[Document]:
177
  "split_overlap": split_overlap}})
178
 
179
  return output_lexical_pre['documents']
180
-
181
- def runSemanticPreprocessingPipeline()->List[Document]:
182
- """
183
- creates the pipeline and runs the preprocessing pipeline,
184
- the params for pipeline are fetched from paramconfig
185
-
186
- Return
187
- --------------
188
- List[Document]: When preprocessing pipeline is run, the output dictionary
189
- has four objects. For the Haysatck implementation of semantic search we,
190
- need to use the List of Haystack Document, which can be fetched by
191
- key = 'documents' on output.
192
-
193
- """
194
- file_path = st.session_state['filepath']
195
- file_name = st.session_state['filename']
196
- semantic_processing_pipeline = processingpipeline()
197
- split_by = config.get('semantic_search','SPLIT_BY')
198
- split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
199
- split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))
200
-
201
- output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
202
- params= {"FileConverter": {"file_path": file_path, \
203
- "file_name": file_name},
204
- "UdfPreProcessor": {"removePunc": False, \
205
- "split_by": split_by, \
206
- "split_length":split_length,\
207
- "split_overlap": split_overlap}})
208
 
209
- return output_semantic_pre['documents']
210
-
211
- class QueryCheck(BaseComponent):
212
-
213
- outgoing_edges = 1
214
-
215
- def run(self, query):
216
-
217
- query_classifier = TransformersQueryClassifier(model_name_or_path=
218
- "shahrukhx01/bert-mini-finetune-question-detection")
219
-
220
-
221
- result = query_classifier.run(query=query)
222
-
223
- if result[1] == "output_1":
224
- output = {"query":query,
225
- "query_type": 'question/statement'}
226
- else:
227
- output = {"query": "find all issues related to {}".format(query),
228
- "query_type": 'statements/keyword'}
229
-
230
- return output, "output_1"
231
-
232
- def run_batch(self, query):
233
- pass
234
-
235
-
236
- def semanticSearchPipeline(documents, show_answers = False):
237
- document_store = InMemoryDocumentStore()
238
- document_store.write_documents(documents)
239
-
240
- embedding_model = config.get('semantic_search','RETRIEVER')
241
- embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
242
- embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
243
- retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
244
-
245
-
246
-
247
- querycheck = QueryCheck()
248
- retriever = EmbeddingRetriever(
249
- document_store=document_store,
250
- embedding_model=embedding_model,top_k = retriever_top_k,
251
- emb_extraction_layer=embedding_layer, scale_score =True,
252
- model_format=embedding_model_format, use_gpu = True)
253
- document_store.update_embeddings(retriever)
254
-
255
-
256
- semanticsearch_pipeline = Pipeline()
257
- semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
258
- inputs = ["Query"])
259
- semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
260
- inputs = ["QueryCheck.output_1"])
261
- if show_answers == True:
262
- reader_model = config.get('semantic_search','READER')
263
- reader_top_k = retriever_top_k
264
- reader = FARMReader(model_name_or_path=reader_model,
265
- top_k = reader_top_k, use_gpu=True)
266
-
267
- semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
268
- inputs= ["EmbeddingRetriever"])
269
-
270
- return semanticsearch_pipeline, document_store
271
-
272
- def semantic_search(query:Text,documents:List[Document],show_answers = False):
273
  """
274
  Performs the Lexical search on the List of haystack documents which is
275
  returned by preprocessing Pipeline.
276
  """
277
- threshold = 0.4
278
- semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents,
279
- show_answers=show_answers)
280
- results = semanticsearch_pipeline.run(query = query)
281
-
282
-
283
- if show_answers == False:
284
- results = results['documents']
285
- for i,queryhit in enumerate(results):
286
-
287
- if queryhit.score > threshold:
288
- st.write("\t {}: \t {}".format(i+1, queryhit.content.replace("\n", " ")))
289
- st.markdown("---")
290
-
291
- else:
292
-
293
- for answer in results['answers']:
294
- st.write(answer)
295
- matches = []
296
- doc = []
297
- if answer.score >0.01:
298
- temp = answer.to_dict()
299
- start_idx = temp['offsets_in_document'][0]['start']
300
- end_idx = temp['offsets_in_document'][0]['end']
301
-
302
- matches.append([start_idx,end_idx])
303
- doc.append(doc_store.get_document_by_id(temp['document_id']).content)
304
- searchAnnotator(matches,doc)
305
-
306
-
307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack.nodes import TfidfRetriever
 
2
  from haystack.nodes.base import BaseComponent
3
  from haystack.document_stores import InMemoryDocumentStore
4
  import configparser
 
100
 
101
  return matches, document
102
 
103
+ def lexicalsearchAnnotator(matches: List[List[int]], document):
104
  """
105
  Annotates the text in the document defined by list of [start index, end index]
106
  Example: "How are you today", if document type is text, matches = [[0,3]]
 
126
  unsafe_allow_html=True,
127
  )
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  def runLexicalPreprocessingPipeline()->List[Document]:
130
  """
131
  creates the pipeline and runs the preprocessing pipeline,
 
155
  "split_overlap": split_overlap}})
156
 
157
  return output_lexical_pre['documents']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
def lexical_search(query:Text,documents:List[Document]):
    """
    Runs a TF-IDF keyword search over the Haystack documents produced by the
    preprocessing pipeline and renders every hit that contains a query token.

    Params
    --------------
    query: the search string typed by the user.
    documents: list of Haystack Documents to search in.

    Return
    --------------
    None: results are written directly to the Streamlit page.
    """
    # Haystack retrievers can only query a document store, so the documents
    # are loaded into a throw-away in-memory store first.
    store = InMemoryDocumentStore()
    store.write_documents(documents)

    tfidf_retriever = TfidfRetriever(store)
    top_k = int(config.get('lexical_search','TOP_K'))
    hits = tfidf_retriever.retrieve(query=query, top_k=top_k)

    tokens = tokenize_lexical_query(query)
    for rank, hit in enumerate(hits, start=1):
        matches, doc = runSpacyMatcher(tokens, hit.content)
        # Hits without any token match are skipped; the displayed number
        # still reflects the retriever rank.
        if len(matches) > 0:
            st.write("Result {}".format(rank))
            lexicalsearchAnnotator(matches, doc)
+
utils/semantic_search.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from haystack.nodes import TransformersQueryClassifier
2
+ from haystack.nodes import EmbeddingRetriever, FARMReader
3
+ from haystack.nodes.base import BaseComponent
4
+ from haystack.document_stores import InMemoryDocumentStore
5
+ import configparser
6
+ import streamlit as st
7
+ from markdown import markdown
8
+ from annotated_text import annotation
9
+ from haystack.schema import Document
10
+ from typing import List, Text
11
+ from utils.preprocessing import processingpipeline
12
+ from haystack.pipelines import Pipeline
13
+
14
# Module-level settings shared by every function in this file. The file is
# opened explicitly (rather than via ConfigParser.read) so that a missing
# 'paramconfig.cfg' still raises, and inside a `with` block so the handle is
# closed as soon as parsing is done.
config = configparser.ConfigParser()
with open('paramconfig.cfg') as config_file:
    config.read_file(config_file)
16
+
17
class QueryCheck(BaseComponent):
    """
    Pipeline node that inspects the incoming query before retrieval.

    The query is passed through a TransformersQueryClassifier. When the
    classifier routes it to "output_1" the query is forwarded unchanged and
    tagged 'question/statement'; otherwise it is wrapped in the template
    "find all issues related to {}" and tagged 'statements/keyword'.
    """

    outgoing_edges = 1

    def run(self, query):
        """
        Classifies ``query`` and builds the payload for the next node.

        Return
        --------------
        tuple: (dict with the keys "query" and "query_type", "output_1").
        The node always leaves through its single edge "output_1",
        whatever the classifier decided.
        """
        # NOTE: the classifier is instantiated on every call of run().
        classifier = TransformersQueryClassifier(
            model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection")

        classification = classifier.run(query=query)
        routed_edge = classification[1]

        if routed_edge != "output_1":
            payload = {"query": "find all issues related to {}".format(query),
                       "query_type": 'statements/keyword'}
        else:
            payload = {"query": query,
                       "query_type": 'question/statement'}

        return payload, "output_1"

    def run_batch(self, query):
        """Batch mode is not implemented; the call does nothing."""
        return None
40
+
41
def runSemanticPreprocessingPipeline()->List[Document]:
    """
    Builds the preprocessing pipeline and runs it on the file referenced in
    the Streamlit session state ('filepath' / 'filename'). The splitting
    parameters are read from the 'semantic_search' section of paramconfig.

    Return
    --------------
    List[Document]: the entry stored under the key 'documents' in the
    pipeline output, i.e. the Haystack Documents used for semantic search.

    """
    file_path = st.session_state['filepath']
    file_name = st.session_state['filename']
    pipeline = processingpipeline()

    split_by = config.get('semantic_search','SPLIT_BY')
    split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
    split_overlap = int(config.get('semantic_search','SPLIT_OVERLAP'))

    # One parameter dict per pipeline node, keyed by node name below.
    converter_params = {"file_path": file_path,
                        "file_name": file_name}
    preprocessor_params = {"removePunc": False,
                           "split_by": split_by,
                           "split_length": split_length,
                           "split_overlap": split_overlap}

    pipeline_output = pipeline.run(
        file_paths=file_path,
        params={"FileConverter": converter_params,
                "UdfPreProcessor": preprocessor_params})

    return pipeline_output['documents']
70
+
71
+
72
def semanticSearchPipeline(documents, show_answers = False):
    """
    Assembles the semantic search pipeline:
    Query -> QueryCheck -> EmbeddingRetriever [-> FARMReader].

    Params
    --------------
    documents: Haystack Documents to index; they are written to a fresh
    InMemoryDocumentStore and embedded with the configured retriever.
    show_answers: when equal to True a FARMReader node is appended after
    the retriever.

    Return
    --------------
    tuple: (the assembled Pipeline, the document store holding the
    embedded documents).
    """
    store = InMemoryDocumentStore()
    store.write_documents(documents)

    # Retriever settings come from the 'semantic_search' config section.
    model_name = config.get('semantic_search','RETRIEVER')
    model_format = config.get('semantic_search','RETRIEVER_FORMAT')
    extraction_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
    top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))

    query_node = QueryCheck()
    retriever_node = EmbeddingRetriever(
        document_store=store,
        embedding_model=model_name,
        top_k=top_k,
        emb_extraction_layer=extraction_layer,
        scale_score=True,
        model_format=model_format,
        use_gpu=True)
    # Embeddings have to be computed before the store can be queried.
    store.update_embeddings(retriever_node)

    pipeline = Pipeline()
    pipeline.add_node(component=query_node, name="QueryCheck",
                      inputs=["Query"])
    pipeline.add_node(component=retriever_node, name="EmbeddingRetriever",
                      inputs=["QueryCheck.output_1"])

    if show_answers == True:
        # The reader returns as many answers as the retriever returns hits.
        reader_node = FARMReader(
            model_name_or_path=config.get('semantic_search','READER'),
            top_k=top_k,
            use_gpu=True)
        pipeline.add_node(component=reader_node, name="FARMReader",
                          inputs=["EmbeddingRetriever"])

    return pipeline, store
107
+
108
def semanticsearchAnnotator(matches: List[List[int]], document):
    """
    Renders ``document`` on the Streamlit page with every matched span
    highlighted as an "ANSWER" annotation.

    Params
    --------------
    matches: list of [start index, end index] pairs. The pairs are consumed
    in the given order, so they are presumably expected to be ascending and
    non-overlapping (verify against callers).
    document: the text to annotate. Either a plain string, in which case
    the indices are character offsets (this is what semantic_search passes:
    "How are you today" with matches = [[0,3]] highlights "How"), or an
    object whose slices expose ``.text`` such as a spaCy Doc, in which case
    the indices are token offsets.

    Return
    --------------
    None: the annotated text is written to the Streamlit page.
    """
    def _as_text(piece):
        # Slices of a str are already text; span-like slices carry it in
        # `.text`. The previous code required `.text` and therefore raised
        # AttributeError for the plain strings passed by semantic_search.
        return getattr(piece, "text", piece)

    pieces = []
    cursor = 0
    for match in matches:
        start_idx = match[0]
        end_idx = match[1]
        # Plain text between the previous match and this one ...
        pieces.append(_as_text(document[cursor:start_idx]))
        # ... followed by the highlighted match itself.
        pieces.append(str(annotation(body=_as_text(document[start_idx:end_idx]),
                                     label="ANSWER", background="#964448",
                                     color='#ffffff')))
        cursor = end_idx

    # Remainder after the last match. Using the cursor (0 when there were no
    # matches) keeps an empty `matches` list from raising on an unbound
    # end index; the whole document is then shown without highlights.
    pieces.append(_as_text(document[cursor:]))
    annotated_text = "".join(pieces)

    # NOTE(review): the document text is rendered with unsafe_allow_html,
    # so any HTML contained in the document is passed through as-is.
    st.write(
        markdown(annotated_text),
        unsafe_allow_html=True,
    )
133
+
134
+
135
def semantic_search(query:Text,documents:List[Document],show_answers = False):
    """
    Performs the semantic search on the list of Haystack documents returned
    by the preprocessing pipeline and writes the outcome to the Streamlit
    page.

    Params
    --------------
    query: the search string typed by the user.
    documents: list of Haystack Documents to search in.
    show_answers: when equal to False the retrieved passages scoring above
    the threshold are listed; otherwise the reader answers are shown and
    highlighted inside their source document.

    Return
    --------------
    None
    """
    threshold = 0.4
    pipeline, doc_store = semanticSearchPipeline(documents,
                                                 show_answers=show_answers)
    results = pipeline.run(query = query)

    if show_answers == False:
        # Retriever-only mode: list the passages above the score threshold.
        # The number shown is the retriever rank, so gaps are possible.
        for rank, hit in enumerate(results['documents'], start=1):
            if hit.score > threshold:
                st.write("\t {}: \t {}".format(rank, hit.content.replace("\n", " ")))
                st.markdown("---")
        return

    # Reader mode: print every answer object, then highlight those with a
    # non-negligible score inside the document they were extracted from.
    for answer in results['answers']:
        st.write(answer)
        if not (answer.score > 0.01):
            continue

        answer_dict = answer.to_dict()
        first_span = answer_dict['offsets_in_document'][0]
        start_idx = first_span['start']
        end_idx = first_span['end']

        source_text = doc_store.get_document_by_id(answer_dict['document_id']).content
        semanticsearchAnnotator([[start_idx,end_idx]], source_text)
170
+
171
+
172
+