prashant
committed on
Commit
•
99ae6d0
1
Parent(s):
ac18b03
semantic update
Browse files- appStore/keyword_search.py +7 -5
- paramconfig.cfg +0 -1
- utils/lexical_search.py +2 -3
- utils/semantic_search.py +180 -82
appStore/keyword_search.py
CHANGED
@@ -85,11 +85,13 @@ def app():
|
|
85 |
st.markdown("##### Top few lexical search (TFIDF) hits #####")
|
86 |
lexical_search(queryList,allDocuments['documents'])
|
87 |
else:
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
93 |
|
94 |
else:
|
95 |
st.info("🤔 No document found, please try to upload it at the sidebar!")
|
|
|
85 |
st.markdown("##### Top few lexical search (TFIDF) hits #####")
|
86 |
lexical_search(queryList,allDocuments['documents'])
|
87 |
else:
|
88 |
+
allDocuments = runSemanticPreprocessingPipeline(
|
89 |
+
st.session_state['filepath'],
|
90 |
+
st.session_state['filename'])
|
91 |
+
|
92 |
+
logging.info("starting semantic search")
|
93 |
+
with st.spinner("Performing Similar/Contextual search"):
|
94 |
+
semantic_search(queryList,allDocuments['documents'])
|
95 |
|
96 |
else:
|
97 |
st.info("🤔 No document found, please try to upload it at the sidebar!")
|
paramconfig.cfg
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
[lexical_search]
|
2 |
TOP_K = 20
|
3 |
-
THRESHOLD = 0.1
|
4 |
SPLIT_BY = sentence
|
5 |
SPLIT_LENGTH = 3
|
6 |
SPLIT_OVERLAP = 0
|
|
|
1 |
[lexical_search]
|
2 |
TOP_K = 20
|
|
|
3 |
SPLIT_BY = sentence
|
4 |
SPLIT_LENGTH = 3
|
5 |
SPLIT_OVERLAP = 0
|
utils/lexical_search.py
CHANGED
@@ -18,15 +18,14 @@ except:
|
|
18 |
pass
|
19 |
|
20 |
try:
|
21 |
-
import streamlit as st
|
22 |
-
|
23 |
except ImportError:
|
24 |
logging.info("Streamlit not installed")
|
25 |
config = configparser.ConfigParser()
|
26 |
try:
|
27 |
config.read_file(open('paramconfig.cfg'))
|
28 |
except Exception:
|
29 |
-
logging.
|
30 |
st.info("Please place the paramconfig file in the same directory as app.py")
|
31 |
|
32 |
|
|
|
18 |
pass
|
19 |
|
20 |
try:
|
21 |
+
import streamlit as st
|
|
|
22 |
except ImportError:
|
23 |
logging.info("Streamlit not installed")
|
24 |
config = configparser.ConfigParser()
|
25 |
try:
|
26 |
config.read_file(open('paramconfig.cfg'))
|
27 |
except Exception:
|
28 |
+
logging.warning("paramconfig file not found")
|
29 |
st.info("Please place the paramconfig file in the same directory as app.py")
|
30 |
|
31 |
|
utils/semantic_search.py
CHANGED
@@ -3,20 +3,41 @@ from haystack.nodes import EmbeddingRetriever, FARMReader
|
|
3 |
from haystack.nodes.base import BaseComponent
|
4 |
from haystack.document_stores import InMemoryDocumentStore
|
5 |
import configparser
|
6 |
-
import streamlit as st
|
7 |
from markdown import markdown
|
8 |
from annotated_text import annotation
|
9 |
from haystack.schema import Document
|
10 |
from typing import List, Text
|
11 |
from utils.preprocessing import processingpipeline
|
|
|
12 |
from haystack.pipelines import Pipeline
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
config = configparser.ConfigParser()
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
class QueryCheck(BaseComponent):
|
18 |
"""
|
19 |
Uses Query Classifier from Haystack, process the query based on query type
|
|
|
|
|
20 |
"""
|
21 |
|
22 |
outgoing_edges = 1
|
@@ -28,11 +49,7 @@ class QueryCheck(BaseComponent):
|
|
28 |
useful for sentence transoformers.
|
29 |
|
30 |
"""
|
31 |
-
|
32 |
-
query_classifier = TransformersQueryClassifier(model_name_or_path=
|
33 |
-
"shahrukhx01/bert-mini-finetune-question-detection")
|
34 |
-
|
35 |
-
|
36 |
result = query_classifier.run(query=query)
|
37 |
|
38 |
if result[1] == "output_1":
|
@@ -46,11 +63,20 @@ class QueryCheck(BaseComponent):
|
|
46 |
def run_batch(self, query):
|
47 |
pass
|
48 |
|
49 |
-
|
|
|
50 |
"""
|
51 |
creates the pipeline and runs the preprocessing pipeline,
|
52 |
the params for pipeline are fetched from paramconfig
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
Return
|
55 |
--------------
|
56 |
List[Document]: When preprocessing pipeline is run, the output dictionary
|
@@ -59,8 +85,7 @@ def runSemanticPreprocessingPipeline()->List[Document]:
|
|
59 |
key = 'documents' on output.
|
60 |
|
61 |
"""
|
62 |
-
|
63 |
-
file_name = st.session_state['filename']
|
64 |
semantic_processing_pipeline = processingpipeline()
|
65 |
split_by = config.get('semantic_search','SPLIT_BY')
|
66 |
split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
|
@@ -74,9 +99,48 @@ def runSemanticPreprocessingPipeline()->List[Document]:
|
|
74 |
"split_length":split_length,\
|
75 |
"split_overlap": split_overlap}})
|
76 |
|
77 |
-
return output_semantic_pre
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
|
|
|
|
80 |
def semanticSearchPipeline(documents:List[Document]):
|
81 |
"""
|
82 |
creates the semantic search pipeline and document Store object from the
|
@@ -100,73 +164,19 @@ def semanticSearchPipeline(documents:List[Document]):
|
|
100 |
list of document returned by preprocessing pipeline.
|
101 |
|
102 |
"""
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
document_store.write_documents(documents)
|
110 |
-
if 'retriever' in st.session_state:
|
111 |
-
retriever = st.session_state['retriever']
|
112 |
-
document_store.update_embeddings(retriever)
|
113 |
-
# querycheck =
|
114 |
-
|
115 |
-
|
116 |
-
# embedding_model = config.get('semantic_search','RETRIEVER')
|
117 |
-
# embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
|
118 |
-
# embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
|
119 |
-
# retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
120 |
-
# retriever = EmbeddingRetriever(
|
121 |
-
# document_store=document_store,
|
122 |
-
# embedding_model=embedding_model,top_k = retriever_top_k,
|
123 |
-
# emb_extraction_layer=embedding_layer, scale_score =True,
|
124 |
-
# model_format=embedding_model_format, use_gpu = True)
|
125 |
-
# document_store.update_embeddings(retriever)
|
126 |
-
else:
|
127 |
-
embedding_model = config.get('semantic_search','RETRIEVER')
|
128 |
-
embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
|
129 |
-
retriever = EmbeddingRetriever(
|
130 |
-
document_store=document_store,
|
131 |
-
embedding_model=embedding_model,top_k = retriever_top_k,
|
132 |
-
emb_extraction_layer=embedding_layer, scale_score =True,
|
133 |
-
model_format=embedding_model_format, use_gpu = True)
|
134 |
-
|
135 |
else:
|
136 |
-
document_store = InMemoryDocumentStore()
|
137 |
-
document_store.write_documents(documents)
|
138 |
-
|
139 |
-
embedding_model = config.get('semantic_search','RETRIEVER')
|
140 |
-
embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
|
141 |
-
embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
|
142 |
-
retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
143 |
-
|
144 |
-
|
145 |
-
retriever = EmbeddingRetriever(
|
146 |
-
document_store=document_store,
|
147 |
-
embedding_model=embedding_model,top_k = retriever_top_k,
|
148 |
-
emb_extraction_layer=embedding_layer, scale_score =True,
|
149 |
-
model_format=embedding_model_format, use_gpu = True)
|
150 |
-
st.session_state['retriever'] = retriever
|
151 |
-
document_store.update_embeddings(retriever)
|
152 |
-
st.session_state['document_store'] = document_store
|
153 |
-
querycheck = QueryCheck()
|
154 |
-
st.session_state['querycheck'] = querycheck
|
155 |
reader_model = config.get('semantic_search','READER')
|
156 |
-
reader_top_k =
|
157 |
reader = FARMReader(model_name_or_path=reader_model,
|
158 |
top_k = reader_top_k, use_gpu=True)
|
159 |
-
|
160 |
st.session_state['reader'] = reader
|
161 |
|
162 |
-
querycheck = QueryCheck()
|
163 |
-
|
164 |
-
reader_model = config.get('semantic_search','READER')
|
165 |
-
reader_top_k = retriever_top_k
|
166 |
-
reader = FARMReader(model_name_or_path=reader_model,
|
167 |
-
top_k = reader_top_k, use_gpu=True)
|
168 |
-
|
169 |
-
|
170 |
semanticsearch_pipeline = Pipeline()
|
171 |
semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
|
172 |
inputs = ["Query"])
|
@@ -174,9 +184,88 @@ def semanticSearchPipeline(documents:List[Document]):
|
|
174 |
inputs = ["QueryCheck.output_1"])
|
175 |
semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
|
176 |
inputs= ["EmbeddingRetriever"])
|
177 |
-
|
178 |
return semanticsearch_pipeline, document_store
|
179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
def semanticsearchAnnotator(matches: List[List[int]], document):
|
181 |
"""
|
182 |
Annotates the text in the document defined by list of [start index, end index]
|
@@ -191,18 +280,27 @@ def semanticsearchAnnotator(matches: List[List[int]], document):
|
|
191 |
for match in matches:
|
192 |
start_idx = match[0]
|
193 |
end_idx = match[1]
|
194 |
-
|
195 |
-
|
196 |
-
|
|
|
|
|
|
|
|
|
|
|
197 |
start = end_idx
|
198 |
|
199 |
annotated_text = annotated_text + document[end_idx:]
|
200 |
-
|
201 |
-
st.write(
|
202 |
-
markdown(annotated_text),
|
203 |
-
unsafe_allow_html=True,
|
204 |
-
)
|
205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
|
207 |
def semantic_search(query:Text,documents:List[Document]):
|
208 |
"""
|
|
|
3 |
from haystack.nodes.base import BaseComponent
|
4 |
from haystack.document_stores import InMemoryDocumentStore
|
5 |
import configparser
|
|
|
6 |
from markdown import markdown
|
7 |
from annotated_text import annotation
|
8 |
from haystack.schema import Document
|
9 |
from typing import List, Text
|
10 |
from utils.preprocessing import processingpipeline
|
11 |
+
from utils.streamlitcheck import check_streamlit
|
12 |
from haystack.pipelines import Pipeline
|
13 |
+
import logging
|
14 |
+
try:
|
15 |
+
from termcolor import colored
|
16 |
+
except:
|
17 |
+
pass
|
18 |
+
try:
|
19 |
+
import streamlit as st
|
20 |
+
except ImportError:
|
21 |
+
logging.info("Streamlit not installed")
|
22 |
# Module-level configuration: the functions in this module read their
# parameters (e.g. SPLIT_BY, RETRIEVER, RETRIEVER_TOP_K, READER) from the
# [semantic_search] section of paramconfig.cfg, resolved relative to the
# current working directory.
config = configparser.ConfigParser()
try:
    # Use a context manager so the file handle is closed once parsing is done
    # (the bare open() previously left the handle to the garbage collector).
    with open('paramconfig.cfg') as config_file:
        config.read_file(config_file)
except Exception:
    # Best effort: keep the module importable with an empty config and tell
    # the user what to do. Logged at WARNING, matching utils/lexical_search.py.
    logging.warning("paramconfig file not found")
    st.info("Please place the paramconfig file in the same directory as app.py")
|
28 |
+
|
29 |
+
|
30 |
+
@st.cache(allow_output_mutation=True)
def loadQueryClassifier():
    """
    Instantiates the Haystack TransformersQueryClassifier used to classify
    incoming queries.

    The function is wrapped in st.cache so that the classifier object can be
    reused instead of being rebuilt on every call.

    Return
    --------------
    TransformersQueryClassifier built from the
    "shahrukhx01/bert-mini-finetune-question-detection" model.

    """
    return TransformersQueryClassifier(
        model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection")
|
35 |
|
36 |
class QueryCheck(BaseComponent):
|
37 |
"""
|
38 |
Uses Query Classifier from Haystack, process the query based on query type
|
39 |
+
1. https://docs.haystack.deepset.ai/docs/query_classifier
|
40 |
+
|
41 |
"""
|
42 |
|
43 |
outgoing_edges = 1
|
|
|
49 |
useful for sentence transoformers.
|
50 |
|
51 |
"""
|
52 |
+
query_classifier = loadQueryClassifier()
|
|
|
|
|
|
|
|
|
53 |
result = query_classifier.run(query=query)
|
54 |
|
55 |
if result[1] == "output_1":
|
|
|
63 |
def run_batch(self, query):
|
64 |
pass
|
65 |
|
66 |
+
|
67 |
+
def runSemanticPreprocessingPipeline(file_path, file_name)->List[Document]:
|
68 |
"""
|
69 |
creates the pipeline and runs the preprocessing pipeline,
|
70 |
the params for pipeline are fetched from paramconfig
|
71 |
|
72 |
+
Params
|
73 |
+
------------
|
74 |
+
|
75 |
+
file_name: filename, in case of streamlit application use
|
76 |
+
st.session_state['filename']
|
77 |
+
file_path: filepath, in case of streamlit application use
|
78 |
+
st.session_state['filepath']
|
79 |
+
|
80 |
Return
|
81 |
--------------
|
82 |
List[Document]: When preprocessing pipeline is run, the output dictionary
|
|
|
85 |
key = 'documents' on output.
|
86 |
|
87 |
"""
|
88 |
+
|
|
|
89 |
semantic_processing_pipeline = processingpipeline()
|
90 |
split_by = config.get('semantic_search','SPLIT_BY')
|
91 |
split_length = int(config.get('semantic_search','SPLIT_LENGTH'))
|
|
|
99 |
"split_length":split_length,\
|
100 |
"split_overlap": split_overlap}})
|
101 |
|
102 |
+
return output_semantic_pre
|
103 |
+
|
104 |
+
|
105 |
+
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
def loadRetriever(embedding_model = None, embedding_model_format = None,
                 embedding_layer = None, retriever_top_k = 10, document_store = None):
    """
    Creates the EmbeddingRetriever bound to the given document store and
    records it in st.session_state['retriever'].

    Params
    ------------
    embedding_model: model name/path handed to EmbeddingRetriever. When None,
        this and the three settings below are all read from the
        [semantic_search] section of paramconfig (RETRIEVER, RETRIEVER_FORMAT,
        RETRIEVER_EMB_LAYER, RETRIEVER_TOP_K), overriding any values passed in.
    embedding_model_format: forwarded as model_format.
    embedding_layer: forwarded as emb_extraction_layer.
    retriever_top_k: forwarded as top_k.
    document_store: document store the retriever is attached to; required.

    Return
    --------------
    The EmbeddingRetriever, or None when no document_store was supplied.

    """
    logging.info("loading retriever")

    # Nothing can be built without a store to attach the retriever to.
    if document_store is None:
        logging.warning("Retriever initialization requires the DocumentStore")
        return None

    if embedding_model is None:
        section = 'semantic_search'
        try:
            embedding_model = config.get(section, 'RETRIEVER')
            embedding_model_format = config.get(section, 'RETRIEVER_FORMAT')
            embedding_layer = int(config.get(section, 'RETRIEVER_EMB_LAYER'))
            retriever_top_k = int(config.get(section, 'RETRIEVER_TOP_K'))
        except Exception as e:
            # NOTE(review): the failure is only reported; construction below
            # still proceeds with whichever values were set before the error
            # (possibly None). Confirm whether this should abort instead.
            logging.info(e)
            st.info(e)

    retriever_kwargs = {
        "document_store": document_store,
        "embedding_model": embedding_model,
        "model_format": embedding_model_format,
        "emb_extraction_layer": embedding_layer,
        "top_k": retriever_top_k,
        "scale_score": True,
        "use_gpu": True,
    }
    embedding_retriever = EmbeddingRetriever(**retriever_kwargs)

    # NOTE(review): this write lives inside an st.cache-decorated function;
    # presumably it is skipped when the cached result is returned -- confirm.
    st.session_state['retriever'] = embedding_retriever
    return embedding_retriever
|
131 |
+
|
132 |
+
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
def createDocumentStore(documents:List[Document], similarity:str = 'cosine'):
    """
    Creates an InMemoryDocumentStore and writes the given documents into it.

    If a retriever is already stored in st.session_state['retriever'], the
    embeddings of the new store are updated with that retriever right away.

    Params
    ------------
    documents: list of Haystack Documents to write into the store.
    similarity: similarity setting passed to InMemoryDocumentStore,
        'cosine' by default.

    Return
    --------------
    The populated InMemoryDocumentStore.

    """
    store = InMemoryDocumentStore(similarity = similarity)
    store.write_documents(documents)

    # Reuse the retriever kept in the session, when there is one.
    if 'retriever' in st.session_state:
        store.update_embeddings(st.session_state['retriever'])

    return store
|
141 |
|
142 |
+
|
143 |
+
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
|
144 |
def semanticSearchPipeline(documents:List[Document]):
|
145 |
"""
|
146 |
creates the semantic search pipeline and document Store object from the
|
|
|
164 |
list of document returned by preprocessing pipeline.
|
165 |
|
166 |
"""
|
167 |
+
document_store = createDocumentStore(documents)
|
168 |
+
retriever = loadRetriever(document_store=document_store)
|
169 |
+
document_store.update_embeddings(retriever)
|
170 |
+
querycheck = QueryCheck()
|
171 |
+
if 'reader' in st.session_state:
|
172 |
+
reader = st.session_state['reader']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
reader_model = config.get('semantic_search','READER')
|
175 |
+
reader_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
176 |
reader = FARMReader(model_name_or_path=reader_model,
|
177 |
top_k = reader_top_k, use_gpu=True)
|
|
|
178 |
st.session_state['reader'] = reader
|
179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
semanticsearch_pipeline = Pipeline()
|
181 |
semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
|
182 |
inputs = ["Query"])
|
|
|
184 |
inputs = ["QueryCheck.output_1"])
|
185 |
semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
|
186 |
inputs= ["EmbeddingRetriever"])
|
187 |
+
|
188 |
return semanticsearch_pipeline, document_store
|
189 |
|
190 |
+
|
191 |
+
|
192 |
+
# if 'document_store' in st.session_state:
|
193 |
+
# document_store = st.session_state['document_store']
|
194 |
+
# temp = document_store.get_all_documents()
|
195 |
+
# if st.session_state['filename'] != temp[0].meta['name']:
|
196 |
+
|
197 |
+
# document_store = InMemoryDocumentStore()
|
198 |
+
# document_store.write_documents(documents)
|
199 |
+
# if 'retriever' in st.session_state:
|
200 |
+
# retriever = st.session_state['retriever']
|
201 |
+
# document_store.update_embeddings(retriever)
|
202 |
+
# # querycheck =
|
203 |
+
|
204 |
+
|
205 |
+
# # embedding_model = config.get('semantic_search','RETRIEVER')
|
206 |
+
# # embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
|
207 |
+
# # embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
|
208 |
+
# # retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
209 |
+
# # retriever = EmbeddingRetriever(
|
210 |
+
# # document_store=document_store,
|
211 |
+
# # embedding_model=embedding_model,top_k = retriever_top_k,
|
212 |
+
# # emb_extraction_layer=embedding_layer, scale_score =True,
|
213 |
+
# # model_format=embedding_model_format, use_gpu = True)
|
214 |
+
# # document_store.update_embeddings(retriever)
|
215 |
+
# else:
|
216 |
+
# embedding_model = config.get('semantic_search','RETRIEVER')
|
217 |
+
# embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
|
218 |
+
# retriever = EmbeddingRetriever(
|
219 |
+
# document_store=document_store,
|
220 |
+
# embedding_model=embedding_model,top_k = retriever_top_k,
|
221 |
+
# emb_extraction_layer=embedding_layer, scale_score =True,
|
222 |
+
# model_format=embedding_model_format, use_gpu = True)
|
223 |
+
|
224 |
+
# else:
|
225 |
+
# document_store = InMemoryDocumentStore()
|
226 |
+
# document_store.write_documents(documents)
|
227 |
+
|
228 |
+
# embedding_model = config.get('semantic_search','RETRIEVER')
|
229 |
+
# embedding_model_format = config.get('semantic_search','RETRIEVER_FORMAT')
|
230 |
+
# embedding_layer = int(config.get('semantic_search','RETRIEVER_EMB_LAYER'))
|
231 |
+
# retriever_top_k = int(config.get('semantic_search','RETRIEVER_TOP_K'))
|
232 |
+
|
233 |
+
|
234 |
+
# retriever = EmbeddingRetriever(
|
235 |
+
# document_store=document_store,
|
236 |
+
# embedding_model=embedding_model,top_k = retriever_top_k,
|
237 |
+
# emb_extraction_layer=embedding_layer, scale_score =True,
|
238 |
+
# model_format=embedding_model_format, use_gpu = True)
|
239 |
+
# st.session_state['retriever'] = retriever
|
240 |
+
# document_store.update_embeddings(retriever)
|
241 |
+
# st.session_state['document_store'] = document_store
|
242 |
+
# querycheck = QueryCheck()
|
243 |
+
# st.session_state['querycheck'] = querycheck
|
244 |
+
# reader_model = config.get('semantic_search','READER')
|
245 |
+
# reader_top_k = retriever_top_k
|
246 |
+
# reader = FARMReader(model_name_or_path=reader_model,
|
247 |
+
# top_k = reader_top_k, use_gpu=True)
|
248 |
+
|
249 |
+
# st.session_state['reader'] = reader
|
250 |
+
|
251 |
+
# querycheck = QueryCheck()
|
252 |
+
|
253 |
+
# reader_model = config.get('semantic_search','READER')
|
254 |
+
# reader_top_k = retriever_top_k
|
255 |
+
# reader = FARMReader(model_name_or_path=reader_model,
|
256 |
+
# top_k = reader_top_k, use_gpu=True)
|
257 |
+
|
258 |
+
|
259 |
+
# semanticsearch_pipeline = Pipeline()
|
260 |
+
# semanticsearch_pipeline.add_node(component = querycheck, name = "QueryCheck",
|
261 |
+
# inputs = ["Query"])
|
262 |
+
# semanticsearch_pipeline.add_node(component = retriever, name = "EmbeddingRetriever",
|
263 |
+
# inputs = ["QueryCheck.output_1"])
|
264 |
+
# semanticsearch_pipeline.add_node(component = reader, name = "FARMReader",
|
265 |
+
# inputs= ["EmbeddingRetriever"])
|
266 |
+
|
267 |
+
# return semanticsearch_pipeline, document_store
|
268 |
+
|
269 |
def semanticsearchAnnotator(matches: List[List[int]], document):
|
270 |
"""
|
271 |
Annotates the text in the document defined by list of [start index, end index]
|
|
|
280 |
for match in matches:
|
281 |
start_idx = match[0]
|
282 |
end_idx = match[1]
|
283 |
+
if check_streamlit():
|
284 |
+
annotated_text = (annotated_text + document[start:start_idx]
|
285 |
+
+ str(annotation(body=document[start_idx:end_idx],
|
286 |
+
label="ANSWER", background="#964448", color='#ffffff')))
|
287 |
+
else:
|
288 |
+
annotated_text = (annotated_text + document[start:start_idx]
|
289 |
+
+ colored(document[start_idx:end_idx],
|
290 |
+
"green", attrs = ['bold']))
|
291 |
start = end_idx
|
292 |
|
293 |
annotated_text = annotated_text + document[end_idx:]
|
|
|
|
|
|
|
|
|
|
|
294 |
|
295 |
+
if check_streamlit():
|
296 |
+
|
297 |
+
st.write(
|
298 |
+
markdown(annotated_text),
|
299 |
+
unsafe_allow_html=True,
|
300 |
+
)
|
301 |
+
else:
|
302 |
+
print(annotated_text)
|
303 |
+
|
304 |
|
305 |
def semantic_search(query:Text,documents:List[Document]):
|
306 |
"""
|