prashant committed • Commit 3d34c75 • Parent(s): 87b80d6

update lexical

Browse files:
- appStore/keyword_search.py +1 -0
- paramconfig.cfg +1 -1
- utils/search.py +10 -4
appStore/keyword_search.py CHANGED

```diff
@@ -75,5 +75,6 @@ def app():
         logging.info("performing lexical search")
         # token_list = tokenize_lexical_query(queryList)
         with st.spinner("Performing Exact matching search (Lexical search) for you"):
+            st.markdown("##### Top few lexical search (TFIDF) hits #####")
             lexical_search(queryList,paraList)
```
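The added st.markdown call gives the spinner block a visible heading before the hits render. A minimal sketch of the pattern (Streamlit's spinner is a context manager; queryList, paraList, and lexical_search are the app's own names):

```python
import streamlit as st

# Sketch only: queryList/paraList and lexical_search come from the app code.
with st.spinner("Performing Exact matching search (Lexical search) for you"):
    # Heading shown above the results written inside this block
    st.markdown("##### Top few lexical search (TFIDF) hits #####")
    # lexical_search(queryList, paraList)
```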
paramconfig.cfg CHANGED

```diff
@@ -1,5 +1,5 @@
 [lexical_search]
-TOP_K =
+TOP_K = 20
 THRESHOLD = 0.1
 SPLIT_BY = sentence
 SPLIT_LENGTH = 3
```
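TOP_K appears to have been left empty before this change, which would make the `int(config.get('lexical_search','TOP_K'))` call in utils/search.py fail; setting it to 20 caps the retriever at 20 hits. A minimal sketch of how the value is read back, assuming the config object is Python's standard configparser:

```python
import configparser

config = configparser.ConfigParser()
config.read("paramconfig.cfg")

# After this commit this returns "20"; with the previous empty value,
# int("") would raise a ValueError.
top_k = int(config.get("lexical_search", "TOP_K"))
print(top_k)  # 20
```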
utils/search.py CHANGED

```diff
@@ -33,7 +33,8 @@ def tokenize_lexical_query(query:str)-> List[str]:
 
     """
     nlp = spacy.load("en_core_web_sm")
-    token_list = [token.text.lower() for token in nlp(query)
+    token_list = [token.text.lower() for token in nlp(query)
+                  if not (token.is_stop or token.is_punct)]
     return token_list
 
 def runSpacyMatcher(token_list:List[str], document:Text):
```
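The completed comprehension now drops stop words and punctuation before matching. A small standalone sketch of the same filter (assumes the en_core_web_sm model is installed):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("What are the climate targets?")

# token.is_stop / token.is_punct are built-in spaCy token flags
tokens = [t.text.lower() for t in doc if not (t.is_stop or t.is_punct)]
print(tokens)  # ['climate', 'targets']
```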
```diff
@@ -91,7 +92,9 @@ def runRegexMatcher(token_list:List[str], document:Text):
     """
     matches = []
     for token in token_list:
-        matches = matches +
+        matches = (matches +
+                   [[val.start(), val.start() +
+                   len(token)] for val in re.finditer(token, document)])
 
     return matches, document
```
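The previously dangling `matches = matches +` is completed: each regex hit is stored as a [start, end) character-offset pair. A quick illustration of that span arithmetic using only the standard library:

```python
import re

document = "climate change and climate policy"
token = "climate"

# Same span construction as runRegexMatcher:
# [match start, match start + token length]
matches = [[m.start(), m.start() + len(token)]
           for m in re.finditer(token, document)]
print(matches)  # [[0, 7], [19, 26]]
```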
```diff
@@ -109,7 +112,9 @@ def searchAnnotator(matches: List[List[int]], document):
     for match in matches:
         start_idx = match[0]
         end_idx = match[1]
-        annotated_text = annotated_text + document[start:start_idx].text
+        annotated_text = (annotated_text + document[start:start_idx].text
+                          + str(annotation(body=document[start_idx:end_idx].text,
+                          label="ANSWER", background="#964448", color='#ffffff')))
         start = end_idx
 
     st.write(
```
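Matched spans are now wrapped in a colored highlight instead of being appended as plain text. Assuming annotation() is the helper from the st-annotated-text package (its body/label/background/color keywords match the call above), the pattern looks roughly like:

```python
import streamlit as st
from annotated_text import annotation  # assumption: st-annotated-text package

text = "net zero by 2050"
# Highlight characters 4..8 ("zero") the way searchAnnotator does
html = (text[:4]
        + str(annotation(body=text[4:8], label="ANSWER",
                         background="#964448", color="#ffffff"))
        + text[8:])
st.markdown(html, unsafe_allow_html=True)
```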
```diff
@@ -131,8 +136,9 @@ def lexical_search(query:Text,documents:List[Document]):
     results = retriever.retrieve(query=query,
                         top_k= int(config.get('lexical_search','TOP_K')))
     query_tokens = tokenize_lexical_query(query)
-    for result in results:
+    for count, result in enumerate(results):
         matches, doc = runSpacyMatcher(query_tokens,result.content)
+        st.write("Result {}".format(count))
         searchAnnotator(matches, doc)
 
 def runLexicalPreprocessingPipeline()->List[Document]:
```
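End to end, lexical_search now retrieves up to TOP_K = 20 documents and writes a numbered header before each annotated hit. A hedged usage sketch; the Haystack TfidfRetriever/InMemoryDocumentStore pairing is an assumption based on the retrieve(query=..., top_k=...) signature here and the TFIDF heading added in keyword_search.py:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever

# Assumed setup (not shown in this commit): a TF-IDF retriever over an
# in-memory store, matching the retrieve(query=..., top_k=...) call above.
document_store = InMemoryDocumentStore()
document_store.write_documents([{"content": "Lexical search finds exact matches."}])
retriever = TfidfRetriever(document_store=document_store)

results = retriever.retrieve(query="exact matches", top_k=20)
for count, result in enumerate(results):
    print("Result {}".format(count), "->", result.content)
```

Note that enumerate starts at 0, so the first hit is labeled "Result 0"; enumerate(results, start=1) would give 1-based labels if that is preferred.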