prashant committed • Commit 3d34c75 • Parent(s): 87b80d6

update lexical

Browse files:
- appStore/keyword_search.py +1 -0
- paramconfig.cfg +1 -1
- utils/search.py +10 -4
appStore/keyword_search.py CHANGED

```diff
@@ -75,5 +75,6 @@ def app():
         logging.info("performing lexical search")
         # token_list = tokenize_lexical_query(queryList)
         with st.spinner("Performing Exact matching search (Lexical search) for you"):
+            st.markdown("##### Top few lexical search (TFIDF) hits #####")
             lexical_search(queryList,paraList)
```
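The added st.markdown call gives the spinner block a visible heading before the hits render. A minimal sketch of the pattern (Streamlit's spinner is a context manager; queryList, paraList, and lexical_search are the app's own names):

```python
import streamlit as st

# Sketch only: queryList/paraList and lexical_search come from the app code.
with st.spinner("Performing Exact matching search (Lexical search) for you"):
    # Heading shown above the results written inside this block
    st.markdown("##### Top few lexical search (TFIDF) hits #####")
    # lexical_search(queryList, paraList)
```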
paramconfig.cfg CHANGED

```diff
@@ -1,5 +1,5 @@
 [lexical_search]
-TOP_K =
+TOP_K = 20
 THRESHOLD = 0.1
 SPLIT_BY = sentence
 SPLIT_LENGTH = 3
```
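TOP_K appears to have been left empty before this change, which would make the `int(config.get('lexical_search','TOP_K'))` call in utils/search.py fail; setting it to 20 caps the retriever at 20 hits. A minimal sketch of how the value is read back, assuming the config object is Python's standard configparser:

```python
import configparser

config = configparser.ConfigParser()
config.read("paramconfig.cfg")

# After this commit this returns "20"; with the previous empty value,
# int("") would raise a ValueError.
top_k = int(config.get("lexical_search", "TOP_K"))
print(top_k)  # 20
```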
utils/search.py CHANGED

```diff
@@ -33,7 +33,8 @@ def tokenize_lexical_query(query:str)-> List[str]:
 
     """
     nlp = spacy.load("en_core_web_sm")
-    token_list = [token.text.lower() for token in nlp(query)
+    token_list = [token.text.lower() for token in nlp(query)
+                  if not (token.is_stop or token.is_punct)]
     return token_list
 
 def runSpacyMatcher(token_list:List[str], document:Text):
```
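The completed comprehension now drops stop words and punctuation before matching. A small standalone sketch of the same filter (assumes the en_core_web_sm model is installed):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("What are the climate targets?")

# token.is_stop / token.is_punct are built-in spaCy token flags
tokens = [t.text.lower() for t in doc if not (t.is_stop or t.is_punct)]
print(tokens)  # ['climate', 'targets']
```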
```diff
@@ -91,7 +92,9 @@ def runRegexMatcher(token_list:List[str], document:Text):
     """
     matches = []
     for token in token_list:
-        matches = matches +
+        matches = (matches +
+                   [[val.start(), val.start() +
+                   len(token)] for val in re.finditer(token, document)])
 
     return matches, document
```
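The previously dangling `matches = matches +` is completed: each regex hit is stored as a [start, end) character-offset pair. A quick illustration of that span arithmetic using only the standard library:

```python
import re

document = "climate change and climate policy"
token = "climate"

# Same span construction as runRegexMatcher:
# [match start, match start + token length]
matches = [[m.start(), m.start() + len(token)]
           for m in re.finditer(token, document)]
print(matches)  # [[0, 7], [19, 26]]
```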
```diff
@@ -109,7 +112,9 @@ def searchAnnotator(matches: List[List[int]], document):
     for match in matches:
         start_idx = match[0]
         end_idx = match[1]
-        annotated_text = annotated_text + document[start:start_idx].text
+        annotated_text = (annotated_text + document[start:start_idx].text
+                          + str(annotation(body=document[start_idx:end_idx].text,
+                          label="ANSWER", background="#964448", color='#ffffff')))
         start = end_idx
 
     st.write(
```
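Matched spans are now wrapped in a colored highlight instead of being appended as plain text. Assuming annotation() is the helper from the st-annotated-text package (its body/label/background/color keywords match the call above), the pattern looks roughly like:

```python
import streamlit as st
from annotated_text import annotation  # assumption: st-annotated-text package

text = "net zero by 2050"
# Highlight characters 4..8 ("zero") the way searchAnnotator does
html = (text[:4]
        + str(annotation(body=text[4:8], label="ANSWER",
                         background="#964448", color="#ffffff"))
        + text[8:])
st.markdown(html, unsafe_allow_html=True)
```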
```diff
@@ -131,8 +136,9 @@ def lexical_search(query:Text,documents:List[Document]):
     results = retriever.retrieve(query=query,
                         top_k= int(config.get('lexical_search','TOP_K')))
     query_tokens = tokenize_lexical_query(query)
-    for result in results:
+    for count, result in enumerate(results):
         matches, doc = runSpacyMatcher(query_tokens,result.content)
+        st.write("Result {}".format(count))
         searchAnnotator(matches, doc)
 
 def runLexicalPreprocessingPipeline()->List[Document]:
```
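End to end, lexical_search now retrieves up to TOP_K = 20 documents and writes a numbered header before each annotated hit. A hedged usage sketch; the Haystack TfidfRetriever/InMemoryDocumentStore pairing is an assumption based on the retrieve(query=..., top_k=...) signature here and the TFIDF heading added in keyword_search.py:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever

# Assumed setup (not shown in this commit): a TF-IDF retriever over an
# in-memory store, matching the retrieve(query=..., top_k=...) call above.
document_store = InMemoryDocumentStore()
document_store.write_documents([{"content": "Lexical search finds exact matches."}])
retriever = TfidfRetriever(document_store=document_store)

results = retriever.retrieve(query="exact matches", top_k=20)
for count, result in enumerate(results):
    print("Result {}".format(count), "->", result.content)
```

Note that enumerate starts at 0, so the first hit is labeled "Result 0"; enumerate(results, start=1) would give 1-based labels if that is preferred.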