prashant committed on
Commit
3d34c75
1 Parent(s): 87b80d6

update lexical

Browse files
Files changed (3) hide show
  1. appStore/keyword_search.py +1 -0
  2. paramconfig.cfg +1 -1
  3. utils/search.py +10 -4
appStore/keyword_search.py CHANGED
@@ -75,5 +75,6 @@ def app():
75
  logging.info("performing lexical search")
76
  # token_list = tokenize_lexical_query(queryList)
77
  with st.spinner("Performing Exact matching search (Lexical search) for you"):
 
78
  lexical_search(queryList,paraList)
79
 
 
75
  logging.info("performing lexical search")
76
  # token_list = tokenize_lexical_query(queryList)
77
  with st.spinner("Performing Exact matching search (Lexical search) for you"):
78
+ st.markdown("##### Top few lexical search (TFIDF) hits #####")
79
  lexical_search(queryList,paraList)
80
 
paramconfig.cfg CHANGED
@@ -1,5 +1,5 @@
1
  [lexical_search]
2
- TOP_K = 10
3
  THRESHOLD = 0.1
4
  SPLIT_BY = sentence
5
  SPLIT_LENGTH = 3
 
1
  [lexical_search]
2
+ TOP_K = 20
3
  THRESHOLD = 0.1
4
  SPLIT_BY = sentence
5
  SPLIT_LENGTH = 3
utils/search.py CHANGED
@@ -33,7 +33,8 @@ def tokenize_lexical_query(query:str)-> List[str]:
33
 
34
  """
35
  nlp = spacy.load("en_core_web_sm")
36
- token_list = [token.text.lower() for token in nlp(query) if not (token.is_stop or token.is_punct)]
 
37
  return token_list
38
 
39
  def runSpacyMatcher(token_list:List[str], document:Text):
@@ -91,7 +92,9 @@ def runRegexMatcher(token_list:List[str], document:Text):
91
  """
92
  matches = []
93
  for token in token_list:
94
- matches = matches + [[val.start(), val.start()+ len(token)] for val in re.finditer(token, document)]
 
 
95
 
96
  return matches, document
97
 
@@ -109,7 +112,9 @@ def searchAnnotator(matches: List[List[int]], document):
109
  for match in matches:
110
  start_idx = match[0]
111
  end_idx = match[1]
112
- annotated_text = annotated_text + document[start:start_idx].text + str(annotation(body=document[start_idx:end_idx].text, label="ANSWER", background="#964448", color='#ffffff'))
 
 
113
  start = end_idx
114
 
115
  st.write(
@@ -131,8 +136,9 @@ def lexical_search(query:Text,documents:List[Document]):
131
  results = retriever.retrieve(query=query,
132
  top_k= int(config.get('lexical_search','TOP_K')))
133
  query_tokens = tokenize_lexical_query(query)
134
- for result in results:
135
  matches, doc = runSpacyMatcher(query_tokens,result.content)
 
136
  searchAnnotator(matches, doc)
137
 
138
  def runLexicalPreprocessingPipeline()->List[Document]:
 
33
 
34
  """
35
  nlp = spacy.load("en_core_web_sm")
36
+ token_list = [token.text.lower() for token in nlp(query)
37
+ if not (token.is_stop or token.is_punct)]
38
  return token_list
39
 
40
  def runSpacyMatcher(token_list:List[str], document:Text):
 
92
  """
93
  matches = []
94
  for token in token_list:
95
+ matches = (matches +
96
+ [[val.start(), val.start() +
97
+ len(token)] for val in re.finditer(token, document)])
98
 
99
  return matches, document
100
 
 
112
  for match in matches:
113
  start_idx = match[0]
114
  end_idx = match[1]
115
+ annotated_text = (annotated_text + document[start:start_idx].text
116
+ + str(annotation(body=document[start_idx:end_idx].text,
117
+ label="ANSWER", background="#964448", color='#ffffff')))
118
  start = end_idx
119
 
120
  st.write(
 
136
  results = retriever.retrieve(query=query,
137
  top_k= int(config.get('lexical_search','TOP_K')))
138
  query_tokens = tokenize_lexical_query(query)
139
+ for count, result in enumerate(results):
140
  matches, doc = runSpacyMatcher(query_tokens,result.content)
141
+ st.write("Result {}".format(count))
142
  searchAnnotator(matches, doc)
143
 
144
  def runLexicalPreprocessingPipeline()->List[Document]: