ugmSorcero committed on
Commit 101be32 • 1 Parent(s): 213d365

Adds keyword search and print pipeline

app.py CHANGED
@@ -5,6 +5,9 @@ st.set_page_config(
     page_icon="🔎",
     layout="wide",
     initial_sidebar_state="expanded",
+    menu_items={
+        'About': "https://github.com/ugm2/neural-search-demo"
+    }
 )
 
 from streamlit_option_menu import option_menu
core/pipelines.py CHANGED
@@ -5,10 +5,39 @@ Haystack Pipelines
 import tokenizers
 from haystack import Pipeline
 from haystack.document_stores import InMemoryDocumentStore
-from haystack.nodes.retriever import DensePassageRetriever
+from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
 from haystack.nodes.preprocessor import PreProcessor
 import streamlit as st
 
+@st.cache(allow_output_mutation=True)
+def keyword_search(
+    index='documents',
+):
+    document_store = InMemoryDocumentStore(index=index)
+    keyword_retriever = TfidfRetriever(document_store=(document_store))
+    processor = PreProcessor(
+        clean_empty_lines=True,
+        clean_whitespace=True,
+        clean_header_footer=True,
+        split_by="word",
+        split_length=100,
+        split_respect_sentence_boundary=True,
+        split_overlap=0,
+    )
+    # SEARCH PIPELINE
+    search_pipeline = Pipeline()
+    search_pipeline.add_node(keyword_retriever, name="TfidfRetriever", inputs=["Query"])
+
+    # INDEXING PIPELINE
+    index_pipeline = Pipeline()
+    index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
+    index_pipeline.add_node(keyword_retriever, name="TfidfRetriever", inputs=["Preprocessor"])
+    index_pipeline.add_node(
+        document_store, name="DocumentStore", inputs=["TfidfRetriever"]
+    )
+
+    return search_pipeline, index_pipeline
+
 @st.cache(hash_funcs={tokenizers.Tokenizer: lambda _: None, tokenizers.AddedToken: lambda _: None}, allow_output_mutation=True)
 def dense_passage_retrieval(
     index='documents',
@@ -42,4 +71,4 @@ def dense_passage_retrieval(
         document_store, name="DocumentStore", inputs=["DPRRetriever"]
     )
 
-    return search_pipeline, index_pipeline
+    return search_pipeline, index_pipeline
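
For context, here is a rough usage sketch of the new `keyword_search()` factory. It is not part of this commit and assumes Haystack 1.x behaviour: an indexing pipeline that starts at a `PreProcessor` can be fed `documents` directly, and query pipelines are executed with `run_batch(queries=...)`, the same call `core/search_index.py` uses.

```python
# Hypothetical usage of the new keyword_search() pipelines (not part of this commit).
from haystack import Document
from core.pipelines import keyword_search

search_pipeline, index_pipeline = keyword_search(index="documents")

# Index two raw-text snippets: Preprocessor -> TfidfRetriever -> DocumentStore.
index_pipeline.run(documents=[
    Document(content="Streamlit makes it easy to build small data apps in Python."),
    Document(content="TF-IDF ranks documents by term frequency and inverse document frequency."),
])

# Query the TF-IDF retriever. Judging by the search() changes in this commit,
# matches from this pipeline may come back with score=None.
predictions = search_pipeline.run_batch(queries=["What is TF-IDF?"])
for doc in predictions["documents"][0]:
    print(doc.content, doc.score)
```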
core/search_index.py CHANGED
@@ -26,8 +26,10 @@ def search(queries, pipeline):
     matches_queries = pipeline.run_batch(queries=queries)
     for matches in matches_queries["documents"]:
         query_results = []
+        score_is_empty = False
         for res in matches:
-            metadata = res.meta
+            if not score_is_empty:
+                score_is_empty = True if res.score is None else False
             query_results.append(
                 {
                     "text": res.content,
@@ -36,7 +38,7 @@ def search(queries, pipeline):
                     "fragment_id": res.id
                 }
             )
-        results.append(
-            sorted(query_results, key=lambda x: x["score"], reverse=True)
-        )
+        if not score_is_empty:
+            query_results = sorted(query_results, key=lambda x: x["score"], reverse=True)
+        results.append(query_results)
     return results
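
The reworked logic in `search()` amounts to: only re-sort matches by score when every match actually carries one, since the keyword pipeline can return `score=None`. A minimal standalone illustration with made-up data (not from the commit):

```python
# Sketch of the sorting guard added to search(), using hypothetical matches.
matches = [
    {"text": "doc a", "score": 0.83},
    {"text": "doc b", "score": None},  # e.g. a TF-IDF match without a score
]

score_is_empty = any(m["score"] is None for m in matches)
if not score_is_empty:
    matches = sorted(matches, key=lambda m: m["score"], reverse=True)
# When any score is None, the retriever's original ordering is kept.
```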
interface/components.py CHANGED
@@ -1,6 +1,7 @@
 import streamlit as st
 import core.pipelines as pipelines_functions
 from inspect import getmembers, isfunction
+from networkx.drawing.nx_agraph import to_agraph
 
 def component_select_pipeline(container):
     pipeline_names, pipeline_funcs = list(zip(*getmembers(pipelines_functions, isfunction)))
@@ -8,7 +9,8 @@ def component_select_pipeline(container):
     with container:
         selected_pipeline = st.selectbox(
             'Select pipeline',
-            pipeline_names
+            pipeline_names,
+            index=pipeline_names.index('Keyword Search') if 'Keyword Search' in pipeline_names else 0
         )
         st.session_state['search_pipeline'], \
         st.session_state['index_pipeline'] = \
@@ -16,8 +18,10 @@ def component_select_pipeline(container):
 
 def component_show_pipeline(container, pipeline):
     """Draw the pipeline"""
-    with container:
-        pass
+    with st.expander('Show pipeline'):
+        graphviz = to_agraph(pipeline.graph)
+        graphviz.layout("dot")
+        st.graphviz_chart(graphviz.string())
 
 def component_show_search_result(container, results):
     with container:
@@ -25,7 +29,8 @@ def component_show_search_result(container, results):
             st.markdown(f"### Match {idx+1}")
             st.markdown(f"**Text**: {document['text']}")
             st.markdown(f"**Document**: {document['id']}")
-            st.markdown(f"**Score**: {document['score']:.3f}")
+            if document['score'] is not None:
+                st.markdown(f"**Score**: {document['score']:.3f}")
             st.markdown("---")
 
 def component_text_input(container):
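
`component_show_pipeline` now draws the Haystack pipeline's underlying networkx graph through pygraphviz and `st.graphviz_chart`. The same conversion works outside Streamlit for a quick preview; a small sketch (not part of this commit), assuming pygraphviz and the Graphviz binaries are installed:

```python
# Sketch: render a pipeline graph to a PNG outside the Streamlit app.
from networkx.drawing.nx_agraph import to_agraph
from core.pipelines import keyword_search

search_pipeline, _ = keyword_search()
agraph = to_agraph(search_pipeline.graph)  # Haystack pipelines expose a networkx DiGraph
agraph.layout("dot")                       # same layout engine the component uses
agraph.draw("search_pipeline.png")         # or agraph.string() for the raw DOT source
```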
interface/pages.py CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 from streamlit_option_menu import option_menu
 from core.search_index import index, search
-from interface.components import component_show_search_result, component_text_input
+from interface.components import component_show_pipeline, component_show_search_result, component_text_input
 
 def page_landing_page(container):
     with container:
@@ -18,7 +18,6 @@ def page_landing_page(container):
         )
         st.markdown(
             "TODO list:"
-            "\n - Option to print pipeline structure on page"
             "\n - Build other pipelines"
             "\n - Include file/url indexing"
             "\n - [Optional] Include text to audio to read responses"
@@ -31,6 +30,8 @@ def page_search(container):
         ## SEARCH ##
         query = st.text_input("Query")
 
+        component_show_pipeline(container, st.session_state['search_pipeline'])
+
         if st.button("Search"):
            st.session_state['search_results'] = search(
                queries=[query],
@@ -46,6 +47,8 @@ def page_index(container):
     with container:
         st.title("Index time!")
 
+        component_show_pipeline(container, st.session_state['index_pipeline'])
+
         input_funcs = {
             "Raw Text": (component_text_input, "card-text"),
         }
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 streamlit
 streamlit_option_menu
-farm-haystack
\ No newline at end of file
+farm-haystack
+pygraphviz
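
Note: `pygraphviz` is only the Python binding; it needs the Graphviz C libraries at build time. On Hugging Face Spaces that typically means also listing the system packages (e.g. `graphviz` and `libgraphviz-dev`) in a `packages.txt`, which is not part of this commit.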