Spaces commit
ugmSorcero committed
Commit: 101be32
Parent(s): 213d365

Adds keyword search and print pipeline

Files changed:
- app.py +3 -0
- core/pipelines.py +31 -2
- core/search_index.py +6 -4
- interface/components.py +9 -4
- interface/pages.py +5 -2
- requirements.txt +2 -1
app.py
CHANGED
@@ -5,6 +5,9 @@ st.set_page_config(
     page_icon="π",
     layout="wide",
     initial_sidebar_state="expanded",
+    menu_items={
+        'About': "https://github.com/ugm2/neural-search-demo"
+    }
 )
 
 from streamlit_option_menu import option_menu
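
For context, st.set_page_config must be the first Streamlit call in the script and menu_items is one of its parameters, which is why the dictionary is added inside the existing call rather than later on. A minimal, self-contained sketch of the same pattern (the page title is an assumed placeholder, not taken from the diff):

# Minimal sketch of the menu_items pattern added to app.py.
# page_title below is an assumption for illustration; the full app.py is not shown here.
import streamlit as st

st.set_page_config(
    page_title="Neural Search Demo",   # assumed placeholder title
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        # Rendered under the app's hamburger menu, entry "About"
        'About': "https://github.com/ugm2/neural-search-demo"
    },
)

st.write("Page configured.")
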
core/pipelines.py
CHANGED
@@ -5,10 +5,39 @@ Haystack Pipelines
 import tokenizers
 from haystack import Pipeline
 from haystack.document_stores import InMemoryDocumentStore
-from haystack.nodes.retriever import DensePassageRetriever
+from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
 from haystack.nodes.preprocessor import PreProcessor
 import streamlit as st
 
+@st.cache(allow_output_mutation=True)
+def keyword_search(
+    index='documents',
+):
+    document_store = InMemoryDocumentStore(index=index)
+    keyword_retriever = TfidfRetriever(document_store=(document_store))
+    processor = PreProcessor(
+        clean_empty_lines=True,
+        clean_whitespace=True,
+        clean_header_footer=True,
+        split_by="word",
+        split_length=100,
+        split_respect_sentence_boundary=True,
+        split_overlap=0,
+    )
+    # SEARCH PIPELINE
+    search_pipeline = Pipeline()
+    search_pipeline.add_node(keyword_retriever, name="TfidfRetriever", inputs=["Query"])
+
+    # INDEXING PIPELINE
+    index_pipeline = Pipeline()
+    index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
+    index_pipeline.add_node(keyword_retriever, name="TfidfRetriever", inputs=["Preprocessor"])
+    index_pipeline.add_node(
+        document_store, name="DocumentStore", inputs=["TfidfRetriever"]
+    )
+
+    return search_pipeline, index_pipeline
+
 @st.cache(hash_funcs={tokenizers.Tokenizer: lambda _: None, tokenizers.AddedToken: lambda _: None}, allow_output_mutation=True)
 def dense_passage_retrieval(
     index='documents',
@@ -42,4 +71,4 @@ def dense_passage_retrieval(
         document_store, name="DocumentStore", inputs=["DPRRetriever"]
     )
 
-    return search_pipeline, index_pipeline
+    return search_pipeline, index_pipeline
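
To show how the two pipelines returned by the new keyword_search() fit together, here is a hedged usage sketch outside Streamlit. The documents, meta fields, and top_k value are invented for illustration, and depending on the farm-haystack 1.x version the TfidfRetriever may need an explicit fit() after indexing before it can answer queries:

# Illustrative only: exercising the keyword_search() pipelines added in this commit.
# Assumes farm-haystack 1.x; document texts, meta and parameters are made up.
from haystack.schema import Document

from core.pipelines import keyword_search

search_pipeline, index_pipeline = keyword_search(index="documents")

# Index two toy documents through Preprocessor -> TfidfRetriever -> DocumentStore.
docs = [
    Document(content="Streamlit makes it easy to build small data apps.", meta={"id": "doc_1"}),
    Document(content="Haystack pipelines support both keyword and dense retrieval.", meta={"id": "doc_2"}),
]
index_pipeline.run(documents=docs)

# Query the search pipeline; TfidfRetriever ranks documents by TF-IDF similarity.
result = search_pipeline.run(
    query="keyword retrieval pipeline",
    params={"TfidfRetriever": {"top_k": 2}},  # top_k chosen arbitrarily for this sketch
)
for doc in result["documents"]:
    print(doc.meta.get("id"), "->", doc.content)
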
core/search_index.py
CHANGED
@@ -26,8 +26,10 @@ def search(queries, pipeline):
     matches_queries = pipeline.run_batch(queries=queries)
     for matches in matches_queries["documents"]:
         query_results = []
+        score_is_empty = False
         for res in matches:
-
+            if not score_is_empty:
+                score_is_empty = True if res.score is None else False
             query_results.append(
                 {
                     "text": res.content,
@@ -36,7 +38,7 @@ def search(queries, pipeline):
                     "fragment_id": res.id
                 }
             )
-        results.append(
-            sorted(query_results, key=lambda x: x["score"], reverse=True)
-        )
+        if not score_is_empty:
+            query_results = sorted(query_results, key=lambda x: x["score"], reverse=True)
+        results.append(query_results)
     return results
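
The new score_is_empty flag is there because a keyword retriever can return documents whose score is None, and sorting on a None key would raise a TypeError; results are therefore only re-sorted when every match carries a numeric score. The same guard in isolation, as a small sketch with made-up dictionaries:

# Standalone illustration of the score guard added to search(); the data is invented.
matches = [
    {"text": "doc A", "score": 0.42},
    {"text": "doc B", "score": None},   # e.g. a retriever that does not assign scores
    {"text": "doc C", "score": 0.87},
]

# Equivalent to flipping score_is_empty inside the loop above.
score_is_empty = any(m["score"] is None for m in matches)

query_results = [{"text": m["text"], "score": m["score"]} for m in matches]
if not score_is_empty:
    # Safe to sort: every match has a numeric score.
    query_results = sorted(query_results, key=lambda x: x["score"], reverse=True)

print(query_results)
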
interface/components.py
CHANGED
@@ -1,6 +1,7 @@
 import streamlit as st
 import core.pipelines as pipelines_functions
 from inspect import getmembers, isfunction
+from networkx.drawing.nx_agraph import to_agraph
 
 def component_select_pipeline(container):
     pipeline_names, pipeline_funcs = list(zip(*getmembers(pipelines_functions, isfunction)))
@@ -8,7 +9,8 @@ def component_select_pipeline(container):
     with container:
         selected_pipeline = st.selectbox(
             'Select pipeline',
-            pipeline_names
+            pipeline_names,
+            index=pipeline_names.index('Keyword Search') if 'Keyword Search' in pipeline_names else 0
         )
         st.session_state['search_pipeline'], \
         st.session_state['index_pipeline'] = \
@@ -16,8 +18,10 @@ def component_select_pipeline(container):
 
 def component_show_pipeline(container, pipeline):
     """Draw the pipeline"""
-    with
-
+    with st.expander('Show pipeline'):
+        graphviz = to_agraph(pipeline.graph)
+        graphviz.layout("dot")
+        st.graphviz_chart(graphviz.string())
 
 def component_show_search_result(container, results):
     with container:
@@ -25,7 +29,8 @@ def component_show_search_result(container, results):
             st.markdown(f"### Match {idx+1}")
             st.markdown(f"**Text**: {document['text']}")
             st.markdown(f"**Document**: {document['id']}")
-
+            if document['score'] is not None:
+                st.markdown(f"**Score**: {document['score']:.3f}")
             st.markdown("---")
 
 def component_text_input(container):
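
The drawing approach in component_show_pipeline is independent of Haystack: any networkx directed graph can be converted to a pygraphviz AGraph, laid out with dot, and handed to st.graphviz_chart as DOT source. A minimal sketch with an invented graph standing in for pipeline.graph:

# Minimal sketch of the pipeline-drawing path: networkx graph -> pygraphviz AGraph
# -> DOT string -> st.graphviz_chart. The nodes below are a stand-in for pipeline.graph.
import networkx as nx
import streamlit as st
from networkx.drawing.nx_agraph import to_agraph

graph = nx.DiGraph()
graph.add_edge("Query", "TfidfRetriever")
graph.add_edge("File", "Preprocessor")
graph.add_edge("Preprocessor", "TfidfRetriever")
graph.add_edge("TfidfRetriever", "DocumentStore")

agraph = to_agraph(graph)            # requires pygraphviz (see requirements.txt change)
agraph.layout("dot")                 # top-down layout, as in the commit
st.graphviz_chart(agraph.string())   # graphviz_chart accepts DOT source as a string
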
interface/pages.py
CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 from streamlit_option_menu import option_menu
 from core.search_index import index, search
-from interface.components import component_show_search_result, component_text_input
+from interface.components import component_show_pipeline, component_show_search_result, component_text_input
 
 def page_landing_page(container):
     with container:
@@ -18,7 +18,6 @@ def page_landing_page(container):
         )
         st.markdown(
             "TODO list:"
-            "\n - Option to print pipeline structure on page"
             "\n - Build other pipelines"
             "\n - Include file/url indexing"
             "\n - [Optional] Include text to audio to read responses"
@@ -31,6 +30,8 @@ def page_search(container):
         ## SEARCH ##
         query = st.text_input("Query")
 
+        component_show_pipeline(container, st.session_state['search_pipeline'])
+
         if st.button("Search"):
             st.session_state['search_results'] = search(
                 queries=[query],
@@ -46,6 +47,8 @@ def page_index(container):
     with container:
         st.title("Index time!")
 
+        component_show_pipeline(container, st.session_state['index_pipeline'])
+
         input_funcs = {
             "Raw Text": (component_text_input, "card-text"),
         }
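
Putting the pages.py changes in context, here is a stripped-down, hedged sketch of what page_search now wires together: preview the cached pipeline, then run the query through core.search_index.search. The page title and the omitted result-rendering step are assumptions for illustration; only the session-state keys and the search call come from this commit:

# Hedged sketch of the page_search flow after this commit (not the full page).
import streamlit as st

from core.search_index import search
from interface.components import component_show_pipeline

def page_search(container):
    with container:
        st.title("Search")                      # placeholder title, not from the diff
        query = st.text_input("Query")

        # New in this commit: draw the selected pipeline's graph inside an expander.
        component_show_pipeline(container, st.session_state['search_pipeline'])

        if st.button("Search"):
            st.session_state['search_results'] = search(
                queries=[query],
                pipeline=st.session_state['search_pipeline'],
            )
        # Rendering of st.session_state['search_results'] via
        # component_show_search_result is omitted in this sketch.
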
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
 streamlit
 streamlit_option_menu
-farm-haystack
+farm-haystack
+pygraphviz