Spaces:

ugaray96
/

neural-search

Running

App Files Community

ugmSorcero committed on Sep 20, 2022

Commit

5634055

1 Parent(s): 304cf45

Pipeline Function Documentation

Browse files

Files changed (3) hide show

core/pipelines.py +34 -5
interface/components.py +5 -2
interface/pages.py +2 -2

core/pipelines.py CHANGED Viewed

@@ -9,9 +9,17 @@ from haystack.nodes.preprocessor import PreProcessor
 from haystack.nodes.ranker import SentenceTransformersRanker
-def keyword_search(
-    index="documents",
-):
     document_store = InMemoryDocumentStore(index=index)
     keyword_retriever = TfidfRetriever(document_store=(document_store))
     processor = PreProcessor(
@@ -19,7 +27,7 @@ def keyword_search(
         clean_whitespace=True,
         clean_header_footer=True,
         split_by="word",
-        split_length=100,
         split_respect_sentence_boundary=True,
         split_overlap=0,
     )
@@ -42,9 +50,19 @@ def keyword_search(
 def dense_passage_retrieval(
     index="documents",
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
 ):
     document_store = InMemoryDocumentStore(index=index)
     dpr_retriever = DensePassageRetriever(
         document_store=document_store,
@@ -56,7 +74,7 @@ def dense_passage_retrieval(
         clean_whitespace=True,
         clean_header_footer=True,
         split_by="word",
-        split_length=100,
         split_respect_sentence_boundary=True,
         split_overlap=0,
     )
@@ -77,12 +95,23 @@ def dense_passage_retrieval(
 def dense_passage_retrieval_ranker(
     index="documents",
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
     ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
 ):
     search_pipeline, index_pipeline = dense_passage_retrieval(
         index=index,
         query_embedding_model=query_embedding_model,
         passage_embedding_model=passage_embedding_model,
     )

 from haystack.nodes.ranker import SentenceTransformersRanker
+def keyword_search(index="documents", split_word_length=100):
+    """
+    **Keyword Search Pipeline**
+    It looks for words in the documents that match the query by using TF-IDF.
+    TF-IDF is a commonly used baseline for information retrieval that exploits two key intuitions:
+      - Documents that have more lexical overlap with the query are more likely to be relevant
+      - Words that occur in fewer documents are more significant than words that occur in many documents
+    """
     document_store = InMemoryDocumentStore(index=index)
     keyword_retriever = TfidfRetriever(document_store=(document_store))
     processor = PreProcessor(
         clean_whitespace=True,
         clean_header_footer=True,
         split_by="word",
+        split_length=split_word_length,
         split_respect_sentence_boundary=True,
         split_overlap=0,
     )
 def dense_passage_retrieval(
     index="documents",
+    split_word_length=100,
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
 ):
+    """
+    **Dense Passage Retrieval Pipeline**
+    Dense Passage Retrieval is a highly performant retrieval method that calculates relevance using dense representations. Key features:
+      - One BERT base model to encode documents
+      - One BERT base model to encode queries
+      - Ranking of documents done by dot product similarity between query and document embeddings
+    """
     document_store = InMemoryDocumentStore(index=index)
     dpr_retriever = DensePassageRetriever(
         document_store=document_store,
         clean_whitespace=True,
         clean_header_footer=True,
         split_by="word",
+        split_length=split_word_length,
         split_respect_sentence_boundary=True,
         split_overlap=0,
     )
 def dense_passage_retrieval_ranker(
     index="documents",
+    split_word_length=100,
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
     ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
 ):
+    """
+    **Dense Passage Retrieval Ranker Pipeline**
+    It adds a Ranker to the `Dense Passage Retrieval Pipeline`.
+      - A Ranker reorders a set of Documents based on their relevance to the Query.
+      - It is particularly useful when your Retriever has high recall but poor relevance scoring.
+      - The improvement that the Ranker brings comes at the cost of some additional computation time.
+    """
     search_pipeline, index_pipeline = dense_passage_retrieval(
         index=index,
+        split_word_length=split_word_length,
         query_embedding_model=query_embedding_model,
         passage_embedding_model=passage_embedding_model,
     )

interface/components.py CHANGED Viewed

@@ -40,13 +40,16 @@ def component_select_pipeline(container):
                 "name": selected_pipeline,
                 "search_pipeline": search_pipeline,
                 "index_pipeline": index_pipeline,
             }
-def component_show_pipeline(pipeline):
     """Draw the pipeline"""
     with st.expander("Show pipeline"):
-        fig = get_pipeline_graph(pipeline)
         st.plotly_chart(fig, use_container_width=True)

                 "name": selected_pipeline,
                 "search_pipeline": search_pipeline,
                 "index_pipeline": index_pipeline,
+                "doc": pipeline_funcs[index_pipe].__doc__,
             }
+def component_show_pipeline(pipeline, pipeline_name):
     """Draw the pipeline"""
     with st.expander("Show pipeline"):
+        if pipeline["doc"] is not None:
+            st.markdown(pipeline["doc"])
+        fig = get_pipeline_graph(pipeline[pipeline_name])
         st.plotly_chart(fig, use_container_width=True)

interface/pages.py CHANGED Viewed

@@ -44,7 +44,7 @@ def page_search(container):
         ## SEARCH ##
         query = st.text_input("Query")
-        component_show_pipeline(st.session_state["pipeline"]["search_pipeline"])
         if st.button("Search"):
             st.session_state["search_results"] = search(
@@ -61,7 +61,7 @@ def page_index(container):
     with container:
         st.title("Index time!")
-        component_show_pipeline(st.session_state["pipeline"]["index_pipeline"])
         input_funcs = {
             "Raw Text": (component_text_input, "card-text"),

         ## SEARCH ##
         query = st.text_input("Query")
+        component_show_pipeline(st.session_state["pipeline"], "search_pipeline")
         if st.button("Search"):
             st.session_state["search_results"] = search(
     with container:
         st.title("Index time!")
+        component_show_pipeline(st.session_state["pipeline"], "index_pipeline")
         input_funcs = {
             "Raw Text": (component_text_input, "card-text"),