ugmSorcero committed on
Commit
5634055
1 Parent(s): 304cf45

Pipeline Function Documentation

Browse files
core/pipelines.py CHANGED
@@ -9,9 +9,17 @@ from haystack.nodes.preprocessor import PreProcessor
9
  from haystack.nodes.ranker import SentenceTransformersRanker
10
 
11
 
12
- def keyword_search(
13
- index="documents",
14
- ):
 
 
 
 
 
 
 
 
15
  document_store = InMemoryDocumentStore(index=index)
16
  keyword_retriever = TfidfRetriever(document_store=(document_store))
17
  processor = PreProcessor(
@@ -19,7 +27,7 @@ def keyword_search(
19
  clean_whitespace=True,
20
  clean_header_footer=True,
21
  split_by="word",
22
- split_length=100,
23
  split_respect_sentence_boundary=True,
24
  split_overlap=0,
25
  )
@@ -42,9 +50,19 @@ def keyword_search(
42
 
43
  def dense_passage_retrieval(
44
  index="documents",
 
45
  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
46
  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
47
  ):
 
 
 
 
 
 
 
 
 
48
  document_store = InMemoryDocumentStore(index=index)
49
  dpr_retriever = DensePassageRetriever(
50
  document_store=document_store,
@@ -56,7 +74,7 @@ def dense_passage_retrieval(
56
  clean_whitespace=True,
57
  clean_header_footer=True,
58
  split_by="word",
59
- split_length=100,
60
  split_respect_sentence_boundary=True,
61
  split_overlap=0,
62
  )
@@ -77,12 +95,23 @@ def dense_passage_retrieval(
77
 
78
  def dense_passage_retrieval_ranker(
79
  index="documents",
 
80
  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
81
  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
82
  ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
83
  ):
 
 
 
 
 
 
 
 
 
84
  search_pipeline, index_pipeline = dense_passage_retrieval(
85
  index=index,
 
86
  query_embedding_model=query_embedding_model,
87
  passage_embedding_model=passage_embedding_model,
88
  )
 
9
  from haystack.nodes.ranker import SentenceTransformersRanker
10
 
11
 
12
+ def keyword_search(index="documents", split_word_length=100):
13
+ """
14
+ **Keyword Search Pipeline**
15
+
16
+ It looks for words in the documents that match the query by using TF-IDF.
17
+
18
+ TF-IDF is a commonly used baseline for information retrieval that exploits two key intuitions:
19
+
20
+ - Documents that have more lexical overlap with the query are more likely to be relevant
21
+ - Words that occur in fewer documents are more significant than words that occur in many documents
22
+ """
23
  document_store = InMemoryDocumentStore(index=index)
24
  keyword_retriever = TfidfRetriever(document_store=(document_store))
25
  processor = PreProcessor(
 
27
  clean_whitespace=True,
28
  clean_header_footer=True,
29
  split_by="word",
30
+ split_length=split_word_length,
31
  split_respect_sentence_boundary=True,
32
  split_overlap=0,
33
  )
 
50
 
51
  def dense_passage_retrieval(
52
  index="documents",
53
+ split_word_length=100,
54
  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
55
  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
56
  ):
57
+ """
58
+ **Dense Passage Retrieval Pipeline**
59
+
60
+ Dense Passage Retrieval is a highly performant retrieval method that calculates relevance using dense representations. Key features:
61
+
62
+ - One BERT base model to encode documents
63
+ - One BERT base model to encode queries
64
+ - Ranking of documents done by dot product similarity between query and document embeddings
65
+ """
66
  document_store = InMemoryDocumentStore(index=index)
67
  dpr_retriever = DensePassageRetriever(
68
  document_store=document_store,
 
74
  clean_whitespace=True,
75
  clean_header_footer=True,
76
  split_by="word",
77
+ split_length=split_word_length,
78
  split_respect_sentence_boundary=True,
79
  split_overlap=0,
80
  )
 
95
 
96
  def dense_passage_retrieval_ranker(
97
  index="documents",
98
+ split_word_length=100,
99
  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
100
  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
101
  ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
102
  ):
103
+ """
104
+ **Dense Passage Retrieval Ranker Pipeline**
105
+
106
+ It adds a Ranker to the `Dense Passage Retrieval Pipeline`.
107
+
108
+ - A Ranker reorders a set of Documents based on their relevance to the Query.
109
+ - It is particularly useful when your Retriever has high recall but poor relevance scoring.
110
+ - The improvement that the Ranker brings comes at the cost of some additional computation time.
111
+ """
112
  search_pipeline, index_pipeline = dense_passage_retrieval(
113
  index=index,
114
+ split_word_length=split_word_length,
115
  query_embedding_model=query_embedding_model,
116
  passage_embedding_model=passage_embedding_model,
117
  )
interface/components.py CHANGED
@@ -40,13 +40,16 @@ def component_select_pipeline(container):
40
  "name": selected_pipeline,
41
  "search_pipeline": search_pipeline,
42
  "index_pipeline": index_pipeline,
 
43
  }
44
 
45
 
46
- def component_show_pipeline(pipeline):
47
  """Draw the pipeline"""
48
  with st.expander("Show pipeline"):
49
- fig = get_pipeline_graph(pipeline)
 
 
50
  st.plotly_chart(fig, use_container_width=True)
51
 
52
 
 
40
  "name": selected_pipeline,
41
  "search_pipeline": search_pipeline,
42
  "index_pipeline": index_pipeline,
43
+ "doc": pipeline_funcs[index_pipe].__doc__,
44
  }
45
 
46
 
47
+ def component_show_pipeline(pipeline, pipeline_name):
48
  """Draw the pipeline"""
49
  with st.expander("Show pipeline"):
50
+ if pipeline["doc"] is not None:
51
+ st.markdown(pipeline["doc"])
52
+ fig = get_pipeline_graph(pipeline[pipeline_name])
53
  st.plotly_chart(fig, use_container_width=True)
54
 
55
 
interface/pages.py CHANGED
@@ -44,7 +44,7 @@ def page_search(container):
44
  ## SEARCH ##
45
  query = st.text_input("Query")
46
 
47
- component_show_pipeline(st.session_state["pipeline"]["search_pipeline"])
48
 
49
  if st.button("Search"):
50
  st.session_state["search_results"] = search(
@@ -61,7 +61,7 @@ def page_index(container):
61
  with container:
62
  st.title("Index time!")
63
 
64
- component_show_pipeline(st.session_state["pipeline"]["index_pipeline"])
65
 
66
  input_funcs = {
67
  "Raw Text": (component_text_input, "card-text"),
 
44
  ## SEARCH ##
45
  query = st.text_input("Query")
46
 
47
+ component_show_pipeline(st.session_state["pipeline"], "search_pipeline")
48
 
49
  if st.button("Search"):
50
  st.session_state["search_results"] = search(
 
61
  with container:
62
  st.title("Index time!")
63
 
64
+ component_show_pipeline(st.session_state["pipeline"], "index_pipeline")
65
 
66
  input_funcs = {
67
  "Raw Text": (component_text_input, "card-text"),