Spaces:
Runtime error
Runtime error
ugmSorcero
committed on
Commit
•
5634055
1
Parent(s):
304cf45
Pipeline Function Documentation
Browse files
- core/pipelines.py +34 -5
- interface/components.py +5 -2
- interface/pages.py +2 -2
core/pipelines.py
CHANGED
@@ -9,9 +9,17 @@ from haystack.nodes.preprocessor import PreProcessor
|
|
9 |
from haystack.nodes.ranker import SentenceTransformersRanker
|
10 |
|
11 |
|
12 |
-
def keyword_search(
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
document_store = InMemoryDocumentStore(index=index)
|
16 |
keyword_retriever = TfidfRetriever(document_store=(document_store))
|
17 |
processor = PreProcessor(
|
@@ -19,7 +27,7 @@ def keyword_search(
|
|
19 |
clean_whitespace=True,
|
20 |
clean_header_footer=True,
|
21 |
split_by="word",
|
22 |
-
split_length=
|
23 |
split_respect_sentence_boundary=True,
|
24 |
split_overlap=0,
|
25 |
)
|
@@ -42,9 +50,19 @@ def keyword_search(
|
|
42 |
|
43 |
def dense_passage_retrieval(
|
44 |
index="documents",
|
|
|
45 |
query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
|
46 |
passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
|
47 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
document_store = InMemoryDocumentStore(index=index)
|
49 |
dpr_retriever = DensePassageRetriever(
|
50 |
document_store=document_store,
|
@@ -56,7 +74,7 @@ def dense_passage_retrieval(
|
|
56 |
clean_whitespace=True,
|
57 |
clean_header_footer=True,
|
58 |
split_by="word",
|
59 |
-
split_length=
|
60 |
split_respect_sentence_boundary=True,
|
61 |
split_overlap=0,
|
62 |
)
|
@@ -77,12 +95,23 @@ def dense_passage_retrieval(
|
|
77 |
|
78 |
def dense_passage_retrieval_ranker(
|
79 |
index="documents",
|
|
|
80 |
query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
|
81 |
passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
|
82 |
ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
|
83 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
search_pipeline, index_pipeline = dense_passage_retrieval(
|
85 |
index=index,
|
|
|
86 |
query_embedding_model=query_embedding_model,
|
87 |
passage_embedding_model=passage_embedding_model,
|
88 |
)
|
|
|
9 |
from haystack.nodes.ranker import SentenceTransformersRanker
|
10 |
|
11 |
|
12 |
+
def keyword_search(index="documents", split_word_length=100):
|
13 |
+
"""
|
14 |
+
**Keyword Search Pipeline**
|
15 |
+
|
16 |
+
It looks for words in the documents that match the query by using TF-IDF.
|
17 |
+
|
18 |
+
TF-IDF is a commonly used baseline for information retrieval that exploits two key intuitions:
|
19 |
+
|
20 |
+
- Documents that have more lexical overlap with the query are more likely to be relevant
|
21 |
+
- Words that occur in fewer documents are more significant than words that occur in many documents
|
22 |
+
"""
|
23 |
document_store = InMemoryDocumentStore(index=index)
|
24 |
keyword_retriever = TfidfRetriever(document_store=(document_store))
|
25 |
processor = PreProcessor(
|
|
|
27 |
clean_whitespace=True,
|
28 |
clean_header_footer=True,
|
29 |
split_by="word",
|
30 |
+
split_length=split_word_length,
|
31 |
split_respect_sentence_boundary=True,
|
32 |
split_overlap=0,
|
33 |
)
|
|
|
50 |
|
51 |
def dense_passage_retrieval(
|
52 |
index="documents",
|
53 |
+
split_word_length=100,
|
54 |
query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
|
55 |
passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
|
56 |
):
|
57 |
+
"""
|
58 |
+
**Dense Passage Retrieval Pipeline**
|
59 |
+
|
60 |
+
Dense Passage Retrieval is a highly performant retrieval method that calculates relevance using dense representations. Key features:
|
61 |
+
|
62 |
+
- One BERT base model to encode documents
|
63 |
+
- One BERT base model to encode queries
|
64 |
+
- Ranking of documents done by dot product similarity between query and document embeddings
|
65 |
+
"""
|
66 |
document_store = InMemoryDocumentStore(index=index)
|
67 |
dpr_retriever = DensePassageRetriever(
|
68 |
document_store=document_store,
|
|
|
74 |
clean_whitespace=True,
|
75 |
clean_header_footer=True,
|
76 |
split_by="word",
|
77 |
+
split_length=split_word_length,
|
78 |
split_respect_sentence_boundary=True,
|
79 |
split_overlap=0,
|
80 |
)
|
|
|
95 |
|
96 |
def dense_passage_retrieval_ranker(
|
97 |
index="documents",
|
98 |
+
split_word_length=100,
|
99 |
query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
|
100 |
passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
|
101 |
ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
|
102 |
):
|
103 |
+
"""
|
104 |
+
**Dense Passage Retrieval Ranker Pipeline**
|
105 |
+
|
106 |
+
It adds a Ranker to the `Dense Passage Retrieval Pipeline`.
|
107 |
+
|
108 |
+
- A Ranker reorders a set of Documents based on their relevance to the Query.
|
109 |
+
- It is particularly useful when your Retriever has high recall but poor relevance scoring.
|
110 |
+
- The improvement that the Ranker brings comes at the cost of some additional computation time.
|
111 |
+
"""
|
112 |
search_pipeline, index_pipeline = dense_passage_retrieval(
|
113 |
index=index,
|
114 |
+
split_word_length=split_word_length,
|
115 |
query_embedding_model=query_embedding_model,
|
116 |
passage_embedding_model=passage_embedding_model,
|
117 |
)
|
interface/components.py
CHANGED
@@ -40,13 +40,16 @@ def component_select_pipeline(container):
|
|
40 |
"name": selected_pipeline,
|
41 |
"search_pipeline": search_pipeline,
|
42 |
"index_pipeline": index_pipeline,
|
|
|
43 |
}
|
44 |
|
45 |
|
46 |
-
def component_show_pipeline(pipeline):
|
47 |
"""Draw the pipeline"""
|
48 |
with st.expander("Show pipeline"):
|
49 |
-
|
|
|
|
|
50 |
st.plotly_chart(fig, use_container_width=True)
|
51 |
|
52 |
|
|
|
40 |
"name": selected_pipeline,
|
41 |
"search_pipeline": search_pipeline,
|
42 |
"index_pipeline": index_pipeline,
|
43 |
+
"doc": pipeline_funcs[index_pipe].__doc__,
|
44 |
}
|
45 |
|
46 |
|
47 |
+
def component_show_pipeline(pipeline, pipeline_name):
|
48 |
"""Draw the pipeline"""
|
49 |
with st.expander("Show pipeline"):
|
50 |
+
if pipeline["doc"] is not None:
|
51 |
+
st.markdown(pipeline["doc"])
|
52 |
+
fig = get_pipeline_graph(pipeline[pipeline_name])
|
53 |
st.plotly_chart(fig, use_container_width=True)
|
54 |
|
55 |
|
interface/pages.py
CHANGED
@@ -44,7 +44,7 @@ def page_search(container):
|
|
44 |
## SEARCH ##
|
45 |
query = st.text_input("Query")
|
46 |
|
47 |
-
component_show_pipeline(st.session_state["pipeline"]
|
48 |
|
49 |
if st.button("Search"):
|
50 |
st.session_state["search_results"] = search(
|
@@ -61,7 +61,7 @@ def page_index(container):
|
|
61 |
with container:
|
62 |
st.title("Index time!")
|
63 |
|
64 |
-
component_show_pipeline(st.session_state["pipeline"]
|
65 |
|
66 |
input_funcs = {
|
67 |
"Raw Text": (component_text_input, "card-text"),
|
|
|
44 |
## SEARCH ##
|
45 |
query = st.text_input("Query")
|
46 |
|
47 |
+
component_show_pipeline(st.session_state["pipeline"], "search_pipeline")
|
48 |
|
49 |
if st.button("Search"):
|
50 |
st.session_state["search_results"] = search(
|
|
|
61 |
with container:
|
62 |
st.title("Index time!")
|
63 |
|
64 |
+
component_show_pipeline(st.session_state["pipeline"], "index_pipeline")
|
65 |
|
66 |
input_funcs = {
|
67 |
"Raw Text": (component_text_input, "card-text"),
|