"""
Haystack Pipelines
"""
import os
from pathlib import Path

from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes.audio.document_to_speech import DocumentToSpeech
from haystack.nodes.preprocessor import PreProcessor
from haystack.nodes.ranker import SentenceTransformersRanker
from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
# Directory for data written at runtime (e.g. generated audio files).
data_path = "data/"
os.makedirs(data_path, exist_ok=True)

# Module-level store, reused across keyword searches on the same index.
index = "documents"
document_store = InMemoryDocumentStore(index=index)
def keyword_search(
index="documents", split_word_length=100, top_k=10, audio_output=False
):
"""
**Keyword Search Pipeline**
Looks for words in the documents that match the query and scores them with TF-IDF.
TF-IDF is a commonly used baseline for information retrieval that exploits two key intuitions:
- Documents that have more lexical overlap with the query are more likely to be relevant
- Words that occur in fewer documents are more significant than words that occur in many documents
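    Schematically, a document d is scored against a query q as
    score(q, d) = sum over query terms t of tf(t, d) * idf(t)
    (a simplified sketch; the exact term weighting Haystack uses may differ).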
"""
    # Reuse the module-level store unless a different index is requested.
    global document_store
    if index != document_store.index:
        document_store = InMemoryDocumentStore(index=index)
    keyword_retriever = TfidfRetriever(document_store=document_store, top_k=top_k)
    # Clean the raw text and split it into passages of ~split_word_length words.
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=split_word_length,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )
# SEARCH PIPELINE
search_pipeline = Pipeline()
search_pipeline.add_node(keyword_retriever, name="TfidfRetriever", inputs=["Query"])
# INDEXING PIPELINE
index_pipeline = Pipeline()
index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
index_pipeline.add_node(
document_store, name="DocumentStore", inputs=["Preprocessor"]
)
    if audio_output:
        # Optionally convert the retrieved documents to speech with a TTS model.
        doc2speech = DocumentToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            generated_audio_dir=Path(data_path) / "audio",
        )
search_pipeline.add_node(
doc2speech, name="DocumentToSpeech", inputs=["TfidfRetriever"]
)
return search_pipeline, index_pipeline
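
# Example usage of keyword_search (a minimal sketch; the sample document and
# query are illustrative, not part of this module):
#
#   from haystack import Document
#   search_pipe, index_pipe = keyword_search(top_k=5)
#   index_pipe.run(documents=[Document(content="Berlin is the capital of Germany.")])
#   results = search_pipe.run(query="capital of Germany")
#   print(results["documents"])
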
def dense_passage_retrieval(
index="documents",
split_word_length=100,
query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
top_k=10,
audio_output=False,
):
"""
**Dense Passage Retrieval Pipeline**
Dense Passage Retrieval is a highly performant retrieval method that calculates relevance using dense representations. Key features:
- One BERT base model to encode documents
- One BERT base model to encode queries
- Ranking of documents done by dot product similarity between query and document embeddings
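    Schematically: score(q, d) = E_q(q) · E_d(d), the dot product of the two
    encoders' output embeddings.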
"""
    # A fresh store per call; passage embeddings are written during indexing.
    document_store = InMemoryDocumentStore(index=index)
dpr_retriever = DensePassageRetriever(
document_store=document_store,
query_embedding_model=query_embedding_model,
passage_embedding_model=passage_embedding_model,
top_k=top_k,
)
processor = PreProcessor(
clean_empty_lines=True,
clean_whitespace=True,
clean_header_footer=True,
split_by="word",
split_length=split_word_length,
split_respect_sentence_boundary=True,
split_overlap=0,
)
# SEARCH PIPELINE
search_pipeline = Pipeline()
search_pipeline.add_node(dpr_retriever, name="DPRRetriever", inputs=["Query"])
# INDEXING PIPELINE
index_pipeline = Pipeline()
index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
    # The retriever embeds each passage before it is written to the store.
    index_pipeline.add_node(dpr_retriever, name="DPRRetriever", inputs=["Preprocessor"])
index_pipeline.add_node(
document_store, name="DocumentStore", inputs=["DPRRetriever"]
)
if audio_output:
        doc2speech = DocumentToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            generated_audio_dir=Path(data_path) / "audio",
        )
search_pipeline.add_node(
doc2speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
)
return search_pipeline, index_pipeline
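
# Example usage of dense_passage_retrieval (a minimal sketch; unlike
# keyword_search, indexing also computes passage embeddings, so expect the DPR
# models to be downloaded on first run):
#
#   from haystack import Document
#   search_pipe, index_pipe = dense_passage_retrieval(top_k=5)
#   index_pipe.run(documents=[Document(content="Berlin is the capital of Germany.")])
#   results = search_pipe.run(query="What is the capital of Germany?")
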
def dense_passage_retrieval_ranker(
index="documents",
split_word_length=100,
query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
top_k=10,
audio_output=False,
):
"""
**Dense Passage Retrieval Ranker Pipeline**
It adds a Ranker to the `Dense Passage Retrieval Pipeline`.
- A Ranker reorders a set of Documents based on their relevance to the Query.
- It is particularly useful when your Retriever has high recall but poor relevance scoring.
- The improvement that the Ranker brings comes at the cost of some additional computation time.
"""
search_pipeline, index_pipeline = dense_passage_retrieval(
index=index,
split_word_length=split_word_length,
query_embedding_model=query_embedding_model,
passage_embedding_model=passage_embedding_model,
top_k=top_k,
)
    # Re-score the retrieved candidates with a cross-encoder before returning them.
    ranker = SentenceTransformersRanker(model_name_or_path=ranker_model, top_k=top_k)
    search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
if audio_output:
        doc2speech = DocumentToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            generated_audio_dir=Path(data_path) / "audio",
        )
search_pipeline.add_node(doc2speech, name="DocumentToSpeech", inputs=["Ranker"])
return search_pipeline, index_pipeline
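
# Example usage of dense_passage_retrieval_ranker (a minimal sketch): run it
# exactly like dense_passage_retrieval; the documents in the result arrive
# re-ordered by the cross-encoder's relevance scores.
#
#   search_pipe, index_pipe = dense_passage_retrieval_ranker(top_k=3)
#   results = search_pipe.run(query="What is the capital of Germany?")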