Spaces:

ugaray96
/

neural-search

Running

File size: 5,593 Bytes

"""
Haystack Pipelines
"""

from pathlib import Path
from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
from haystack.nodes.preprocessor import PreProcessor
from haystack.nodes.ranker import SentenceTransformersRanker
from haystack.nodes.audio.document_to_speech import DocumentToSpeech
import os

data_path = "data/"
os.makedirs(data_path, exist_ok=True)


def keyword_search(index="documents", split_word_length=100, audio_output=False):
    """
    **Keyword Search Pipeline**

    It looks for words in the documents that match the query by using TF-IDF.

    TF-IDF is a commonly used baseline for information retrieval that exploits two key intuitions:

      - Documents that have more lexical overlap with the query are more likely to be relevant
      - Words that occur in fewer documents are more significant than words that occur in many documents
    """
    document_store = InMemoryDocumentStore(index=index)
    keyword_retriever = TfidfRetriever(document_store=(document_store))
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=split_word_length,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )
    # SEARCH PIPELINE
    search_pipeline = Pipeline()
    search_pipeline.add_node(keyword_retriever, name="TfidfRetriever", inputs=["Query"])

    # INDEXING PIPELINE
    index_pipeline = Pipeline()
    index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
    index_pipeline.add_node(
        document_store, name="DocumentStore", inputs=["Preprocessor"]
    )

    if audio_output:
        doc2speech = DocumentToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            generated_audio_dir=Path(data_path + "audio"),
        )
        search_pipeline.add_node(
            doc2speech, name="DocumentToSpeech", inputs=["TfidfRetriever"]
        )

    return search_pipeline, index_pipeline


def dense_passage_retrieval(
    index="documents",
    split_word_length=100,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    audio_output=False,
):
    """
    **Dense Passage Retrieval Pipeline**

    Dense Passage Retrieval is a highly performant retrieval method that calculates relevance using dense representations. Key features:

      - One BERT base model to encode documents
      - One BERT base model to encode queries
      - Ranking of documents done by dot product similarity between query and document embeddings
    """
    document_store = InMemoryDocumentStore(index=index)
    dpr_retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model=query_embedding_model,
        passage_embedding_model=passage_embedding_model,
    )
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=split_word_length,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )
    # SEARCH PIPELINE
    search_pipeline = Pipeline()
    search_pipeline.add_node(dpr_retriever, name="DPRRetriever", inputs=["Query"])

    # INDEXING PIPELINE
    index_pipeline = Pipeline()
    index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
    index_pipeline.add_node(dpr_retriever, name="DPRRetriever", inputs=["Preprocessor"])
    index_pipeline.add_node(
        document_store, name="DocumentStore", inputs=["DPRRetriever"]
    )

    if audio_output:
        doc2speech = DocumentToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            generated_audio_dir=Path(data_path + "audio"),
        )
        search_pipeline.add_node(
            doc2speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
        )

    return search_pipeline, index_pipeline


def dense_passage_retrieval_ranker(
    index="documents",
    split_word_length=100,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
    audio_output=False,
):
    """
    **Dense Passage Retrieval Ranker Pipeline**

    It adds a Ranker to the `Dense Passage Retrieval Pipeline`.

      - A Ranker reorders a set of Documents based on their relevance to the Query.
      - It is particularly useful when your Retriever has high recall but poor relevance scoring.
      - The improvement that the Ranker brings comes at the cost of some additional computation time.
    """
    search_pipeline, index_pipeline = dense_passage_retrieval(
        index=index,
        split_word_length=split_word_length,
        query_embedding_model=query_embedding_model,
        passage_embedding_model=passage_embedding_model,
    )
    ranker = SentenceTransformersRanker(model_name_or_path=ranker_model)

    search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
    
    if audio_output:
        doc2speech = DocumentToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            generated_audio_dir=Path(data_path + "audio"),
        )
        search_pipeline.add_node(
            doc2speech, name="DocumentToSpeech", inputs=["Ranker"]
        )

    return search_pipeline, index_pipeline