File size: 5,593 Bytes
01b8e8e
 
 
 
6bb1fd5
01b8e8e
 
101be32
01b8e8e
f456ef3
6bb1fd5
 
01b8e8e
42468fb
6bb1fd5
39503cb
42468fb
6bb1fd5
5634055
 
 
 
 
 
 
 
 
 
101be32
 
 
 
 
 
 
5634055
101be32
 
 
 
 
 
 
 
 
 
39503cb
27e0350
101be32
42468fb
6bb1fd5
 
 
42468fb
 
 
 
6bb1fd5
101be32
 
 
39503cb
01b8e8e
39503cb
5634055
01b8e8e
 
0f09d43
01b8e8e
5634055
 
 
 
 
 
 
 
 
01b8e8e
 
 
 
 
 
 
 
 
 
 
5634055
01b8e8e
 
 
 
 
 
 
 
 
 
 
 
 
 
0f09d43
cfc1673
 
 
 
 
 
 
 
01b8e8e
101be32
f456ef3
304cf45
f456ef3
 
5634055
f456ef3
 
304cf45
0f09d43
f456ef3
5634055
 
 
 
 
 
 
 
 
f456ef3
 
5634055
f456ef3
 
 
 
304cf45
f456ef3
0f09d43
 
 
 
 
 
 
 
 
304cf45
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
Haystack Pipelines
"""

from pathlib import Path
from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
from haystack.nodes.preprocessor import PreProcessor
from haystack.nodes.ranker import SentenceTransformersRanker
from haystack.nodes.audio.document_to_speech import DocumentToSpeech
import os

# Base directory for files produced at runtime; the pipeline factories below
# place generated audio under it. It is created at import time (no error if it
# already exists). Kept as a plain string with a trailing slash because the
# code below concatenates onto it.
data_path = "data/"
os.makedirs(name=data_path, exist_ok=True)


def keyword_search(index="documents", split_word_length=100, audio_output=False):
    """
    **Keyword Search Pipeline**

    It looks for words in the documents that match the query by using TF-IDF.

    TF-IDF is a commonly used baseline for information retrieval that exploits two key intuitions:

      - Documents that have more lexical overlap with the query are more likely to be relevant
      - Words that occur in fewer documents are more significant than words that occur in many documents
    """
    # NOTE(review): the docstring above is Markdown-formatted and may be shown
    # verbatim to users -- confirm before rewording it.
    #
    # Parameters:
    #   index: index name handed to the in-memory document store.
    #   split_word_length: chunk size, in words, used by the preprocessor.
    #   audio_output: when truthy, a DocumentToSpeech node is appended to the
    #       search pipeline after the retriever.
    # Returns:
    #   A (search_pipeline, index_pipeline) tuple; both pipelines share the
    #   same document store instance.
    document_store = InMemoryDocumentStore(index=index)
    keyword_retriever = TfidfRetriever(document_store=document_store)
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=split_word_length,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )

    # SEARCH PIPELINE: Query -> TfidfRetriever [-> DocumentToSpeech]
    search_pipeline = Pipeline()
    search_pipeline.add_node(keyword_retriever, name="TfidfRetriever", inputs=["Query"])

    # INDEXING PIPELINE: File -> Preprocessor -> DocumentStore
    index_pipeline = Pipeline()
    index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
    index_pipeline.add_node(
        document_store, name="DocumentStore", inputs=["Preprocessor"]
    )

    if audio_output:
        # Join with pathlib rather than string concatenation so the result
        # does not depend on data_path ending with a slash.
        doc2speech = DocumentToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            generated_audio_dir=Path(data_path) / "audio",
        )
        search_pipeline.add_node(
            doc2speech, name="DocumentToSpeech", inputs=["TfidfRetriever"]
        )

    return search_pipeline, index_pipeline


def dense_passage_retrieval(
    index="documents",
    split_word_length=100,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    audio_output=False,
):
    """
    **Dense Passage Retrieval Pipeline**

    Dense Passage Retrieval is a highly performant retrieval method that calculates relevance using dense representations. Key features:

      - One BERT base model to encode documents
      - One BERT base model to encode queries
      - Ranking of documents done by dot product similarity between query and document embeddings
    """
    # NOTE(review): the docstring above is Markdown-formatted and may be shown
    # verbatim to users -- confirm before rewording it.
    #
    # Parameters:
    #   index: index name handed to the in-memory document store.
    #   split_word_length: chunk size, in words, used by the preprocessor.
    #   query_embedding_model: model name/path passed to the retriever for
    #       encoding queries.
    #   passage_embedding_model: model name/path passed to the retriever for
    #       encoding passages.
    #   audio_output: when truthy, a DocumentToSpeech node is appended to the
    #       search pipeline after the retriever.
    # Returns:
    #   A (search_pipeline, index_pipeline) tuple; both pipelines share the
    #   same document store and retriever instances.
    document_store = InMemoryDocumentStore(index=index)
    dpr_retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model=query_embedding_model,
        passage_embedding_model=passage_embedding_model,
    )
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=split_word_length,
        split_respect_sentence_boundary=True,
        split_overlap=0,
    )

    # SEARCH PIPELINE: Query -> DPRRetriever [-> DocumentToSpeech]
    search_pipeline = Pipeline()
    search_pipeline.add_node(dpr_retriever, name="DPRRetriever", inputs=["Query"])

    # INDEXING PIPELINE: File -> Preprocessor -> DPRRetriever -> DocumentStore
    # The retriever sits between the preprocessor and the store; presumably so
    # that passages are embedded before being written -- verify against the
    # Haystack retriever indexing documentation.
    index_pipeline = Pipeline()
    index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
    index_pipeline.add_node(dpr_retriever, name="DPRRetriever", inputs=["Preprocessor"])
    index_pipeline.add_node(
        document_store, name="DocumentStore", inputs=["DPRRetriever"]
    )

    if audio_output:
        # Join with pathlib rather than string concatenation so the result
        # does not depend on data_path ending with a slash.
        doc2speech = DocumentToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            generated_audio_dir=Path(data_path) / "audio",
        )
        search_pipeline.add_node(
            doc2speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
        )

    return search_pipeline, index_pipeline


def dense_passage_retrieval_ranker(
    index="documents",
    split_word_length=100,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
    audio_output=False,
):
    """
    **Dense Passage Retrieval Ranker Pipeline**

    It adds a Ranker to the `Dense Passage Retrieval Pipeline`.

      - A Ranker reorders a set of Documents based on their relevance to the Query.
      - It is particularly useful when your Retriever has high recall but poor relevance scoring.
      - The improvement that the Ranker brings comes at the cost of some additional computation time.
    """
    # NOTE(review): the docstring above is Markdown-formatted and may be shown
    # verbatim to users -- confirm before rewording it.
    #
    # Parameters:
    #   index, split_word_length, query_embedding_model,
    #   passage_embedding_model: forwarded unchanged to
    #       dense_passage_retrieval().
    #   ranker_model: model name/path passed to SentenceTransformersRanker.
    #   audio_output: when truthy, a DocumentToSpeech node is appended to the
    #       search pipeline after the ranker.
    # Returns:
    #   A (search_pipeline, index_pipeline) tuple. The index pipeline is the
    #   one built by dense_passage_retrieval(); the search pipeline is that
    #   function's search pipeline with the extra node(s) added.

    # audio_output is deliberately NOT forwarded here: the speech node must be
    # attached after the Ranker (below), not directly after the retriever.
    search_pipeline, index_pipeline = dense_passage_retrieval(
        index=index,
        split_word_length=split_word_length,
        query_embedding_model=query_embedding_model,
        passage_embedding_model=passage_embedding_model,
    )

    # SEARCH PIPELINE: Query -> DPRRetriever -> Ranker [-> DocumentToSpeech]
    ranker = SentenceTransformersRanker(model_name_or_path=ranker_model)
    search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])

    if audio_output:
        # Join with pathlib rather than string concatenation so the result
        # does not depend on data_path ending with a slash.
        doc2speech = DocumentToSpeech(
            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
            generated_audio_dir=Path(data_path) / "audio",
        )
        search_pipeline.add_node(
            doc2speech, name="DocumentToSpeech", inputs=["Ranker"]
        )

    return search_pipeline, index_pipeline