Update rag.py
Browse files
rag.py
CHANGED
@@ -1,229 +1,230 @@
|
|
1 |
-
"This file contains the implementation of the RAG pipeline."
|
2 |
-
|
3 |
-
from pathlib import Path
|
4 |
-
|
5 |
-
from haystack import Pipeline
|
6 |
-
from haystack.components.builders import PromptBuilder
|
7 |
-
from haystack.components.converters import MarkdownToDocument
|
8 |
-
from haystack.components.embedders import (
|
9 |
-
SentenceTransformersDocumentEmbedder,
|
10 |
-
SentenceTransformersTextEmbedder,
|
11 |
-
)
|
12 |
-
from haystack.components.generators import HuggingFaceAPIGenerator
|
13 |
-
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
|
14 |
-
from haystack.components.retrievers import InMemoryEmbeddingRetriever
|
15 |
-
from haystack.components.writers import DocumentWriter
|
16 |
-
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
17 |
-
from haystack.utils import Secret
|
18 |
-
|
19 |
-
# Define the paths to the document and the model for embedding the documents and the user query
|
20 |
-
DOCUMENT_PATH = Path("gender_document.md")
|
21 |
-
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
|
22 |
-
|
23 |
-
|
24 |
-
def process_document(document_store: InMemoryDocumentStore) -> Pipeline:
|
25 |
-
"""This function processes the document and stores it in the document store.
|
26 |
-
It contains of the following components:
|
27 |
-
- MarkdownToDocument: Converts the markdown file to a document (https://docs.haystack.deepset.ai/docs/markdowntodocument)
|
28 |
-
- DocumentCleaner: Cleans the document (https://docs.haystack.deepset.ai/docs/documentcleaner)
|
29 |
-
- DocumentSplitter: Splits the document into chunks (https://docs.haystack.deepset.ai/docs/documentsplitter)
|
30 |
-
- DocumentWriter: Writes the document to the document store (https://docs.haystack.deepset.ai/docs/documentwriter)
|
31 |
-
- SentenceTransformersDocumentEmbedder: Embeds the documents, more precisely the chunks (https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder)
|
32 |
-
|
33 |
-
|
34 |
-
Parameters
|
35 |
-
----------
|
36 |
-
document_store : InMemoryDocumentStore
|
37 |
-
The document store where the processed document should be stored.
|
38 |
-
|
39 |
-
Returns
|
40 |
-
-------
|
41 |
-
Pipeline
|
42 |
-
The pipeline containing the components to parse, clean, split, embed and write the document to the document store.
|
43 |
-
To run the pipeline, you can use the `pipeline.run()` method. If a component needs input or arguments, you can pass them as a dictionary to the `run()` method.
|
44 |
-
For example: `pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}})`.
|
45 |
-
"""
|
46 |
-
|
47 |
-
# initialize the pipeline
|
48 |
-
pipeline = Pipeline()
|
49 |
-
|
50 |
-
# add the components to the pipeline. If you want to add more components, you can do it here.
|
51 |
-
# If you want to the settings of the components, you can do it here.
|
52 |
-
# MarkdownToDocument
|
53 |
-
pipeline.add_component("converter", MarkdownToDocument())
|
54 |
-
|
55 |
-
# DocumentCleaner
|
56 |
-
pipeline.add_component("cleaner", DocumentCleaner())
|
57 |
-
|
58 |
-
# DocumentSplitter
|
59 |
-
pipeline.add_component(
|
60 |
-
"splitter",
|
61 |
-
DocumentSplitter(
|
62 |
-
split_by="word", split_length=300, respect_sentence_boundary=True
|
63 |
-
),
|
64 |
-
)
|
65 |
-
|
66 |
-
# DocumentWriter
|
67 |
-
pipeline.add_component("writer", DocumentWriter(document_store=document_store))
|
68 |
-
|
69 |
-
# SentenceTransformersDocumentEmbedder
|
70 |
-
pipeline.add_component(
|
71 |
-
"embedder",
|
72 |
-
SentenceTransformersDocumentEmbedder(
|
73 |
-
EMBEDDING_MODEL,
|
74 |
-
),
|
75 |
-
)
|
76 |
-
|
77 |
-
# connect the components
|
78 |
-
pipeline.connect("converter", "cleaner")
|
79 |
-
pipeline.connect("cleaner", "splitter")
|
80 |
-
pipeline.connect("splitter", "embedder")
|
81 |
-
pipeline.connect("embedder", "writer")
|
82 |
-
return pipeline
|
83 |
-
|
84 |
-
|
85 |
-
def load_document_store(document_store_settings: dict) -> InMemoryDocumentStore:
|
86 |
-
"""This function loads the document store with the given settings.
|
87 |
-
|
88 |
-
Parameters
|
89 |
-
----------
|
90 |
-
document_store_settings : dict
|
91 |
-
The settings for the document store. The settings are passed as a dictionary.
|
92 |
-
You can find the available settings here: https://docs.haystack.deepset.ai/docs/inmemorydocumentstore
|
93 |
-
|
94 |
-
Returns
|
95 |
-
-------
|
96 |
-
InMemoryDocumentStore
|
97 |
-
_description_
|
98 |
-
"""
|
99 |
-
document_store = InMemoryDocumentStore(**document_store_settings)
|
100 |
-
return document_store
|
101 |
-
|
102 |
-
|
103 |
-
def get_query_pipeline(
|
104 |
-
document_store: InMemoryDocumentStore, generator: HuggingFaceAPIGenerator
|
105 |
-
) -> Pipeline:
|
106 |
-
"""
|
107 |
-
This function creates a query pipeline that contains the following components:
|
108 |
-
- SentenceTransformersTextEmbedder: Embeds the user query (https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder)
|
109 |
-
- InMemoryEmbeddingRetriever: Retrieves the most similar documents to the user query (https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever)
|
110 |
-
- PromptBuilder: Builds the prompt for the generator (https://docs.haystack.deepset.ai/docs/promptbuilder)
|
111 |
-
- HuggingFaceAPIGenerator: Generates the answer to the user query (https://docs.haystack.deepset.ai/docs/huggingfaceapigenerator)
|
112 |
-
|
113 |
-
Parameters
|
114 |
-
----------
|
115 |
-
document_store : InMemoryDocumentStore
|
116 |
-
The document store where the documents are stored.
|
117 |
-
llm_provider : HuggingFaceAPIGenerator
|
118 |
-
The llm_provider that generates the answer to the user query.
|
119 |
-
|
120 |
-
Returns
|
121 |
-
-------
|
122 |
-
Pipeline
|
123 |
-
The query pipeline containing the components to embed the user query, retrieve the most similar documents, build the prompt and generate the answer.
|
124 |
-
"""
|
125 |
-
|
126 |
-
# initialize the query pipeline
|
127 |
-
query_pipeline = Pipeline()
|
128 |
-
|
129 |
-
# add the components to the query pipeline
|
130 |
-
# SentenceTransformersTextEmbedder
|
131 |
-
query_pipeline.add_component(
|
132 |
-
"text_embedder", SentenceTransformersTextEmbedder(EMBEDDING_MODEL)
|
133 |
-
)
|
134 |
-
|
135 |
-
# InMemoryEmbeddingRetriever
|
136 |
-
query_pipeline.add_component(
|
137 |
-
"retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=10)
|
138 |
-
)
|
139 |
-
|
140 |
-
# template for the PromptBuilder
|
141 |
-
template = """
|
142 |
-
Given the following information, answer the question. If the information is insufficient, answer with "Answer is not possible". Please do not provide any additional information and ask clarifying questions.
|
143 |
-
|
144 |
-
Context:
|
145 |
-
{% for document in documents %}
|
146 |
-
{{ document.content }}
|
147 |
-
{% endfor %}
|
148 |
-
|
149 |
-
Question: {{ query }}?
|
150 |
-
"""
|
151 |
-
|
152 |
-
# PromptBuilder
|
153 |
-
query_pipeline.add_component("prompt_builder", PromptBuilder(template=template))
|
154 |
-
|
155 |
-
# HuggingFaceAPIGenerator
|
156 |
-
query_pipeline.add_component("llm", generator)
|
157 |
-
|
158 |
-
# connect the components
|
159 |
-
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
|
160 |
-
query_pipeline.connect("retriever", "prompt_builder.documents")
|
161 |
-
query_pipeline.connect("prompt_builder", "llm")
|
162 |
-
return query_pipeline
|
163 |
-
|
164 |
-
|
165 |
-
def init_generator() -> HuggingFaceAPIGenerator:
|
166 |
-
"""This function initializes the HuggingFaceAPIGenerator with the given settings.
|
167 |
-
You can find the available models here: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending
|
168 |
-
Please note that you need to provide a valid token to use the HuggingFaceAPIGenerator.
|
169 |
-
For testing purposes, you can hardcode the token in the script.
|
170 |
-
For deployment on Hugging Face Spaces, please safe the token as a secret (Settings -> Secrets) and load it with `Secret.from_env_var("your_token_name")`.
|
171 |
-
|
172 |
-
Returns
|
173 |
-
-------
|
174 |
-
HuggingFaceAPIGenerator
|
175 |
-
_description_
|
176 |
-
"""
|
177 |
-
|
178 |
-
# initialize the HuggingFaceAPIGenerator
|
179 |
-
llm_provider = HuggingFaceAPIGenerator(
|
180 |
-
api_type="serverless_inference_api",
|
181 |
-
api_params={"model": "HuggingFaceH4/zephyr-7b-beta", "stop": ["Question"]},
|
182 |
-
token=Secret.from_token(""),
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
|
|
|
1 |
+
"This file contains the implementation of the RAG pipeline."
|
2 |
+
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
from haystack import Pipeline
|
6 |
+
from haystack.components.builders import PromptBuilder
|
7 |
+
from haystack.components.converters import MarkdownToDocument
|
8 |
+
from haystack.components.embedders import (
|
9 |
+
SentenceTransformersDocumentEmbedder,
|
10 |
+
SentenceTransformersTextEmbedder,
|
11 |
+
)
|
12 |
+
from haystack.components.generators import HuggingFaceAPIGenerator
|
13 |
+
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
|
14 |
+
from haystack.components.retrievers import InMemoryEmbeddingRetriever
|
15 |
+
from haystack.components.writers import DocumentWriter
|
16 |
+
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
17 |
+
from haystack.utils import Secret
|
18 |
+
|
19 |
+
# Define the paths to the document and the model for embedding the documents and the user query
|
20 |
+
DOCUMENT_PATH = Path("gender_document.md")
|
21 |
+
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
|
22 |
+
|
23 |
+
|
24 |
+
def process_document(document_store: InMemoryDocumentStore) -> Pipeline:
    """Build the indexing pipeline that processes the document and stores it in the document store.

    It consists of the following components:
    - MarkdownToDocument: Converts the markdown file to a document (https://docs.haystack.deepset.ai/docs/markdowntodocument)
    - DocumentCleaner: Cleans the document (https://docs.haystack.deepset.ai/docs/documentcleaner)
    - DocumentSplitter: Splits the document into chunks (https://docs.haystack.deepset.ai/docs/documentsplitter)
    - DocumentWriter: Writes the document to the document store (https://docs.haystack.deepset.ai/docs/documentwriter)
    - SentenceTransformersDocumentEmbedder: Embeds the documents, more precisely the chunks (https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder)

    Parameters
    ----------
    document_store : InMemoryDocumentStore
        The document store where the processed document should be stored.

    Returns
    -------
    Pipeline
        The pipeline containing the components to parse, clean, split, embed and write the document to the document store.
        To run the pipeline, you can use the `pipeline.run()` method. If a component needs input or arguments, you can pass them as a dictionary to the `run()` method.
        For example: `pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}})`.
    """
    # initialize the pipeline
    pipeline = Pipeline()

    # add the components to the pipeline. If you want to add more components, you can do it here.
    # If you want to change the settings of the components, you can do it here.
    # MarkdownToDocument
    pipeline.add_component("converter", MarkdownToDocument())

    # DocumentCleaner
    pipeline.add_component("cleaner", DocumentCleaner())

    # DocumentSplitter: ~300-word chunks, not cutting through sentences
    pipeline.add_component(
        "splitter",
        DocumentSplitter(
            split_by="word", split_length=300, respect_sentence_boundary=True
        ),
    )

    # DocumentWriter
    pipeline.add_component("writer", DocumentWriter(document_store=document_store))

    # SentenceTransformersDocumentEmbedder
    pipeline.add_component(
        "embedder",
        SentenceTransformersDocumentEmbedder(
            model=EMBEDDING_MODEL,
        ),
    )

    # connect the components: convert -> clean -> split -> embed -> write
    pipeline.connect("converter", "cleaner")
    pipeline.connect("cleaner", "splitter")
    pipeline.connect("splitter", "embedder")
    pipeline.connect("embedder", "writer")
    return pipeline
|
83 |
+
|
84 |
+
|
85 |
+
def load_document_store(document_store_settings: dict) -> InMemoryDocumentStore:
    """Load the document store with the given settings.

    Parameters
    ----------
    document_store_settings : dict
        The settings for the document store. The settings are passed as a dictionary.
        You can find the available settings here: https://docs.haystack.deepset.ai/docs/inmemorydocumentstore

    Returns
    -------
    InMemoryDocumentStore
        An in-memory document store configured with the given settings.
    """
    document_store = InMemoryDocumentStore(**document_store_settings)
    return document_store
|
101 |
+
|
102 |
+
|
103 |
+
def get_query_pipeline(
    document_store: InMemoryDocumentStore, generator: HuggingFaceAPIGenerator
) -> Pipeline:
    """Create a query pipeline that contains the following components:

    - SentenceTransformersTextEmbedder: Embeds the user query (https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder)
    - InMemoryEmbeddingRetriever: Retrieves the most similar documents to the user query (https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever)
    - PromptBuilder: Builds the prompt for the generator (https://docs.haystack.deepset.ai/docs/promptbuilder)
    - HuggingFaceAPIGenerator: Generates the answer to the user query (https://docs.haystack.deepset.ai/docs/huggingfaceapigenerator)

    Parameters
    ----------
    document_store : InMemoryDocumentStore
        The document store where the documents are stored.
    generator : HuggingFaceAPIGenerator
        The generator that produces the answer to the user query.

    Returns
    -------
    Pipeline
        The query pipeline containing the components to embed the user query, retrieve the most similar documents, build the prompt and generate the answer.
    """
    # initialize the query pipeline
    query_pipeline = Pipeline()

    # add the components to the query pipeline
    # SentenceTransformersTextEmbedder: must use the same model as the document embedder
    query_pipeline.add_component(
        "text_embedder", SentenceTransformersTextEmbedder(EMBEDDING_MODEL)
    )

    # InMemoryEmbeddingRetriever: returns the 10 most similar chunks
    query_pipeline.add_component(
        "retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=10)
    )

    # template for the PromptBuilder (Jinja2 syntax; `documents` comes from the retriever, `query` from the caller)
    template = """
    Given the following information, answer the question. If the information is insufficient, answer with "Answer is not possible". Please do not provide any additional information and ask clarifying questions.

    Context:
    {% for document in documents %}
    {{ document.content }}
    {% endfor %}

    Question: {{ query }}?
    """

    # PromptBuilder
    query_pipeline.add_component("prompt_builder", PromptBuilder(template=template))

    # HuggingFaceAPIGenerator
    query_pipeline.add_component("llm", generator)

    # connect the components: embed query -> retrieve -> build prompt -> generate
    query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
    query_pipeline.connect("retriever", "prompt_builder.documents")
    query_pipeline.connect("prompt_builder", "llm")
    return query_pipeline
|
163 |
+
|
164 |
+
|
165 |
+
def init_generator() -> HuggingFaceAPIGenerator:
    """Initialize the HuggingFaceAPIGenerator with the given settings.

    You can find the available models here: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending
    Please note that you need to provide a valid token to use the HuggingFaceAPIGenerator.
    For testing purposes, you can hardcode the token in the script with `Secret.from_token(...)`.
    For deployment on Hugging Face Spaces, please save the token as a secret (Settings -> Secrets) and load it with `Secret.from_env_var("your_token_name")`.

    Returns
    -------
    HuggingFaceAPIGenerator
        The generator configured for the serverless Inference API; it stops generating at "Question"
        so the model does not invent follow-up questions.
    """
    # initialize the HuggingFaceAPIGenerator; the token is read from the "hftoken" environment variable
    llm_provider = HuggingFaceAPIGenerator(
        api_type="serverless_inference_api",
        api_params={"model": "HuggingFaceH4/zephyr-7b-beta", "stop": ["Question"]},
        token=Secret.from_env_var("hftoken"),
    )
    return llm_provider
|
186 |
+
|
187 |
+
|
188 |
+
def rag_pipeline() -> Pipeline:
    """Wrap the whole RAG pipeline.

    Loads the document store, processes the document, initializes the generator and
    creates the query pipeline.

    Returns
    -------
    Pipeline
        The RAG pipeline containing the components to process the document and generate
        the answer to the user query. It is enough to import and load this function for the chat application.
        You can run the pipeline with the `pipeline.run()` method.
        If a component needs input or arguments, you can pass them as a dictionary to the `run()` method.
        For example:
        result = rag.run(
            {"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}},
        )
        For debugging purposes, you can include the outputs for example from the retriever:
        result = rag.run(
            {"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}},
            include_outputs_from=["retriever", "llm"],
        )
    """
    # document store configured to compare embeddings by cosine similarity
    store = load_document_store({"embedding_similarity_function": "cosine"})

    # build the indexing pipeline and run it once to fill the store
    indexing = process_document(document_store=store)
    indexing.run({"converter": {"sources": [DOCUMENT_PATH]}})

    # assemble the query pipeline around a freshly initialized generator
    return get_query_pipeline(document_store=store, generator=init_generator())
|