VanessaHochwald committed on
Commit
88c2f0a
·
verified ·
1 Parent(s): 08eb4c6

Update rag.py

Browse files
Files changed (1) hide show
  1. rag.py +230 -229
rag.py CHANGED
@@ -1,229 +1,230 @@
1
- "This file contains the implementation of the RAG pipeline."
2
-
3
- from pathlib import Path
4
-
5
- from haystack import Pipeline
6
- from haystack.components.builders import PromptBuilder
7
- from haystack.components.converters import MarkdownToDocument
8
- from haystack.components.embedders import (
9
- SentenceTransformersDocumentEmbedder,
10
- SentenceTransformersTextEmbedder,
11
- )
12
- from haystack.components.generators import HuggingFaceAPIGenerator
13
- from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
14
- from haystack.components.retrievers import InMemoryEmbeddingRetriever
15
- from haystack.components.writers import DocumentWriter
16
- from haystack.document_stores.in_memory import InMemoryDocumentStore
17
- from haystack.utils import Secret
18
-
19
- # Define the paths to the document and the model for embedding the documents and the user query
20
- DOCUMENT_PATH = Path("gender_document.md")
21
- EMBEDDING_MODEL = "all-MiniLM-L6-v2"
22
-
23
-
24
- def process_document(document_store: InMemoryDocumentStore) -> Pipeline:
25
- """This function processes the document and stores it in the document store.
26
- It contains of the following components:
27
- - MarkdownToDocument: Converts the markdown file to a document (https://docs.haystack.deepset.ai/docs/markdowntodocument)
28
- - DocumentCleaner: Cleans the document (https://docs.haystack.deepset.ai/docs/documentcleaner)
29
- - DocumentSplitter: Splits the document into chunks (https://docs.haystack.deepset.ai/docs/documentsplitter)
30
- - DocumentWriter: Writes the document to the document store (https://docs.haystack.deepset.ai/docs/documentwriter)
31
- - SentenceTransformersDocumentEmbedder: Embeds the documents, more precisely the chunks (https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder)
32
-
33
-
34
- Parameters
35
- ----------
36
- document_store : InMemoryDocumentStore
37
- The document store where the processed document should be stored.
38
-
39
- Returns
40
- -------
41
- Pipeline
42
- The pipeline containing the components to parse, clean, split, embed and write the document to the document store.
43
- To run the pipeline, you can use the `pipeline.run()` method. If a component needs input or arguments, you can pass them as a dictionary to the `run()` method.
44
- For example: `pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}})`.
45
- """
46
-
47
- # initialize the pipeline
48
- pipeline = Pipeline()
49
-
50
- # add the components to the pipeline. If you want to add more components, you can do it here.
51
- # If you want to the settings of the components, you can do it here.
52
- # MarkdownToDocument
53
- pipeline.add_component("converter", MarkdownToDocument())
54
-
55
- # DocumentCleaner
56
- pipeline.add_component("cleaner", DocumentCleaner())
57
-
58
- # DocumentSplitter
59
- pipeline.add_component(
60
- "splitter",
61
- DocumentSplitter(
62
- split_by="word", split_length=300, respect_sentence_boundary=True
63
- ),
64
- )
65
-
66
- # DocumentWriter
67
- pipeline.add_component("writer", DocumentWriter(document_store=document_store))
68
-
69
- # SentenceTransformersDocumentEmbedder
70
- pipeline.add_component(
71
- "embedder",
72
- SentenceTransformersDocumentEmbedder(
73
- EMBEDDING_MODEL,
74
- ),
75
- )
76
-
77
- # connect the components
78
- pipeline.connect("converter", "cleaner")
79
- pipeline.connect("cleaner", "splitter")
80
- pipeline.connect("splitter", "embedder")
81
- pipeline.connect("embedder", "writer")
82
- return pipeline
83
-
84
-
85
- def load_document_store(document_store_settings: dict) -> InMemoryDocumentStore:
86
- """This function loads the document store with the given settings.
87
-
88
- Parameters
89
- ----------
90
- document_store_settings : dict
91
- The settings for the document store. The settings are passed as a dictionary.
92
- You can find the available settings here: https://docs.haystack.deepset.ai/docs/inmemorydocumentstore
93
-
94
- Returns
95
- -------
96
- InMemoryDocumentStore
97
- _description_
98
- """
99
- document_store = InMemoryDocumentStore(**document_store_settings)
100
- return document_store
101
-
102
-
103
- def get_query_pipeline(
104
- document_store: InMemoryDocumentStore, generator: HuggingFaceAPIGenerator
105
- ) -> Pipeline:
106
- """
107
- This function creates a query pipeline that contains the following components:
108
- - SentenceTransformersTextEmbedder: Embeds the user query (https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder)
109
- - InMemoryEmbeddingRetriever: Retrieves the most similar documents to the user query (https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever)
110
- - PromptBuilder: Builds the prompt for the generator (https://docs.haystack.deepset.ai/docs/promptbuilder)
111
- - HuggingFaceAPIGenerator: Generates the answer to the user query (https://docs.haystack.deepset.ai/docs/huggingfaceapigenerator)
112
-
113
- Parameters
114
- ----------
115
- document_store : InMemoryDocumentStore
116
- The document store where the documents are stored.
117
- llm_provider : HuggingFaceAPIGenerator
118
- The llm_provider that generates the answer to the user query.
119
-
120
- Returns
121
- -------
122
- Pipeline
123
- The query pipeline containing the components to embed the user query, retrieve the most similar documents, build the prompt and generate the answer.
124
- """
125
-
126
- # initialize the query pipeline
127
- query_pipeline = Pipeline()
128
-
129
- # add the components to the query pipeline
130
- # SentenceTransformersTextEmbedder
131
- query_pipeline.add_component(
132
- "text_embedder", SentenceTransformersTextEmbedder(EMBEDDING_MODEL)
133
- )
134
-
135
- # InMemoryEmbeddingRetriever
136
- query_pipeline.add_component(
137
- "retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=10)
138
- )
139
-
140
- # template for the PromptBuilder
141
- template = """
142
- Given the following information, answer the question. If the information is insufficient, answer with "Answer is not possible". Please do not provide any additional information and ask clarifying questions.
143
-
144
- Context:
145
- {% for document in documents %}
146
- {{ document.content }}
147
- {% endfor %}
148
-
149
- Question: {{ query }}?
150
- """
151
-
152
- # PromptBuilder
153
- query_pipeline.add_component("prompt_builder", PromptBuilder(template=template))
154
-
155
- # HuggingFaceAPIGenerator
156
- query_pipeline.add_component("llm", generator)
157
-
158
- # connect the components
159
- query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
160
- query_pipeline.connect("retriever", "prompt_builder.documents")
161
- query_pipeline.connect("prompt_builder", "llm")
162
- return query_pipeline
163
-
164
-
165
- def init_generator() -> HuggingFaceAPIGenerator:
166
- """This function initializes the HuggingFaceAPIGenerator with the given settings.
167
- You can find the available models here: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending
168
- Please note that you need to provide a valid token to use the HuggingFaceAPIGenerator.
169
- For testing purposes, you can hardcode the token in the script.
170
- For deployment on Hugging Face Spaces, please safe the token as a secret (Settings -> Secrets) and load it with `Secret.from_env_var("your_token_name")`.
171
-
172
- Returns
173
- -------
174
- HuggingFaceAPIGenerator
175
- _description_
176
- """
177
-
178
- # initialize the HuggingFaceAPIGenerator
179
- llm_provider = HuggingFaceAPIGenerator(
180
- api_type="serverless_inference_api",
181
- api_params={"model": "HuggingFaceH4/zephyr-7b-beta", "stop": ["Question"]},
182
- token=Secret.from_token(""),
183
- )
184
- return llm_provider
185
-
186
-
187
- def rag_pipeline() -> Pipeline:
188
- """This function wraps the whole RAG pipeline.
189
- It loads the document store, processes the document, initializes the generator and
190
- creates the query pipeline.
191
-
192
- Returns
193
- -------
194
- Pipeline
195
- The RAG pipeline containing the components to process the document and generate
196
- the answer to the user query. It is enough to import and load this function for the chat application.
197
- You can run the pipeline with the `pipeline.run()` method.
198
- If a component needs input or arguments, you can pass them as a dictionary to the `run()` method.
199
- For example:
200
- result = rag.run(
201
- {"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}},
202
- )
203
- For debugging purposes, you can include the outputs for example from the retriever
204
- result = rag.run(
205
- {"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}},
206
- include_outputs_from=["retriever", "llm"],
207
- )
208
- """
209
- # define document_store_settings
210
- document_store_settings = {"embedding_similarity_function": "cosine"}
211
-
212
- # load the document store
213
- document_store = load_document_store(document_store_settings)
214
-
215
- # process the document and write it to the document store
216
- document_pipeline = process_document(document_store=document_store)
217
-
218
- # run the document pipeline
219
- document_pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}})
220
-
221
- # initialize the generator
222
- llm_provider = init_generator()
223
-
224
- # create the query pipeline
225
- query_pipeline = get_query_pipeline(
226
- document_store=document_store, generator=llm_provider
227
- )
228
-
229
- return query_pipeline
 
 
1
+ "This file contains the implementation of the RAG pipeline."
2
+
3
+ from pathlib import Path
4
+
5
+ from haystack import Pipeline
6
+ from haystack.components.builders import PromptBuilder
7
+ from haystack.components.converters import MarkdownToDocument
8
+ from haystack.components.embedders import (
9
+ SentenceTransformersDocumentEmbedder,
10
+ SentenceTransformersTextEmbedder,
11
+ )
12
+ from haystack.components.generators import HuggingFaceAPIGenerator
13
+ from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
14
+ from haystack.components.retrievers import InMemoryEmbeddingRetriever
15
+ from haystack.components.writers import DocumentWriter
16
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
17
+ from haystack.utils import Secret
18
+
19
+ # Define the paths to the document and the model for embedding the documents and the user query
20
+ DOCUMENT_PATH = Path("gender_document.md")
21
+ EMBEDDING_MODEL = "all-MiniLM-L6-v2"
22
+
23
+
24
def process_document(document_store: InMemoryDocumentStore) -> Pipeline:
    """Build the indexing pipeline that parses, cleans, splits, embeds and
    writes the document into the given document store.

    It consists of the following components:
    - MarkdownToDocument: converts the markdown file to a document (https://docs.haystack.deepset.ai/docs/markdowntodocument)
    - DocumentCleaner: cleans the document (https://docs.haystack.deepset.ai/docs/documentcleaner)
    - DocumentSplitter: splits the document into chunks (https://docs.haystack.deepset.ai/docs/documentsplitter)
    - SentenceTransformersDocumentEmbedder: embeds the documents, more precisely the chunks (https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder)
    - DocumentWriter: writes the embedded chunks to the document store (https://docs.haystack.deepset.ai/docs/documentwriter)

    Parameters
    ----------
    document_store : InMemoryDocumentStore
        The document store where the processed document should be stored.

    Returns
    -------
    Pipeline
        The pipeline containing the components to parse, clean, split, embed and
        write the document to the document store.
        To run the pipeline, use the `pipeline.run()` method. If a component needs
        input or arguments, pass them as a dictionary to `run()`.
        For example: `pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}})`.
    """
    # initialize the pipeline
    pipeline = Pipeline()

    # Add the components. If you want to add more components or change the
    # settings of the existing ones, do it here.
    # MarkdownToDocument
    pipeline.add_component("converter", MarkdownToDocument())

    # DocumentCleaner
    pipeline.add_component("cleaner", DocumentCleaner())

    # DocumentSplitter: ~300-word chunks, without cutting sentences in half
    pipeline.add_component(
        "splitter",
        DocumentSplitter(
            split_by="word", split_length=300, respect_sentence_boundary=True
        ),
    )

    # DocumentWriter
    pipeline.add_component("writer", DocumentWriter(document_store=document_store))

    # SentenceTransformersDocumentEmbedder
    pipeline.add_component(
        "embedder",
        SentenceTransformersDocumentEmbedder(
            EMBEDDING_MODEL,
        ),
    )

    # connect the components: converter -> cleaner -> splitter -> embedder -> writer
    pipeline.connect("converter", "cleaner")
    pipeline.connect("cleaner", "splitter")
    pipeline.connect("splitter", "embedder")
    pipeline.connect("embedder", "writer")
    return pipeline
83
+
84
+
85
def load_document_store(document_store_settings: dict) -> InMemoryDocumentStore:
    """Create an in-memory document store with the given settings.

    Parameters
    ----------
    document_store_settings : dict
        The settings for the document store, passed as keyword arguments to
        `InMemoryDocumentStore`. Available settings:
        https://docs.haystack.deepset.ai/docs/inmemorydocumentstore

    Returns
    -------
    InMemoryDocumentStore
        A freshly created, empty document store configured with the given settings.
    """
    document_store = InMemoryDocumentStore(**document_store_settings)
    return document_store
101
+
102
+
103
def get_query_pipeline(
    document_store: InMemoryDocumentStore, generator: HuggingFaceAPIGenerator
) -> Pipeline:
    """
    Create the query (RAG) pipeline, containing the following components:
    - SentenceTransformersTextEmbedder: Embeds the user query (https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder)
    - InMemoryEmbeddingRetriever: Retrieves the most similar documents to the user query (https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever)
    - PromptBuilder: Builds the prompt for the generator (https://docs.haystack.deepset.ai/docs/promptbuilder)
    - HuggingFaceAPIGenerator: Generates the answer to the user query (https://docs.haystack.deepset.ai/docs/huggingfaceapigenerator)

    Parameters
    ----------
    document_store : InMemoryDocumentStore
        The document store where the documents are stored.
    generator : HuggingFaceAPIGenerator
        The generator (LLM provider) that produces the answer to the user query.

    Returns
    -------
    Pipeline
        The query pipeline containing the components to embed the user query,
        retrieve the most similar documents, build the prompt and generate the answer.
    """

    # initialize the query pipeline
    query_pipeline = Pipeline()

    # add the components to the query pipeline
    # SentenceTransformersTextEmbedder
    query_pipeline.add_component(
        "text_embedder", SentenceTransformersTextEmbedder(EMBEDDING_MODEL)
    )

    # InMemoryEmbeddingRetriever: return the 10 most similar chunks
    query_pipeline.add_component(
        "retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=10)
    )

    # template for the PromptBuilder
    template = """
    Given the following information, answer the question. If the information is insufficient, answer with "Answer is not possible". Please do not provide any additional information or ask clarifying questions.

    Context:
    {% for document in documents %}
        {{ document.content }}
    {% endfor %}

    Question: {{ query }}?
    """

    # PromptBuilder
    query_pipeline.add_component("prompt_builder", PromptBuilder(template=template))

    # HuggingFaceAPIGenerator
    query_pipeline.add_component("llm", generator)

    # connect the components
    query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
    query_pipeline.connect("retriever", "prompt_builder.documents")
    query_pipeline.connect("prompt_builder", "llm")
    return query_pipeline
163
+
164
+
165
def init_generator() -> HuggingFaceAPIGenerator:
    """Initialize the HuggingFaceAPIGenerator with the given settings.

    You can find the available models here:
    https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending
    Please note that you need to provide a valid token to use the HuggingFaceAPIGenerator.
    For testing purposes, you can hardcode the token in the script with
    `Secret.from_token("...")`. For deployment on Hugging Face Spaces, please save
    the token as a secret (Settings -> Secrets) and load it with
    `Secret.from_env_var("your_token_name")`.

    Returns
    -------
    HuggingFaceAPIGenerator
        A generator backed by the serverless inference API, using the
        zephyr-7b-beta model and reading the API token from the `hftoken`
        environment variable.
    """

    # initialize the HuggingFaceAPIGenerator
    llm_provider = HuggingFaceAPIGenerator(
        api_type="serverless_inference_api",
        # "Question" is a stop word so the model does not hallucinate follow-up questions
        api_params={"model": "HuggingFaceH4/zephyr-7b-beta", "stop": ["Question"]},
        token=Secret.from_env_var("hftoken"),
    )
    return llm_provider
186
+
187
+
188
def rag_pipeline() -> Pipeline:
    """Assemble the complete RAG pipeline.

    Loads the document store, indexes the document, initializes the generator
    and builds the query pipeline. Importing and calling this single function
    is enough for the chat application.

    Returns
    -------
    Pipeline
        The query pipeline that retrieves context and generates the answer.
        Run it with the `run()` method, passing per-component inputs as a dict:

            result = rag.run(
                {"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}},
            )

        For debugging you can additionally capture intermediate outputs, e.g.:

            result = rag.run(
                {"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}},
                include_outputs_from=["retriever", "llm"],
            )
    """
    # Document store configured for cosine similarity on the embeddings.
    store_settings = {"embedding_similarity_function": "cosine"}
    store = load_document_store(store_settings)

    # Index the source document: parse, clean, split, embed, write.
    indexing_pipeline = process_document(document_store=store)
    indexing_pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}})

    # Build the query side on top of the populated store.
    generator = init_generator()
    return get_query_pipeline(document_store=store, generator=generator)