|
"This file contains the implementation of the RAG pipeline." |
|
|
|
from pathlib import Path |
|
|
|
from haystack import Pipeline |
|
from haystack.components.builders import PromptBuilder |
|
from haystack.components.converters import MarkdownToDocument |
|
from haystack.components.embedders import ( |
|
SentenceTransformersDocumentEmbedder, |
|
SentenceTransformersTextEmbedder, |
|
) |
|
from haystack.components.generators import HuggingFaceAPIGenerator |
|
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter |
|
from haystack.components.retrievers import InMemoryEmbeddingRetriever |
|
from haystack.components.writers import DocumentWriter |
|
from haystack.document_stores.in_memory import InMemoryDocumentStore |
|
from haystack.utils import Secret |
|
|
|
|
|
DOCUMENT_PATH = Path("gender_document.md") |
|
EMBEDDING_MODEL = "all-MiniLM-L6-v2" |
|
|
|
|
|
def process_document(document_store: InMemoryDocumentStore) -> Pipeline: |
|
"""This function processes the document and stores it in the document store. |
|
    It consists of the following components:
|
    - MarkdownToDocument: Converts the markdown file to a document (https://docs.haystack.deepset.ai/docs/markdowntodocument)

    - DocumentCleaner: Cleans the document (https://docs.haystack.deepset.ai/docs/documentcleaner)

    - DocumentSplitter: Splits the document into chunks (https://docs.haystack.deepset.ai/docs/documentsplitter)

    - SentenceTransformersDocumentEmbedder: Embeds the chunks produced by the splitter (https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder)

    - DocumentWriter: Writes the embedded chunks to the document store (https://docs.haystack.deepset.ai/docs/documentwriter)
|
|
|
|
|
Parameters |
|
---------- |
|
document_store : InMemoryDocumentStore |
|
The document store where the processed document should be stored. |
|
|
|
Returns |
|
------- |
|
Pipeline |
|
The pipeline containing the components to parse, clean, split, embed and write the document to the document store. |
|
To run the pipeline, you can use the `pipeline.run()` method. If a component needs input or arguments, you can pass them as a dictionary to the `run()` method. |
|
For example: `pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}})`. |
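
    Examples
    --------
    A minimal indexing sketch (assumes `gender_document.md` exists at DOCUMENT_PATH):

    >>> store = InMemoryDocumentStore()
    >>> indexing = process_document(store)
    >>> indexing.run({"converter": {"sources": [DOCUMENT_PATH]}})
    >>> store.count_documents()  # number of embedded chunks now in the store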
|
""" |
|
|
|
|
|
pipeline = Pipeline() |
|
|
|
|
|
|
|
|
|
pipeline.add_component("converter", MarkdownToDocument()) |
|
|
|
|
|
pipeline.add_component("cleaner", DocumentCleaner()) |
|
|
|
|
|
pipeline.add_component( |
|
"splitter", |
|
DocumentSplitter( |
|
split_by="word", split_length=300, respect_sentence_boundary=True |
|
), |
|
) |
|
|
|
|
|
pipeline.add_component("writer", DocumentWriter(document_store=document_store)) |
|
|
|
|
|
    pipeline.add_component(
        "embedder",
        SentenceTransformersDocumentEmbedder(model=EMBEDDING_MODEL),
    )
|
|
|
|
|
pipeline.connect("converter", "cleaner") |
|
pipeline.connect("cleaner", "splitter") |
|
pipeline.connect("splitter", "embedder") |
|
pipeline.connect("embedder", "writer") |
|
return pipeline |
|
|
|
|
|
def load_document_store(document_store_settings: dict) -> InMemoryDocumentStore: |
|
"""This function loads the document store with the given settings. |
|
|
|
Parameters |
|
---------- |
|
document_store_settings : dict |
|
The settings for the document store. The settings are passed as a dictionary. |
|
You can find the available settings here: https://docs.haystack.deepset.ai/docs/inmemorydocumentstore |
|
|
|
Returns |
|
------- |
|
InMemoryDocumentStore |
|
        The initialized in-memory document store.
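
    Examples
    --------
    A minimal sketch using this module's cosine-similarity setting:

    >>> store = load_document_store({"embedding_similarity_function": "cosine"})
    >>> store.count_documents()
    0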
|
""" |
|
document_store = InMemoryDocumentStore(**document_store_settings) |
|
return document_store |
|
|
|
|
|
def get_query_pipeline( |
|
document_store: InMemoryDocumentStore, generator: HuggingFaceAPIGenerator |
|
) -> Pipeline: |
|
""" |
|
This function creates a query pipeline that contains the following components: |
|
- SentenceTransformersTextEmbedder: Embeds the user query (https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder) |
|
- InMemoryEmbeddingRetriever: Retrieves the most similar documents to the user query (https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever) |
|
- PromptBuilder: Builds the prompt for the generator (https://docs.haystack.deepset.ai/docs/promptbuilder) |
|
- HuggingFaceAPIGenerator: Generates the answer to the user query (https://docs.haystack.deepset.ai/docs/huggingfaceapigenerator) |
|
|
|
Parameters |
|
---------- |
|
document_store : InMemoryDocumentStore |
|
The document store where the documents are stored. |
|
    generator : HuggingFaceAPIGenerator

        The generator that produces the answer to the user query.
|
|
|
Returns |
|
------- |
|
Pipeline |
|
The query pipeline containing the components to embed the user query, retrieve the most similar documents, build the prompt and generate the answer. |
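
    Examples
    --------
    A sketch of a single query; assumes `store` is an already populated
    InMemoryDocumentStore and a valid Hugging Face token is configured:

    >>> rag = get_query_pipeline(document_store=store, generator=init_generator())
    >>> question = "What are the goals of the gender strategy?"
    >>> result = rag.run(
    ...     {"text_embedder": {"text": question}, "prompt_builder": {"query": question}}
    ... )
    >>> print(result["llm"]["replies"][0])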
|
""" |
|
|
|
|
|
query_pipeline = Pipeline() |
|
|
|
|
|
|
|
    query_pipeline.add_component(
        "text_embedder", SentenceTransformersTextEmbedder(model=EMBEDDING_MODEL)
    )
|
|
|
|
|
query_pipeline.add_component( |
|
"retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=10) |
|
) |
|
|
|
|
|
template = """ |
|
You are an expert on gender strategies and sustainable development. Your task is to provide detailed, well-structured, and informative answers based on the given context. |
|
|
|
### Instructions: |
|
- Provide a **comprehensive** and **well-structured** response. |
|
- Include **specific details, key concepts, and relevant examples** where applicable. |
|
- Explain **how and why** aspects of the Gender Strategy are relevant to the given question. |
|
- If necessary, cite relevant sections from the provided context. |
|
- If the available information is insufficient, state clearly: **"The available information does not provide a full answer."** However, summarize the most relevant points that can still help address the question. |
|
|
|
### Context: |
|
{% for document in documents %} |
|
{{ document.content }} |
|
{% endfor %} |
|
|
|
### Question: |
|
{{ query }} |
|
|
|
### Answer: |
|
""" |
|
|
|
|
|
|
|
query_pipeline.add_component("prompt_builder", PromptBuilder(template=template)) |
|
|
|
|
|
query_pipeline.add_component("llm", generator) |
|
|
|
|
|
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") |
|
query_pipeline.connect("retriever", "prompt_builder.documents") |
|
query_pipeline.connect("prompt_builder", "llm") |
|
return query_pipeline |
|
|
|
|
|
def init_generator() -> HuggingFaceAPIGenerator: |
|
"""This function initializes the HuggingFaceAPIGenerator with the given settings. |
|
You can find the available models here: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending |
|
Please note that you need to provide a valid token to use the HuggingFaceAPIGenerator. |
|
For testing purposes, you can hardcode the token in the script. |
|
    For deployment on Hugging Face Spaces, please save the token as a secret (Settings -> Secrets) and load it with `Secret.from_env_var("your_token_name")`.
|
|
|
Returns |
|
------- |
|
HuggingFaceAPIGenerator |
|
        The configured HuggingFaceAPIGenerator instance.
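
    Examples
    --------
    Two ways to construct the token (the env-var name `hftoken` is this
    project's choice, not a Haystack default):

    >>> token = Secret.from_env_var("hftoken")  # read from the environment / Space secret
    >>> token = Secret.from_token("hf_...")  # hardcoded token, for local testing only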
|
""" |
|
|
|
|
|
    llm_provider = HuggingFaceAPIGenerator(
        api_type="serverless_inference_api",
        api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
        token=Secret.from_env_var("hftoken"),
        # Stop sequences are not read from api_params; they belong in `stop_words`.
        stop_words=["Question"],
    )
|
return llm_provider |
|
|
|
|
|
def rag_pipeline() -> Pipeline: |
|
"""This function wraps the whole RAG pipeline. |
|
It loads the document store, processes the document, initializes the generator and |
|
creates the query pipeline. |
|
|
|
Returns |
|
------- |
|
Pipeline |
|
The RAG pipeline containing the components to process the document and generate |
|
        the answer to the user query. Importing and calling this function is all the chat application needs.
|
You can run the pipeline with the `pipeline.run()` method. |
|
If a component needs input or arguments, you can pass them as a dictionary to the `run()` method. |
|
For example: |
|
result = rag.run( |
|
{"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}}, |
|
) |
|
    For debugging purposes, you can additionally include the outputs of individual components, for example the retriever:
|
result = rag.run( |
|
{"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}}, |
|
include_outputs_from=["retriever", "llm"], |
|
) |
|
""" |
|
|
|
document_store_settings = {"embedding_similarity_function": "cosine"} |
|
|
|
|
|
document_store = load_document_store(document_store_settings) |
|
|
|
|
|
document_pipeline = process_document(document_store=document_store) |
|
|
|
|
|
document_pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}}) |
|
|
|
|
|
llm_provider = init_generator() |
|
|
|
|
|
query_pipeline = get_query_pipeline( |
|
document_store=document_store, generator=llm_provider |
|
) |
|
|
|
return query_pipeline |
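

# Minimal smoke test when this module is run as a script. A sketch that assumes
# `gender_document.md` is present and the `hftoken` environment variable is set.
if __name__ == "__main__":
    rag = rag_pipeline()
    question = "What are the main goals of the gender strategy?"
    result = rag.run(
        {"text_embedder": {"text": question}, "prompt_builder": {"query": question}}
    )
    print(result["llm"]["replies"][0])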
|
|