Update rag.py
Browse files
rag.py
CHANGED
@@ -1,229 +1,230 @@
|
|
1 |
-
"This file contains the implementation of the RAG pipeline."
|
2 |
-
|
3 |
-
from pathlib import Path
|
4 |
-
|
5 |
-
from haystack import Pipeline
|
6 |
-
from haystack.components.builders import PromptBuilder
|
7 |
-
from haystack.components.converters import MarkdownToDocument
|
8 |
-
from haystack.components.embedders import (
|
9 |
-
SentenceTransformersDocumentEmbedder,
|
10 |
-
SentenceTransformersTextEmbedder,
|
11 |
-
)
|
12 |
-
from haystack.components.generators import HuggingFaceAPIGenerator
|
13 |
-
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
|
14 |
-
from haystack.components.retrievers import InMemoryEmbeddingRetriever
|
15 |
-
from haystack.components.writers import DocumentWriter
|
16 |
-
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
17 |
-
from haystack.utils import Secret
|
18 |
-
|
19 |
-
# Define the paths to the document and the model for embedding the documents and the user query
|
20 |
-
DOCUMENT_PATH = Path("gender_document.md")
|
21 |
-
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
|
22 |
-
|
23 |
-
|
24 |
-
def process_document(document_store: InMemoryDocumentStore) -> Pipeline:
|
25 |
-
"""This function processes the document and stores it in the document store.
|
26 |
-
It contains of the following components:
|
27 |
-
- MarkdownToDocument: Converts the markdown file to a document (https://docs.haystack.deepset.ai/docs/markdowntodocument)
|
28 |
-
- DocumentCleaner: Cleans the document (https://docs.haystack.deepset.ai/docs/documentcleaner)
|
29 |
-
- DocumentSplitter: Splits the document into chunks (https://docs.haystack.deepset.ai/docs/documentsplitter)
|
30 |
-
- DocumentWriter: Writes the document to the document store (https://docs.haystack.deepset.ai/docs/documentwriter)
|
31 |
-
- SentenceTransformersDocumentEmbedder: Embeds the documents, more precisely the chunks (https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder)
|
32 |
-
|
33 |
-
|
34 |
-
Parameters
|
35 |
-
----------
|
36 |
-
document_store : InMemoryDocumentStore
|
37 |
-
The document store where the processed document should be stored.
|
38 |
-
|
39 |
-
Returns
|
40 |
-
-------
|
41 |
-
Pipeline
|
42 |
-
The pipeline containing the components to parse, clean, split, embed and write the document to the document store.
|
43 |
-
To run the pipeline, you can use the `pipeline.run()` method. If a component needs input or arguments, you can pass them as a dictionary to the `run()` method.
|
44 |
-
For example: `pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}})`.
|
45 |
-
"""
|
46 |
-
|
47 |
-
# initialize the pipeline
|
48 |
-
pipeline = Pipeline()
|
49 |
-
|
50 |
-
# add the components to the pipeline. If you want to add more components, you can do it here.
|
51 |
-
# If you want to the settings of the components, you can do it here.
|
52 |
-
# MarkdownToDocument
|
53 |
-
pipeline.add_component("converter", MarkdownToDocument())
|
54 |
-
|
55 |
-
# DocumentCleaner
|
56 |
-
pipeline.add_component("cleaner", DocumentCleaner())
|
57 |
-
|
58 |
-
# DocumentSplitter
|
59 |
-
pipeline.add_component(
|
60 |
-
"splitter",
|
61 |
-
DocumentSplitter(
|
62 |
-
split_by="word", split_length=300, respect_sentence_boundary=True
|
63 |
-
),
|
64 |
-
)
|
65 |
-
|
66 |
-
# DocumentWriter
|
67 |
-
pipeline.add_component("writer", DocumentWriter(document_store=document_store))
|
68 |
-
|
69 |
-
# SentenceTransformersDocumentEmbedder
|
70 |
-
pipeline.add_component(
|
71 |
-
"embedder",
|
72 |
-
SentenceTransformersDocumentEmbedder(
|
73 |
-
EMBEDDING_MODEL,
|
74 |
-
),
|
75 |
-
)
|
76 |
-
|
77 |
-
# connect the components
|
78 |
-
pipeline.connect("converter", "cleaner")
|
79 |
-
pipeline.connect("cleaner", "splitter")
|
80 |
-
pipeline.connect("splitter", "embedder")
|
81 |
-
pipeline.connect("embedder", "writer")
|
82 |
-
return pipeline
|
83 |
-
|
84 |
-
|
85 |
-
def load_document_store(document_store_settings: dict) -> InMemoryDocumentStore:
|
86 |
-
"""This function loads the document store with the given settings.
|
87 |
-
|
88 |
-
Parameters
|
89 |
-
----------
|
90 |
-
document_store_settings : dict
|
91 |
-
The settings for the document store. The settings are passed as a dictionary.
|
92 |
-
You can find the available settings here: https://docs.haystack.deepset.ai/docs/inmemorydocumentstore
|
93 |
-
|
94 |
-
Returns
|
95 |
-
-------
|
96 |
-
InMemoryDocumentStore
|
97 |
-
_description_
|
98 |
-
"""
|
99 |
-
document_store = InMemoryDocumentStore(**document_store_settings)
|
100 |
-
return document_store
|
101 |
-
|
102 |
-
|
103 |
-
def get_query_pipeline(
|
104 |
-
document_store: InMemoryDocumentStore, generator: HuggingFaceAPIGenerator
|
105 |
-
) -> Pipeline:
|
106 |
-
"""
|
107 |
-
This function creates a query pipeline that contains the following components:
|
108 |
-
- SentenceTransformersTextEmbedder: Embeds the user query (https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder)
|
109 |
-
- InMemoryEmbeddingRetriever: Retrieves the most similar documents to the user query (https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever)
|
110 |
-
- PromptBuilder: Builds the prompt for the generator (https://docs.haystack.deepset.ai/docs/promptbuilder)
|
111 |
-
- HuggingFaceAPIGenerator: Generates the answer to the user query (https://docs.haystack.deepset.ai/docs/huggingfaceapigenerator)
|
112 |
-
|
113 |
-
Parameters
|
114 |
-
----------
|
115 |
-
document_store : InMemoryDocumentStore
|
116 |
-
The document store where the documents are stored.
|
117 |
-
llm_provider : HuggingFaceAPIGenerator
|
118 |
-
The llm_provider that generates the answer to the user query.
|
119 |
-
|
120 |
-
Returns
|
121 |
-
-------
|
122 |
-
Pipeline
|
123 |
-
The query pipeline containing the components to embed the user query, retrieve the most similar documents, build the prompt and generate the answer.
|
124 |
-
"""
|
125 |
-
|
126 |
-
# initialize the query pipeline
|
127 |
-
query_pipeline = Pipeline()
|
128 |
-
|
129 |
-
# add the components to the query pipeline
|
130 |
-
# SentenceTransformersTextEmbedder
|
131 |
-
query_pipeline.add_component(
|
132 |
-
"text_embedder", SentenceTransformersTextEmbedder(EMBEDDING_MODEL)
|
133 |
-
)
|
134 |
-
|
135 |
-
# InMemoryEmbeddingRetriever
|
136 |
-
query_pipeline.add_component(
|
137 |
-
"retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=10)
|
138 |
-
)
|
139 |
-
|
140 |
-
# template for the PromptBuilder
|
141 |
-
template = """
|
142 |
-
Given the following information, answer the question. If the information is insufficient, answer with "Answer is not possible". Please do not provide any additional information and ask clarifying questions.
|
143 |
-
|
144 |
-
Context:
|
145 |
-
{% for document in documents %}
|
146 |
-
{{ document.content }}
|
147 |
-
{% endfor %}
|
148 |
-
|
149 |
-
Question: {{ query }}?
|
150 |
-
"""
|
151 |
-
|
152 |
-
# PromptBuilder
|
153 |
-
query_pipeline.add_component("prompt_builder", PromptBuilder(template=template))
|
154 |
-
|
155 |
-
# HuggingFaceAPIGenerator
|
156 |
-
query_pipeline.add_component("llm", generator)
|
157 |
-
|
158 |
-
# connect the components
|
159 |
-
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
|
160 |
-
query_pipeline.connect("retriever", "prompt_builder.documents")
|
161 |
-
query_pipeline.connect("prompt_builder", "llm")
|
162 |
-
return query_pipeline
|
163 |
-
|
164 |
-
|
165 |
-
def init_generator() -> HuggingFaceAPIGenerator:
|
166 |
-
"""This function initializes the HuggingFaceAPIGenerator with the given settings.
|
167 |
-
You can find the available models here: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending
|
168 |
-
Please note that you need to provide a valid token to use the HuggingFaceAPIGenerator.
|
169 |
-
For testing purposes, you can hardcode the token in the script.
|
170 |
-
For deployment on Hugging Face Spaces, please safe the token as a secret (Settings -> Secrets) and load it with `Secret.from_env_var("your_token_name")`.
|
171 |
-
|
172 |
-
Returns
|
173 |
-
-------
|
174 |
-
HuggingFaceAPIGenerator
|
175 |
-
_description_
|
176 |
-
"""
|
177 |
-
|
178 |
-
# initialize the HuggingFaceAPIGenerator
|
179 |
-
llm_provider = HuggingFaceAPIGenerator(
|
180 |
-
api_type="serverless_inference_api",
|
181 |
-
api_params={"model": "HuggingFaceH4/zephyr-7b-beta", "stop": ["Question"]},
|
182 |
-
token=Secret.from_token(""),
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
|
|
|
1 |
+
"This file contains the implementation of the RAG pipeline."
|
2 |
+
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
from haystack import Pipeline
|
6 |
+
from haystack.components.builders import PromptBuilder
|
7 |
+
from haystack.components.converters import MarkdownToDocument
|
8 |
+
from haystack.components.embedders import (
|
9 |
+
SentenceTransformersDocumentEmbedder,
|
10 |
+
SentenceTransformersTextEmbedder,
|
11 |
+
)
|
12 |
+
from haystack.components.generators import HuggingFaceAPIGenerator
|
13 |
+
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
|
14 |
+
from haystack.components.retrievers import InMemoryEmbeddingRetriever
|
15 |
+
from haystack.components.writers import DocumentWriter
|
16 |
+
from haystack.document_stores.in_memory import InMemoryDocumentStore
|
17 |
+
from haystack.utils import Secret
|
18 |
+
|
19 |
+
# Define the paths to the document and the model for embedding the documents and the user query
|
20 |
+
DOCUMENT_PATH = Path("gender_document.md")
|
21 |
+
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
|
22 |
+
|
23 |
+
|
24 |
+
def process_document(document_store: InMemoryDocumentStore) -> Pipeline:
    """Build the indexing pipeline that processes the document and stores it in the document store.

    It consists of the following components:
    - MarkdownToDocument: Converts the markdown file to a document (https://docs.haystack.deepset.ai/docs/markdowntodocument)
    - DocumentCleaner: Cleans the document (https://docs.haystack.deepset.ai/docs/documentcleaner)
    - DocumentSplitter: Splits the document into chunks (https://docs.haystack.deepset.ai/docs/documentsplitter)
    - DocumentWriter: Writes the document to the document store (https://docs.haystack.deepset.ai/docs/documentwriter)
    - SentenceTransformersDocumentEmbedder: Embeds the documents, more precisely the chunks (https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder)

    Parameters
    ----------
    document_store : InMemoryDocumentStore
        The document store where the processed document should be stored.

    Returns
    -------
    Pipeline
        The pipeline containing the components to parse, clean, split, embed and write the document to the document store.
        To run the pipeline, you can use the `pipeline.run()` method. If a component needs input or arguments, you can pass them as a dictionary to the `run()` method.
        For example: `pipeline.run({"converter": {"sources": [DOCUMENT_PATH]}})`.
    """
    # initialize the pipeline
    pipeline = Pipeline()

    # add the components to the pipeline. If you want to add more components, you can do it here.
    # If you want to change the settings of the components, you can do it here.
    # MarkdownToDocument
    pipeline.add_component("converter", MarkdownToDocument())

    # DocumentCleaner
    pipeline.add_component("cleaner", DocumentCleaner())

    # DocumentSplitter: ~300-word chunks, not cutting through sentences
    pipeline.add_component(
        "splitter",
        DocumentSplitter(
            split_by="word", split_length=300, respect_sentence_boundary=True
        ),
    )

    # DocumentWriter
    pipeline.add_component("writer", DocumentWriter(document_store=document_store))

    # SentenceTransformersDocumentEmbedder
    pipeline.add_component(
        "embedder",
        SentenceTransformersDocumentEmbedder(
            model=EMBEDDING_MODEL,
        ),
    )

    # connect the components: convert -> clean -> split -> embed -> write
    pipeline.connect("converter", "cleaner")
    pipeline.connect("cleaner", "splitter")
    pipeline.connect("splitter", "embedder")
    pipeline.connect("embedder", "writer")
    return pipeline
|
83 |
+
|
84 |
+
|
85 |
+
def load_document_store(document_store_settings: dict) -> InMemoryDocumentStore:
    """Load the document store with the given settings.

    Parameters
    ----------
    document_store_settings : dict
        The settings for the document store. The settings are passed as a dictionary.
        You can find the available settings here: https://docs.haystack.deepset.ai/docs/inmemorydocumentstore

    Returns
    -------
    InMemoryDocumentStore
        An in-memory document store configured with the given settings.
    """
    document_store = InMemoryDocumentStore(**document_store_settings)
    return document_store
|
101 |
+
|
102 |
+
|
103 |
+
def get_query_pipeline(
    document_store: InMemoryDocumentStore, generator: HuggingFaceAPIGenerator
) -> Pipeline:
    """Create a query pipeline that contains the following components:

    - SentenceTransformersTextEmbedder: Embeds the user query (https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder)
    - InMemoryEmbeddingRetriever: Retrieves the most similar documents to the user query (https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever)
    - PromptBuilder: Builds the prompt for the generator (https://docs.haystack.deepset.ai/docs/promptbuilder)
    - HuggingFaceAPIGenerator: Generates the answer to the user query (https://docs.haystack.deepset.ai/docs/huggingfaceapigenerator)

    Parameters
    ----------
    document_store : InMemoryDocumentStore
        The document store where the documents are stored.
    generator : HuggingFaceAPIGenerator
        The generator that produces the answer to the user query.

    Returns
    -------
    Pipeline
        The query pipeline containing the components to embed the user query, retrieve the most similar documents, build the prompt and generate the answer.
    """
    # initialize the query pipeline
    query_pipeline = Pipeline()

    # add the components to the query pipeline
    # SentenceTransformersTextEmbedder: must use the same model as the document embedder
    query_pipeline.add_component(
        "text_embedder", SentenceTransformersTextEmbedder(EMBEDDING_MODEL)
    )

    # InMemoryEmbeddingRetriever: returns the 10 most similar chunks
    query_pipeline.add_component(
        "retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=10)
    )

    # template for the PromptBuilder (Jinja2 syntax; `documents` comes from the retriever, `query` from the caller)
    template = """
    Given the following information, answer the question. If the information is insufficient, answer with "Answer is not possible". Please do not provide any additional information and ask clarifying questions.

    Context:
    {% for document in documents %}
    {{ document.content }}
    {% endfor %}

    Question: {{ query }}?
    """

    # PromptBuilder
    query_pipeline.add_component("prompt_builder", PromptBuilder(template=template))

    # HuggingFaceAPIGenerator
    query_pipeline.add_component("llm", generator)

    # connect the components: embed query -> retrieve -> build prompt -> generate
    query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
    query_pipeline.connect("retriever", "prompt_builder.documents")
    query_pipeline.connect("prompt_builder", "llm")
    return query_pipeline
|
163 |
+
|
164 |
+
|
165 |
+
def init_generator() -> HuggingFaceAPIGenerator:
    """Initialize the HuggingFaceAPIGenerator with the given settings.

    You can find the available models here: https://huggingface.co/models?inference=warm&pipeline_tag=text-generation&sort=trending
    Please note that you need to provide a valid token to use the HuggingFaceAPIGenerator.
    For testing purposes, you can hardcode the token in the script with `Secret.from_token(...)`.
    For deployment on Hugging Face Spaces, please save the token as a secret (Settings -> Secrets) and load it with `Secret.from_env_var("your_token_name")`.

    Returns
    -------
    HuggingFaceAPIGenerator
        The generator configured for the serverless Inference API; it stops generating at "Question"
        so the model does not invent follow-up questions.
    """
    # initialize the HuggingFaceAPIGenerator; the token is read from the "hftoken" environment variable
    llm_provider = HuggingFaceAPIGenerator(
        api_type="serverless_inference_api",
        api_params={"model": "HuggingFaceH4/zephyr-7b-beta", "stop": ["Question"]},
        token=Secret.from_env_var("hftoken"),
    )
    return llm_provider
|
186 |
+
|
187 |
+
|
188 |
+
def rag_pipeline() -> Pipeline:
    """Wrap the whole RAG pipeline.

    Loads the document store, processes the document, initializes the generator and
    creates the query pipeline.

    Returns
    -------
    Pipeline
        The RAG pipeline containing the components to process the document and generate
        the answer to the user query. It is enough to import and load this function for the chat application.
        You can run the pipeline with the `pipeline.run()` method.
        If a component needs input or arguments, you can pass them as a dictionary to the `run()` method.
        For example:
        result = rag.run(
            {"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}},
        )
        For debugging purposes, you can include the outputs for example from the retriever:
        result = rag.run(
            {"prompt_builder": {"query": prompt}, "text_embedder": {"text": prompt}},
            include_outputs_from=["retriever", "llm"],
        )
    """
    # document store configured to compare embeddings by cosine similarity
    store = load_document_store({"embedding_similarity_function": "cosine"})

    # build the indexing pipeline and run it once to fill the store
    indexing = process_document(document_store=store)
    indexing.run({"converter": {"sources": [DOCUMENT_PATH]}})

    # assemble the query pipeline around a freshly initialized generator
    return get_query_pipeline(document_store=store, generator=init_generator())
|