from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.routers import ConditionalRouter
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever

import rsa
from cryptography.fernet import Fernet
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
import gradio as gr


# Model used to embed the user query before retrieval.
embedding_model = "Alibaba-NLP/gte-multilingual-base"

# Chroma vector store persisted on disk under this path.
document_store = ChromaDocumentStore(
    persist_path="vstore_4012"
)

##################################
####### Answering pipeline #######
##################################

# Canned reply returned by the router when the query is not judged on-topic.
no_answer_message = (
    "I'm not allowed to answer this question. Please ask something related to "
    "APIs access in accordance DSA’s transparency and data-sharing provisions. "
    "Is there anything else I can do for you? "
)

# Prompt for the first LLM call: a YES/NO relevance classifier.
# Template variables: `query` (current message) and `user_history`
# (list of dicts with a "content" key; only the last two are rendered).
relevance_prompt_template = """
Classify whether this user is asking for something related to social media APIs,
the Digital Services Act (DSA), or any topic related to online platforms’ compliance
with legal and data-sharing frameworks.

Relevant topics include:
- Social media API access
- Data transparency
- Compliance with DSA provisions
- Online platform regulations

Here is their message:
{{query}}

Here are the two previous messages. ONLY refer to these if the above message refers previous ones.
{% for message in user_history[-2:] %}
    * {{message["content"]}}
{% endfor %}

Instructions:
- Respond with “YES” if the query pertains to any of the relevant topics listed above and not mixed with off-topic content.
- Respond with “NO” if the query is off-topic and does not relate to the topics listed above.

Examples:
- Query: "How does the DSA affect API usage?"
- Response: "YES"

- Query: "How to make a pancake with APIs?"
- Response: "NO"
"""

# Routing on the classifier reply. The two conditions are complementary so
# that exactly one route always matches: a reply containing "YES" forwards the
# query to retrieval, and every other reply (an explicit "NO", but also an
# empty, truncated or free-form answer) yields the refusal message instead of
# leaving the router with no matching route.
routes = [
    {
        "condition": "{{'YES' in replies[0]}}",
        "output": "{{query}}",
        "output_name": "query",
        "output_type": str,
    },
    {
        "condition": "{{'YES' not in replies[0]}}",
        "output": no_answer_message,
        "output_name": "no_answer",
        "output_type": str,
    },
]

# Prompt for the second LLM call: answer the query from retrieved documents.
# Template variables: `conv_history`, `query` and `documents` (objects with a
# `content` attribute).
query_prompt_template = """
Conversation history:
{{conv_history}}

Here is what the user has requested:
{{query}}

Instructions:
- Craft a concise, short informative answer to the user's request using the information provided below.
- Synthesize the key points into a seamless response that appears as your own expert knowledge.
- Avoid direct quotes or explicit references to the documents.
- You are directly answering the user's query.

Relevant Information:
{% for document in documents %}
    - {{ document.content }}
{% endfor %}
"""


def setup_generator(model_name: str, api_key_env_var: str = "OPENAI_API_KEY", max_tokens: int = 8192):
    """Build an OpenAIGenerator for the given model.

    Args:
        model_name: Name of the OpenAI model to call.
        api_key_env_var: Environment variable the API key is read from.
        max_tokens: Value passed as ``max_tokens`` in the generation kwargs.

    Returns:
        The configured ``OpenAIGenerator`` instance.
    """
    return OpenAIGenerator(
        api_key=Secret.from_env_var(api_key_env_var),
        model=model_name,
        generation_kwargs={"max_tokens": max_tokens},
    )


# `llm` is the relevance classifier: it is only asked for a YES/NO verdict,
# hence the small token budget. `llm2` writes the actual answer.
llm = setup_generator("gpt-4o-mini", max_tokens=30)
llm2 = setup_generator("gpt-4o-mini")

embedder = SentenceTransformersTextEmbedder(
    model=embedding_model,
    trust_remote_code=True,
    progress_bar=False,
)
retriever = ChromaEmbeddingRetriever(document_store)
router = ConditionalRouter(routes=routes)

prompt_builder = PromptBuilder(template=relevance_prompt_template)
prompt_builder2 = PromptBuilder(template=query_prompt_template)

# Pipeline: relevance prompt -> classifier -> router -> (if relevant)
# embedder -> retriever -> answer prompt -> answering LLM.
answer_query = Pipeline()
answer_query.add_component("prompt_builder", prompt_builder)
answer_query.add_component("llm", llm)
answer_query.add_component("router", router)
answer_query.add_component("embedder", embedder)
answer_query.add_component("retriever", retriever)
answer_query.add_component("prompt_builder2", prompt_builder2)
answer_query.add_component("llm2", llm2)

answer_query.connect("prompt_builder", "llm")
answer_query.connect("llm", "router")
# Only the router's "query" output (the on-topic route) feeds retrieval.
answer_query.connect("router.query", "embedder")
answer_query.connect("embedder", "retriever")
answer_query.connect("retriever", "prompt_builder2")
answer_query.connect("prompt_builder2", "llm2")

answer_query.warm_up()

##########################
####### Logging ##########
##########################

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()


def hide_sensitive_info(text: str) -> str:
    """Return *text* after running it through the Presidio analyzer and anonymizer.

    The analysis is run with ``language="en"``.
    """
    findings = analyzer.analyze(
        text=text,
        language="en",
    )
    anonymized = anonymizer.anonymize(
        text=text,
        analyzer_results=findings,
    )
    return anonymized.text


def log_QA(question: str, answer: str) -> None:
    """Print one question/answer exchange to stdout after anonymizing it."""
    exchange = f"User: {question}\nAssistant: {answer}"
    print(hide_sensitive_info(exchange), end="\n\n")


##########################
####### Gradio app #######
##########################

def chat(message: str, history: list) -> str:
    """
    Chat function for Gradio. Uses the pipeline to produce next answer.

    Args:
        message: The user's new message.
        history: Previous turns; each entry is read as a mapping with
            "role" and "content" keys.

    Returns:
        The generated answer, the router's refusal message, or a generic
        error message when the pipeline fails or produces neither.
    """
    # The answering prompt sees the last two turns of either role; the
    # relevance prompt receives the user's own messages only.
    conv_history = "\n\n".join(
        f'{turn["role"]}: {turn["content"]}' for turn in history[-2:]
    )
    user_history = [turn for turn in history if turn["role"] == "user"]

    try:
        results = answer_query.run(
            {
                "user_history": user_history,
                "query": message,
                "conv_history": conv_history,
                "top_k": 3,
            }
        )
    except Exception as exc:
        # UI boundary: a failing pipeline run (API error, router without a
        # matching route, ...) must still give the user an answer. The error
        # is logged, anonymized like every other log line, rather than lost.
        print(
            hide_sensitive_info(f"Pipeline error: {type(exc).__name__}: {exc}"),
            end="\n\n",
        )
        results = {}

    replies = results.get("llm2", {}).get("replies")
    router_output = results.get("router", {})
    if replies:
        answer = replies[0]
    elif "no_answer" in router_output:
        answer = router_output["no_answer"]
    else:
        answer = "Sorry, a mistake occured"

    log_QA(message, answer)
    return answer


examples = [
    "What is Article 40.12 of the Digital Services Act, and how does it help researchers?",
    "How can I start the process of requesting platform data for research?",
    "How do I submit a data access request for Meta’s API under the DSA?",
    "What are the authentication and setup steps for Youtube’s API?",
    "What specific types of data can I access through Snapchat’s API?",
]


if __name__ == "__main__":
    interface = gr.ChatInterface(
        fn=chat,
        type="messages",
        title="40.12 Chatbot",
        description="Ask me anything about social media APIs, the Digital Services Act (DSA), or online platform regulations.",
        examples=examples,
    )
    interface.launch()