|
from haystack.components.generators import OpenAIGenerator |
|
from haystack.utils import Secret |
|
from haystack.components.builders.prompt_builder import PromptBuilder |
|
from haystack.components.routers import ConditionalRouter |
|
from haystack import Pipeline |
|
from haystack.components.embedders import SentenceTransformersTextEmbedder |
|
from haystack_integrations.document_stores.chroma import ChromaDocumentStore |
|
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever |
|
import rsa |
|
from cryptography.fernet import Fernet |
|
from presidio_analyzer import AnalyzerEngine |
|
from presidio_anonymizer import AnonymizerEngine |
|
|
|
import gradio as gr |
|
|
|
# Model name handed to the SentenceTransformers text embedder further down.
embedding_model = "Alibaba-NLP/gte-multilingual-base"

# Chroma document store opened with persist_path "vstore_4012"; it is the
# store the embedding retriever is built on.
document_store = ChromaDocumentStore(persist_path="vstore_4012")
|
|
|
|
|
|
|
|
|
|
|
# Canned refusal emitted by the router's "no_answer" route.
# The text (including each trailing space) is runtime output; keep it verbatim.
no_answer_message = "".join(
    [
        "I'm not allowed to answer this question. ",
        "Please ask something related to APIs access in accordance DSA’s ",
        "transparency and data-sharing provisions. ",
        "Is there anything else I can do for you? ",
    ]
)
|
|
|
# Prompt template for the relevance gate (handed to `prompt_builder`).
# Template variables:
#   query        - the user's current message
#   user_history - earlier user messages; only the last two entries
#                  (`user_history[-2:]`) are rendered, one bullet each
# The model is told to answer "YES" or "NO"; the router conditions then
# inspect the first reply for that verdict.
# NOTE(review): the literal below is runtime text sent to the model and is
# left exactly as found - confirm its line layout against the upstream file
# before reformatting it.
relevance_prompt_template = """ |
|
Classify whether this user is asking for something related to social media APIs, |
|
the Digital Services Act (DSA), or any topic related to online platforms’ compliance |
|
with legal and data-sharing frameworks. |
|
|
|
Relevant topics include: |
|
- Social media API access |
|
- Data transparency |
|
- Compliance with DSA provisions |
|
- Online platform regulations |
|
|
|
Here is their message: |
|
{{query}} |
|
|
|
Here are the two previous messages. ONLY refer to these if the above message refers previous ones. |
|
|
|
{% for message in user_history[-2:] %} |
|
* {{message["content"]}} |
|
|
|
{% endfor %} |
|
|
|
Instructions: |
|
- Respond with “YES” if the query pertains to any of the relevant topics listed above and not mixed with off-topic content. |
|
- Respond with “NO” if the query is off-topic and does not relate to the topics listed above. |
|
|
|
Examples: |
|
- Query: "How does the DSA affect API usage?" |
|
- Response: "YES" |
|
|
|
- Query: "How to make a pancake with APIs?" |
|
- Response: "NO" |
|
|
|
""" |
|
|
|
# Routing table for the ConditionalRouter placed after the relevance
# classifier. Each condition is a Jinja expression evaluated against the
# classifier's `replies`; routes are tried in order and the first match wins.
routes = [
    {
        # The classifier approved the question: pass it on unchanged under
        # the "query" output so retrieval can proceed.
        "condition": "{{'YES' in replies[0]}}",
        "output": "{{query}}",
        "output_name": "query",
        "output_type": str,
    },
    {
        # Fallback route: anything that is not an explicit "YES" is refused.
        # This is the exact complement of the first condition, so a reply
        # that contains neither "YES" nor "NO" (e.g. "Yes." or a truncated
        # answer) still selects a route and fails closed with the refusal
        # message, instead of leaving the router without a matching route.
        "condition": "{{'YES' not in replies[0]}}",
        "output": no_answer_message,
        "output_name": "no_answer",
        "output_type": str,
    },
]
|
|
|
# Prompt template for the answering step (handed to `prompt_builder2`).
# Template variables:
#   conv_history - conversation transcript rendered ahead of the request
#   query        - the user's current request
#   documents    - iterable whose items expose `.content`; each one is
#                  rendered as a bullet under "Relevant Information"
# NOTE(review): the literal below is runtime text sent to the model and is
# left exactly as found - confirm its line layout against the upstream file
# before reformatting it.
query_prompt_template = """ |
|
Conversation history: |
|
{{conv_history}} |
|
|
|
Here is what the user has requested: |
|
{{query}} |
|
|
|
Instructions: |
|
- Craft a concise, short informative answer to the user's request using the information provided below. |
|
- Synthesize the key points into a seamless response that appears as your own expert knowledge. |
|
- Avoid direct quotes or explicit references to the documents. |
|
- You are directly answering the user's query. |
|
|
|
Relevant Information: |
|
{% for document in documents %} |
|
- {{ document.content }} |
|
{% endfor %} |
|
|
|
""" |
|
|
|
def setup_generator(model_name, api_key_env_var="OPENAI_API_KEY", max_tokens=8192):
    """Build an ``OpenAIGenerator`` for *model_name*.

    The API key is supplied as ``Secret.from_env_var(api_key_env_var)``, and
    *max_tokens* is forwarded to the generator inside ``generation_kwargs``.
    """
    api_key = Secret.from_env_var(api_key_env_var)
    generation_options = {"max_tokens": max_tokens}
    return OpenAIGenerator(
        api_key=api_key,
        model=model_name,
        generation_kwargs=generation_options,
    )
|
|
|
|
|
# Two generators on the same model: `llm` is capped at 30 tokens, `llm2`
# keeps setup_generator's default token budget.
llm = setup_generator("gpt-4o-mini", max_tokens=30)
llm2 = setup_generator("gpt-4o-mini")

# Text embedder and the Chroma retriever built on the document store.
embedder = SentenceTransformersTextEmbedder(
    model=embedding_model,
    trust_remote_code=True,
    progress_bar=False,
)
retriever = ChromaEmbeddingRetriever(document_store)

router = ConditionalRouter(routes=routes)

prompt_builder = PromptBuilder(template=relevance_prompt_template)

# `documents` is declared required so that this builder can only run once
# the retriever has delivered its output. By default every template variable
# is optional; since `query` and `conv_history` are supplied directly by the
# caller, the builder (and the answering LLM behind it) could otherwise be
# scheduled on the refusal branch too, without any retrieved context.
# NOTE(review): the exact scheduling of all-optional components depends on
# the installed Haystack version - see the PromptBuilder documentation.
prompt_builder2 = PromptBuilder(
    template=query_prompt_template,
    required_variables=["documents"],
)
|
|
|
|
|
def _assemble_pipeline(components, connections):
    """Create a Pipeline, register and wire its parts, then warm it up.

    *components* is an iterable of ``(name, instance)`` pairs that are added
    in order; *connections* is an iterable of ``(sender, receiver)`` pairs
    that are connected, also in order, once every component is registered.
    """
    pipeline = Pipeline()
    for name, instance in components:
        pipeline.add_component(name, instance)
    for sender, receiver in connections:
        pipeline.connect(sender, receiver)
    pipeline.warm_up()
    return pipeline


# Flow: prompt_builder -> llm -> router; the router's "query" output then
# feeds embedder -> retriever -> prompt_builder2 -> llm2.
answer_query = _assemble_pipeline(
    components=(
        ("prompt_builder", prompt_builder),
        ("llm", llm),
        ("router", router),
        ("embedder", embedder),
        ("retriever", retriever),
        ("prompt_builder2", prompt_builder2),
        ("llm2", llm2),
    ),
    connections=(
        ("prompt_builder", "llm"),
        ("llm", "router"),
        ("router.query", "embedder"),
        ("embedder", "retriever"),
        ("retriever", "prompt_builder2"),
        ("prompt_builder2", "llm2"),
    ),
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Presidio engines used by hide_sensitive_info: the analyzer produces the
# findings, the anonymizer rewrites the text based on them.
analyzer, anonymizer = AnalyzerEngine(), AnonymizerEngine()
|
|
|
def hide_sensitive_info(text):
    """Return *text* after passing it through the Presidio anonymizer.

    The text is first analyzed with ``language="en"``; the analyzer's
    findings are then given to the anonymizer, whose resulting ``.text`` is
    returned.
    """
    findings = analyzer.analyze(text=text, language="en")
    anonymized = anonymizer.anonymize(text=text, analyzer_results=findings)
    return anonymized.text
|
|
|
def log_QA(question, answer):
    """Print one question/answer exchange after anonymizing it.

    The exchange is formatted as a "User: ..." line followed by an
    "Assistant: ..." line, run through ``hide_sensitive_info`` and printed
    with a blank line after it.
    """
    transcript = "\n".join((f"User: {question}", f"Assistant: {answer}"))
    print(hide_sensitive_info(transcript), end="\n\n")
|
|
|
|
|
|
|
|
|
|
|
def chat(message, history):
    """Produce the next assistant reply for the Gradio chat interface.

    *message* is the new user input; *history* holds the earlier turns, each
    indexed by "role" and "content". The pipeline result is read in this
    order: the first reply of ``llm2`` if that component produced output,
    otherwise the router's "no_answer" text, otherwise a generic error
    message. The exchange is logged through ``log_QA`` before returning.
    """
    # Transcript of the two most recent turns, regardless of role.
    recent_turns = history[-2:]
    conv_history = "\n\n".join(
        f'{turn["role"]}: {turn["content"]}' for turn in recent_turns
    )
    # All earlier user turns; the relevance prompt itself keeps the last two.
    user_history = [turn for turn in history if turn["role"] == "user"]

    pipeline_inputs = {
        "user_history": user_history,
        "query": message,
        "conv_history": conv_history,
        "top_k": 3,
    }
    results = answer_query.run(pipeline_inputs)

    if "llm2" in results:
        answer = results["llm2"]["replies"][0]
    else:
        router_output = results.get("router", {})
        if "no_answer" in router_output:
            answer = router_output["no_answer"]
        else:
            answer = "Sorry, a mistake occured"

    log_QA(message, answer)
    return answer
|
|
|
# Sample questions handed to the chat interface as its `examples`.
examples = [
    # Questions about the DSA and the request process.
    "What is Article 40.12 of the Digital Services Act, and how does it help researchers?",
    "How can I start the process of requesting platform data for research?",
    # Platform-specific questions.
    "How do I submit a data access request for Meta’s API under the DSA?",
    "What are the authentication and setup steps for Youtube’s API?",
    "What specific types of data can I access through Snapchat’s API?",
]
|
|
|
if __name__ == "__main__":
    # Build and launch the Gradio chat UI only when run as a script.
    interface = gr.ChatInterface(
        fn=chat,
        type="messages",
        examples=examples,
        title="40.12 Chatbot",
        description=(
            "Ask me anything about social media APIs, the Digital Services Act (DSA), "
            "or online platform regulations."
        ),
    )
    interface.launch()