|
import os |
|
import re |
|
import logging |
|
import requests |
|
import pandas as pd |
|
from bs4 import BeautifulSoup |
|
from langdetect import detect, DetectorFactory |
|
from langdetect.lang_detect_exception import LangDetectException |
|
import langid |
|
from deep_translator import GoogleTranslator |
|
import gradio as gr |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.vectorstores import Chroma |
|
from langchain.docstore.document import Document |
|
from langchain_community.vectorstores.utils import filter_complex_metadata |
|
from langchain_core.prompts import ChatPromptTemplate |
|
from langchain_core.pydantic_v1 import BaseModel, Field |
|
from langchain_openai import ChatOpenAI |
|
from langchain_core.runnables import RunnablePassthrough, RunnableLambda |
|
from langchain_core.output_parsers import StrOutputParser |
|
from operator import itemgetter |
|
from langchain_community.tools.tavily_search import TavilySearchResults |
|
from typing import List |
|
from typing_extensions import TypedDict |
|
from langgraph.graph import END, StateGraph |
|
from langchain_openai import OpenAIEmbeddings |
|
from langchain_community.document_loaders import UnstructuredURLLoader |
|
from langchain_community.vectorstores import FAISS |
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
from langchain.memory import ConversationBufferMemory |
|
from langchain.chains import create_retrieval_chain |
|
from langchain.chains.combine_documents import create_stuff_documents_chain |
|
from langchain.chains import create_history_aware_retriever |
|
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder |
|
from langchain_core.messages import HumanMessage |
|
|
|
|
|
logging.basicConfig(level=logging.DEBUG)

# SECURITY: the OpenAI key used to be hard-coded on this line; a secret
# committed to source must be considered leaked and revoked. Read it from
# the environment instead, mirroring the HF_API_TOKEN handling below.
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is None:
    raise ValueError("OPENAI_API_KEY environment variable not set")
os.environ["OPENAI_API_KEY"] = openai_api_key

# Hugging Face token is required for the sentence-transformers embeddings.
hf_api_token = os.getenv('HF_API_TOKEN')

if hf_api_token is None:
    raise ValueError("HF_API_TOKEN environment variable not set")

# Make langdetect deterministic across runs (it is probabilistic by default).
DetectorFactory.seed = 0
|
|
|
|
|
def translate_content(text):
    """Translate *text* between French and English.

    French input is translated to English, English input to French; any
    other detected language is returned unchanged.

    Args:
        text: Text whose language is auto-detected with langdetect.

    Returns:
        The translated text, or the original text when the detected
        language is neither 'fr' nor 'en', or when detection/translation
        fails (best-effort: errors are logged, never raised).
    """
    try:
        detected_lang = detect(text)
        if detected_lang == 'fr':
            return GoogleTranslator(source='fr', target='en').translate(text)
        elif detected_lang == 'en':
            return GoogleTranslator(source='en', target='fr').translate(text)
        else:
            return text
    except Exception as e:
        # Best-effort fallback: langdetect raises on empty/ambiguous input
        # and the translator needs network access. Log through the logging
        # module (configured above) instead of print().
        logging.warning("Error detecting language or translating: %s", e)
        return text
|
|
|
|
|
def chunk_content(content, chunk_size=1250, overlap=250):
    """Split *content* into overlapping character chunks.

    Args:
        content: The string to split.
        chunk_size: Maximum length of each chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of substrings covering *content*; consecutive chunks start
        ``chunk_size - overlap`` characters apart. Empty input yields [].

    Raises:
        ValueError: If ``overlap >= chunk_size`` — the scan step would be
            <= 0, which made the original while-loop spin forever.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")
    return [content[start:start + chunk_size]
            for start in range(0, len(content), step)]
|
|
|
|
|
# Accumulates {"url", "language", "chunk"} dicts from every source (the
# Excel Q&A sheet here, scraped web pages further below) for embedding.
chunked_web_doc = []

# The Excel sheet holds Q&A pairs; merge prompt + reference into one
# searchable passage per row.
df = pd.read_excel("UNTEanswers.xlsx")

df['merged_content'] = df['prompt'] + " " + df['reference']

# Plain-text entries (original + translation) for merged_content.txt.
text_entries = []

# Single pass over the rows. The previous version looped twice and called
# translate_content() once per loop, doubling the translation network
# traffic; it also re-ran detect() for every chunk instead of once per row.
for index, row in df.iterrows():
    merged_content = row['merged_content']
    text_entries.append(merged_content)

    # Chunk the original-language content; detect the language once per row.
    source_lang = detect(merged_content)
    for chunk in chunk_content(merged_content):
        chunked_web_doc.append({
            "url": "UNTEanswers.xlsx",
            "language": source_lang,
            "chunk": chunk
        })

    # Also index the FR<->EN translation so queries in either language can
    # match; translate_content returns its input unchanged on failure, in
    # which case we skip the duplicate.
    translated_content = translate_content(merged_content)
    if translated_content and translated_content != merged_content:
        text_entries.append(translated_content)
        translated_lang = detect(translated_content)
        for chunk in chunk_content(translated_content):
            chunked_web_doc.append({
                "url": "UNTEanswers.xlsx",
                "language": translated_lang,
                "chunk": chunk
            })

excel_text = "\n".join(text_entries)
|
|
|
|
|
# Combine the previously scraped web dump with the Excel-derived text,
# persist the merged corpus for inspection, then split it back into
# per-page blocks (pages are separated by an 80-dash rule plus blank line).
with open('fetched_contentt.txt', 'r', encoding='utf-8') as source_file:
    fetched_content = source_file.read()

content = fetched_content + "\n" + excel_text

with open('merged_content.txt', 'w', encoding='utf-8') as merged_file:
    merged_file.write(content)

page_separator = "-" * 80 + "\n\n"
web_contents = content.split(page_separator)
|
|
|
# Parse each page block of the merged corpus. A block looks like:
#   URL: <url>
#   Title: <title>
#   English Content:
#   <lines...>
#   French Content:
#   <lines...>
# Lines inside each language section are chunked and added to
# chunked_web_doc with their source URL and language.
for block in web_contents:
    if not block.strip():
        continue

    url = ""
    title = ""           # parsed but currently unused downstream
    language = None      # which content section we are currently inside
    sections = {"en": [], "fr": []}   # accumulated content lines per language

    for line in block.strip().splitlines():
        if line.startswith("URL:"):
            # removeprefix keeps the full remainder; the previous
            # split("URL:")[1] truncated the value if "URL:" recurred in it.
            url = line.removeprefix("URL:").strip()
        elif line.startswith("Title:"):
            title = line.removeprefix("Title:").strip()
        elif line == "English Content:":
            language = "en"
        elif line == "French Content:":
            language = "fr"
        elif language is not None:
            # Collect lines and join once below — avoids quadratic
            # string concatenation on large pages.
            sections[language].append(line)

    # Emit English chunks first, then French (same order as before).
    for lang, section_lines in sections.items():
        text = "\n".join(section_lines).strip()
        if text:
            for chunk in chunk_content(text):
                chunked_web_doc.append({
                    "url": url,
                    "language": lang,
                    "chunk": chunk
                })
|
|
|
# Embed chunks locally with a small sentence-transformers model on CPU.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)

# Wrap every chunk in a LangChain Document, carrying its source URL and
# language as metadata so answers can cite their sources later.
documents = []
for entry in chunked_web_doc:
    documents.append(
        Document(
            page_content=entry['chunk'],
            metadata={"url": entry['url'], "language": entry['language']},
        )
    )

# Persist the vectors in a local Chroma collection using cosine distance.
chroma_db = Chroma.from_documents(
    documents=documents,
    collection_name='rag_web_db',
    embedding=embeddings,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="./web_db",
)

# Retrieve at most 3 chunks per query, dropping anything scoring below 0.3.
similarity_threshold_retriever = chroma_db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.3},
)

# Deterministic chat model (temperature 0) used for both question
# reformulation and answering.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
|
|
|
|
|
|
|
|
|
|
|
# NOTE: create_history_aware_retriever, ChatPromptTemplate and
# MessagesPlaceholder are already imported at the top of the file; the
# redundant mid-file re-imports that used to sit here were removed.

# Prompt that rewrites the latest user turn into a self-contained question
# so retrieval does not depend on earlier chat turns.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# Retriever that first reformulates the question with the LLM, then queries
# the vector store with the standalone version.
history_aware_retriever = create_history_aware_retriever(
    llm, similarity_threshold_retriever, contextualize_q_prompt
)
|
|
|
|
|
|
|
|
|
|
|
# NOTE: create_retrieval_chain and create_stuff_documents_chain are already
# imported at the top of the file; the redundant mid-file re-imports that
# used to sit here were removed.

# Answering prompt: answer strictly from the retrieved {context}, admit
# ignorance otherwise, and stay within three sentences.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\
{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# "Stuff" strategy: all retrieved documents are inserted into {context}.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
|
|
|
|
|
|
|
|
|
|
|
# Full RAG pipeline: question reformulation -> retrieval -> stuffed QA.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

# NOTE(review): this module-level list is shadowed by the gr.State created
# in the Gradio UI below, so it appears unused at runtime — confirm before
# removing.
chat_history = []
|
|
|
def ask(question, chat_history):
    """Handle one chat turn for the Gradio UI.

    Args:
        question: The user's raw question text.
        chat_history: List of (role, content) tuples accumulated so far
            (roles "user" / "assistant"); mutated in place.

    Returns:
        A (display_history, chat_history, textbox_value) triple: the
        chatbot-formatted message pairs, the updated state, and "" so the
        input box is cleared.
    """
    # Bias retrieval/answering toward the Moodle platform without showing
    # the prefix to the user or storing it in the history.
    prepended_phrase = "using platform Moodle :"
    modified_question = prepended_phrase + question

    ai_message = rag_chain.invoke({"input": modified_question, "chat_history": chat_history})
    chat_history.append(("user", question))

    answer = ai_message["answer"]

    # Collect source URLs from the retrieved documents, de-duplicated while
    # preserving retrieval order.
    document_links = []
    for doc in ai_message.get('context', []):
        url = doc.metadata.get('url')
        if url and url not in document_links:
            document_links.append(url)

    chat_history.append(("assistant", answer))

    # Gradio's Chatbot expects (user_message, bot_message) pairs; the
    # previous version had the slots swapped, so user turns rendered as bot
    # turns and vice versa.
    display_chat_history = []
    for role, content in chat_history:
        if role == "user":
            display_chat_history.append((content, None))
        else:
            display_chat_history.append((None, content))

    # Append the source list to the assistant's latest reply (bot slot).
    if document_links:
        document_links_text = "\n".join(document_links)
        display_chat_history[-1] = (None, display_chat_history[-1][1] + f"\nSources: {document_links_text}")

    return display_chat_history, chat_history, ""
|
|
|
|
|
|
|
|
|
|
|
# Minimal Gradio front-end: a chatbot pane, a clear button, and a question
# box that submits on Enter. History lives in a gr.State so each browser
# session gets its own conversation.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot = gr.Chatbot()
    clear_button = gr.Button("Clear")

    question = gr.Textbox(placeholder="Ask me anything about Moodle...")
    chat_history = gr.State([])

    def _reset_conversation():
        # Empty chatbot display, empty history state, cleared textbox.
        return [], [], ""

    question.submit(ask, [question, chat_history], [chatbot, chat_history, question])
    clear_button.click(_reset_conversation, None, [chatbot, chat_history, question], queue=False)

demo.queue()
demo.launch(share=False)
|
|