import os
import logging

import pandas as pd
import gradio as gr
from langdetect import detect, DetectorFactory
from deep_translator import GoogleTranslator

from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
# Setup logging
logging.basicConfig(level=logging.DEBUG)

# Read API credentials from environment variables; never hardcode secrets in source
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is None:
    raise ValueError("OPENAI_API_KEY environment variable not set")

hf_api_token = os.getenv("HF_API_TOKEN")
if hf_api_token is None:
    raise ValueError("HF_API_TOKEN environment variable not set")

# Fix the random seed so langdetect gives reproducible results
DetectorFactory.seed = 0
# Translate text between French and English based on the detected language
def translate_content(text):
    try:
        detected_lang = detect(text)
        if detected_lang == 'fr':
            return GoogleTranslator(source='fr', target='en').translate(text)
        elif detected_lang == 'en':
            return GoogleTranslator(source='en', target='fr').translate(text)
        else:
            return text
    except Exception as e:
        logging.error(f"Error detecting language or translating: {e}")
        return text
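# Usage sketch (hedged; deep-translator needs network access, so these calls are not run here):
# translate_content("Bonjour tout le monde")  # detected 'fr' -> English translation
# translate_content("Hello everyone")         # detected 'en' -> French translation
# translate_content("Hola a todos")           # any other language is returned unchanged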
# Split content into fixed-size chunks with a sliding-window overlap
def chunk_content(content, chunk_size=1250, overlap=250):
    chunks = []
    start = 0
    while start < len(content):
        end = start + chunk_size
        chunks.append(content[start:end])
        start += chunk_size - overlap
    return chunks
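# Quick self-test of the sliding window (purely local, safe to run at import time):
# with chunk_size=1250 and overlap=250 the window advances 1000 characters per step,
# so consecutive chunks share their trailing/leading 250 characters.
_probe_text = "".join(str(i % 10) for i in range(3000))
_probe_chunks = chunk_content(_probe_text)
assert all(len(c) <= 1250 for c in _probe_chunks)
assert _probe_chunks[0][-250:] == _probe_chunks[1][:250]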
# List that will hold every chunked document (Excel rows and web pages alike)
chunked_web_doc = []

# Load the Excel file and merge the 'prompt' and 'reference' columns
df = pd.read_excel("UNTEanswers.xlsx")
df['merged_content'] = df['prompt'] + " " + df['reference']

# Collect every entry (and its translation, when one is produced) as plain text
text_entries = []
for index, row in df.iterrows():
    merged_content = row['merged_content']
    text_entries.append(merged_content)
    translated_content = translate_content(merged_content)
    if translated_content and translated_content != merged_content:
        text_entries.append(translated_content)

# Join all text entries into a single string
excel_text = "\n".join(text_entries)
# Chunk the Excel content (original and translated) for indexing
for index, row in df.iterrows():
    merged_content = row['merged_content']
    source_lang = detect(merged_content)
    for chunk in chunk_content(merged_content):
        chunked_web_doc.append({
            "url": "UNTEanswers.xlsx",  # mark as coming from the Excel file
            "language": source_lang,
            "chunk": chunk
        })
    # Chunk the translated version as well, when a translation was produced
    translated_content = translate_content(merged_content)
    if translated_content and translated_content != merged_content:
        translated_lang = detect(translated_content)
        for chunk in chunk_content(translated_content):
            chunked_web_doc.append({
                "url": "UNTEanswers.xlsx",
                "language": translated_lang,
                "chunk": chunk
            })
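# Both the original Excel rows and their translations are indexed, so a question
# phrased in either French or English can match the same underlying entry.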
# Load the previously fetched web content from the text file
with open('fetched_contentt.txt', 'r', encoding='utf-8') as f:
    fetched_content = f.read()

# Combine the fetched web content with the Excel text
content = fetched_content + "\n" + excel_text

# Optionally, save the combined content to a new file
with open('merged_content.txt', 'w', encoding='utf-8') as f:
    f.write(content)
# Blocks in the fetched file are separated by a line of 80 dashes
web_contents = content.split("-" * 80 + "\n\n")

for block in web_contents:
    if not block.strip():
        continue
    lines = block.strip().splitlines()
    url = ""
    title = ""
    en_content = ""
    fr_content = ""
    language = None
    for line in lines:
        if line.startswith("URL:"):
            url = line.split("URL:")[1].strip()
        elif line.startswith("Title:"):
            title = line.split("Title:")[1].strip()
        elif line == "English Content:":
            language = "en"
        elif line == "French Content:":
            language = "fr"
        elif language == "en":
            en_content += line + "\n"
        elif language == "fr":
            fr_content += line + "\n"
    if en_content.strip():
        for chunk in chunk_content(en_content.strip()):
            chunked_web_doc.append({
                "url": url,
                "language": "en",
                "chunk": chunk
            })
    if fr_content.strip():
        for chunk in chunk_content(fr_content.strip()):
            chunked_web_doc.append({
                "url": url,
                "language": "fr",
                "chunk": chunk
            })
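# At this point every entry in chunked_web_doc has the shape
#   {"url": <source file or page>, "language": "en" | "fr", "chunk": <text>}
# which maps directly onto the Document objects built below.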
# Sentence-transformers embedding model, pinned to CPU
model_id = 'sentence-transformers/all-MiniLM-L6-v2'
model_kwargs = {'device': 'cpu'}
embeddings = HuggingFaceEmbeddings(
    model_name=model_id,
    model_kwargs=model_kwargs
)
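# Note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings and is small
# enough to run on CPU, so no GPU is required for indexing or retrieval.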
# Wrap each chunk in a langchain Document, carrying source URL and language as metadata
documents = [
    Document(page_content=chunk['chunk'], metadata={"url": chunk['url'], "language": chunk['language']})
    for chunk in chunked_web_doc
]

chroma_db = Chroma.from_documents(
    documents=documents,
    collection_name='rag_web_db',
    embedding=embeddings,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="./web_db"
)

similarity_threshold_retriever = chroma_db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.3}
)
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
################ history_aware_retriever ###################
# Prompt that turns a history-dependent question into a standalone one
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

history_aware_retriever = create_history_aware_retriever(
    llm, similarity_threshold_retriever, contextualize_q_prompt
)
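# Behavior note: when chat_history is empty the user input goes straight to the
# retriever; otherwise the LLM first rewrites it into a standalone query using
# contextualize_q_prompt, and that rewritten query is what gets embedded and searched.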
################ question_answer_chain #####################
# Prompt for answering from the retrieved context, kept deliberately concise
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.

{context}"""

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
################ rag_chain #####################
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)
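# Illustrative end-to-end call (commented out; the sample question is an assumption):
# result = rag_chain.invoke({"input": "using platform Moodle : how do I enrol students?",
#                            "chat_history": []})
# result["answer"]   # the generated answer
# result["context"]  # the retrieved Documents that were stuffed into the qa prompt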
def ask(question, chat_history):
    # Prepend a phrase to keep the question anchored to Moodle
    prepended_phrase = "using platform Moodle :"
    modified_question = prepended_phrase + question

    # Invoke the chain to get the response
    ai_message = rag_chain.invoke({"input": modified_question, "chat_history": chat_history})
    chat_history.append(("user", question))
    answer = ai_message["answer"]

    # Collect source links from the retrieved documents, if any
    document_links = []
    for doc in ai_message.get('context', []):
        if 'url' in doc.metadata:
            document_links.append(doc.metadata['url'])

    # Append the answer to the chat history (without sources)
    chat_history.append(("assistant", answer))

    # Build the display history for the Chatbot component
    display_chat_history = []
    for role, content in chat_history:
        if role == "user":
            display_chat_history.append((None, content))  # user turn
        else:
            display_chat_history.append((content, None))  # assistant turn

    # Append sources to the last assistant message, for display purposes only
    if document_links:
        document_links_text = "\n".join(document_links)
        display_chat_history[-1] = (display_chat_history[-1][0] + f"\nSources: {document_links_text}", None)

    # Return the display history for the UI, the raw history for state, and clear the textbox
    return display_chat_history, chat_history, ""
# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot = gr.Chatbot()
    clear_button = gr.Button("Clear")
    question = gr.Textbox(placeholder="Ask me anything about Moodle...")
    chat_history = gr.State([])

    question.submit(ask, [question, chat_history], [chatbot, chat_history, question])
    clear_button.click(lambda: ([], [], ""), None, [chatbot, chat_history, question], queue=False)
demo.queue()
demo.launch(share=False)