# Spaces: tatts / UNTE_ASSISTANT
# File size: 11,300 Bytes

import os
import re
import logging
import requests
import pandas as pd
from bs4 import BeautifulSoup
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import langid
from deep_translator import GoogleTranslator
import gradio as gr
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter
from langchain_community.tools.tavily_search import TavilySearchResults
from typing import List
from typing_extensions import TypedDict
from langgraph.graph import END, StateGraph
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage

# Setup logging
logging.basicConfig(level=logging.DEBUG)

# Secrets are read from the environment and must never be committed to the
# source file. Any key that was previously hard-coded here has to be treated
# as leaked and revoked with the provider.
# The OpenAI client used further down reads OPENAI_API_KEY from the
# environment itself; OPENAI_API_TOKEN is kept as a module-level name for
# backward compatibility.
OPENAI_API_TOKEN = os.getenv("OPENAI_API_KEY")

# Fail fast at start-up with a clear message instead of failing on the first
# model call.
if not OPENAI_API_TOKEN:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Retrieve the secret token from environment variables
hf_api_token = os.getenv('HF_API_TOKEN')

# Ensure the token is not None
if hf_api_token is None:
    raise ValueError("HF_API_TOKEN environment variable not set")

# Fixing random seed for reproducibility in langdetect
DetectorFactory.seed = 0

# Function to translate text based on detected language
def translate_content(text):
    """Translate French text to English and English text to French.

    The language is detected with ``langdetect``. Text detected as any other
    language is returned unchanged.

    Args:
        text: The text to translate.

    Returns:
        The translated text, or ``text`` itself when it is blank, not a
        string, in a language other than French/English, or when detection
        or translation fails.
    """
    # Blank or non-string values (e.g. NaN cells coming from pandas) cannot be
    # language-detected; hand them back untouched instead of relying on the
    # detector raising.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        detected_lang = detect(text)
        if detected_lang == 'fr':
            return GoogleTranslator(source='fr', target='en').translate(text)
        if detected_lang == 'en':
            return GoogleTranslator(source='en', target='fr').translate(text)
        return text
    except Exception:
        # Deliberate best-effort boundary: translation is optional, so any
        # detector or translator failure falls back to the original text.
        # The failure is logged with its traceback rather than printed.
        logging.exception("Error detecting language or translating")
        return text

# Function to chunk content
def chunk_content(content, chunk_size=1250, overlap=250):
    """Split ``content`` into fixed-size chunks that overlap each other.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    The final chunk may be shorter than ``chunk_size``.

    Args:
        content: The sequence (normally a string) to split.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks in order; empty when ``content`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would make the step zero or negative, so the
    # start position would never advance past the end of the content.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    return [content[start:start + chunk_size] for start in range(0, len(content), step)]

# Initialize the list to store chunked documents
chunked_web_doc = []

# Load the Excel file
df = pd.read_excel("UNTEanswers.xlsx")

# Merge the 'prompt' and 'reference' columns.
# Empty cells are read as NaN, and "text" + NaN yields NaN, which later breaks
# str.join() and len(); normalise both columns to plain strings first.
df['merged_content'] = (
    df['prompt'].fillna("").astype(str) + " " + df['reference'].fillna("").astype(str)
)


def _detect_language_or_unknown(text):
    """Return the langdetect code for ``text``, or "unknown" if detection fails."""
    try:
        return detect(text)
    except LangDetectException:
        # Raised by langdetect when the text cannot be classified; one
        # unclassifiable row must not abort start-up.
        return "unknown"


# Original and translated text of every row, in row order
text_entries = []

# Single pass over the sheet: each row is translated once (translation is a
# network call) and both the original and the translation are recorded and
# chunked.
for merged_content in df['merged_content']:
    if not merged_content.strip():
        continue  # nothing to index for an empty row

    variants = [merged_content]
    translated_content = translate_content(merged_content)
    if translated_content and translated_content != merged_content:
        variants.append(translated_content)

    for variant in variants:
        text_entries.append(variant)
        # Detect once per text rather than once per chunk.
        variant_language = _detect_language_or_unknown(variant)
        for chunk in chunk_content(variant):
            chunked_web_doc.append({
                "url": "UNTEanswers.xlsx",  # Mark as coming from the Excel file
                "language": variant_language,
                "chunk": chunk
            })

# Convert the list of text entries into a single string
excel_text = "\n".join(text_entries)

# Load the fetched content from the text file
with open('fetched_contentt.txt', 'r', encoding='utf-8') as f:
    fetched_content = f.read()

# Combine the text from the Excel file and the fetched content
content = fetched_content + "\n" + excel_text

# Optionally, save the combined content to a new file
with open('merged_content.txt', 'w', encoding='utf-8') as f:
    f.write(content)


def _parse_web_block(block):
    """Split one page block into ``(url, title, english_text, french_text)``.

    A block consists of ``URL:`` and ``Title:`` header lines followed by
    sections introduced by the marker lines ``English Content:`` and
    ``French Content:``. Lines that appear before any marker are ignored.
    """
    url = ""
    title = ""
    sections = {"en": [], "fr": []}
    language = None

    for line in block.strip().splitlines():
        if line.startswith("URL:"):
            # Slice the prefix off instead of split(): the value may itself
            # contain the marker text again.
            url = line[len("URL:"):].strip()
        elif line.startswith("Title:"):
            title = line[len("Title:"):].strip()
        elif line == "English Content:":
            language = "en"
        elif line == "French Content:":
            language = "fr"
        elif language is not None:
            sections[language].append(line)

    return url, title, "\n".join(sections["en"]).strip(), "\n".join(sections["fr"]).strip()


# Only the fetched pages use the separator-delimited block format. The Excel
# text is already chunked on its own, so it is kept out of this split;
# otherwise it could be appended to the last page's section and indexed a
# second time under that page's URL.
web_contents = fetched_content.split("-" * 80 + "\n\n")

for block in web_contents:
    if not block.strip():
        continue

    # The title is parsed but not stored with the chunks.
    url, _title, en_content, fr_content = _parse_web_block(block)

    # English chunks first, then French, for each page.
    for language, page_text in (("en", en_content), ("fr", fr_content)):
        if not page_text:
            continue
        for chunk in chunk_content(page_text):
            chunked_web_doc.append({
                "url": url,
                "language": language,
                "chunk": chunk
            })

import hashlib

model_id = 'sentence-transformers/all-MiniLM-L6-v2'
model_kwargs = {'device': 'cpu'}
embeddings = HuggingFaceEmbeddings(
    model_name=model_id,
    model_kwargs=model_kwargs
)


def _chunk_id(chunk):
    """Return a stable id derived from a chunk's url, language and text."""
    key = "\x1f".join((chunk['url'], chunk['language'], chunk['chunk']))
    return hashlib.sha256(key.encode("utf-8")).hexdigest()


# The collection is persisted in ./web_db. Without explicit ids every start-up
# would add the whole corpus again and retrieval would return repeated copies
# of the same chunk. Content-derived ids make rebuilding idempotent, and
# keying a dict by id also drops identical chunks within this run (ids passed
# in one batch must be unique).
unique_chunks = {_chunk_id(chunk): chunk for chunk in chunked_web_doc}
if not unique_chunks:
    raise ValueError("No content chunks were produced; nothing to index")

documents = [
    Document(page_content=chunk['chunk'], metadata={"url": chunk['url'], "language": chunk['language']})
    for chunk in unique_chunks.values()
]

chroma_db = Chroma.from_documents(documents=documents,
                                  ids=list(unique_chunks),
                                  collection_name='rag_web_db',
                                  embedding=embeddings,
                                  collection_metadata={"hnsw:space": "cosine"},
                                  persist_directory="./web_db")

# Return at most 3 chunks per query, and only those whose relevance score
# reaches 0.3.
similarity_threshold_retriever = chroma_db.as_retriever(search_type="similarity_score_threshold",
                                                        search_kwargs={"k": 3,
                                                                       "score_threshold": 0.3})


llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


################ history_aware_retriever###################
# Rewrites the latest user question into a standalone question using the chat
# history, then runs the retriever on the rewritten question.


from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, similarity_threshold_retriever, contextualize_q_prompt
)


################ question_answer_chain#####################
# Answers the question from the retrieved documents, which are inserted into
# the system prompt at {context}.


from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# The instructions and the retrieved context are separated by a blank line so
# the first document does not run straight on from the last instruction
# sentence.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)


################ rag_chain#####################
# Full pipeline: history-aware retrieval followed by answer generation.


rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

# Module-level default; the Gradio UI further down rebinds this name to a
# gr.State component.
chat_history = []

def _build_display_history(chat_history):
    """Convert ``(role, content)`` entries into Gradio ``(user, bot)`` pairs."""
    pairs = []
    for role, content in chat_history:
        if role == "user":
            pairs.append([content, None])
        elif pairs and pairs[-1][1] is None:
            # Attach the answer to the question it belongs to.
            pairs[-1][1] = content
        else:
            pairs.append([None, content])
    return [tuple(pair) for pair in pairs]


def ask(question, chat_history):
    """Answer ``question`` with the RAG chain and update the conversation.

    Args:
        question: The text typed by the user.
        chat_history: List of ``("user" | "assistant", content)`` tuples for
            the current conversation. It is extended in place with the new
            question and answer.

    Returns:
        A 3-tuple ``(display_chat_history, chat_history, "")``:
        the history as ``(user_message, bot_message)`` pairs for the Chatbot
        component (the latest answer is followed by its source URLs), the
        updated internal history, and an empty string that clears the textbox.
    """
    # A blank submission has nothing to answer; skip the model call and leave
    # the conversation untouched.
    if not question or not question.strip():
        return _build_display_history(chat_history), chat_history, ""

    # Prepend a phrase to the question to ensure relevance to Moodle
    prepended_phrase = "using platform Moodle :"
    modified_question = prepended_phrase + question

    # Invoke the chain to get the response
    ai_message = rag_chain.invoke({"input": modified_question, "chat_history": chat_history})
    answer = ai_message["answer"]

    # Store the question and answer in the chat history (without sources)
    chat_history.append(("user", question))
    chat_history.append(("assistant", answer))

    # Several retrieved chunks usually come from the same page: list each
    # source once, in retrieval order, and skip chunks without a URL.
    document_links = list(dict.fromkeys(
        doc.metadata['url']
        for doc in ai_message.get('context', [])
        if doc.metadata.get('url')
    ))

    # Chatbot tuples are (user_message, bot_message): the question goes in the
    # first slot and the answer in the second.
    display_chat_history = _build_display_history(chat_history)

    # Add sources to the last assistant message for display purposes only
    if document_links:
        document_links_text = "\n".join(document_links)
        user_message, bot_message = display_chat_history[-1]
        display_chat_history[-1] = (user_message, bot_message + f"\nSources: {document_links_text}")

    # Return display history for the UI, and the actual chat history for internal use
    return display_chat_history, chat_history, ""




def _reset_chat():
    """Return empty values for the chatbot, the stored history and the textbox."""
    return [], [], ""


# Build the Gradio interface: a chat window, a "Clear" button and a question box
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot = gr.Chatbot()
    clear_button = gr.Button("Clear")
    question = gr.Textbox(placeholder="Ask me anything about Moodle...")
    # Conversation history passed to and returned from ask()
    chat_history = gr.State([])

    # Pressing Enter in the textbox asks the question and clears the textbox
    question.submit(
        fn=ask,
        inputs=[question, chat_history],
        outputs=[chatbot, chat_history, question],
    )
    # "Clear" empties the chat window, the stored history and the textbox
    clear_button.click(
        fn=_reset_chat,
        inputs=None,
        outputs=[chatbot, chat_history, question],
        queue=False,
    )

demo.queue()
demo.launch(share=False)