import os
import logging
import pandas as pd
from langdetect import detect, DetectorFactory
from deep_translator import GoogleTranslator
import gradio as gr
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Setup logging
logging.basicConfig(level=logging.DEBUG)

# Read the OpenAI API key from the environment (never hard-code secrets in source)
if os.getenv("OPENAI_API_KEY") is None:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Retrieve the secret token from environment variables
hf_api_token = os.getenv('HF_API_TOKEN')

# Ensure the token is not None
if hf_api_token is None:
    raise ValueError("HF_API_TOKEN environment variable not set")

# Fix the random seed for reproducibility in langdetect
DetectorFactory.seed = 0


# Function to translate text between English and French based on the detected language
def translate_content(text):
    try:
        detected_lang = detect(text)
        if detected_lang == 'fr':
            return GoogleTranslator(source='fr', target='en').translate(text)
        elif detected_lang == 'en':
            return GoogleTranslator(source='en', target='fr').translate(text)
        else:
            return text
    except Exception as e:
        print(f"Error detecting language or translating: {e}")
        return text


# Function to split content into fixed-size chunks that overlap their neighbors
def chunk_content(content, chunk_size=1250, overlap=250):
    chunks = []
    start = 0
    while start < len(content):
        end = start + chunk_size
        chunk = content[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks


# Initialize the list to store chunked documents
chunked_web_doc = []

# Load the Excel file
df = pd.read_excel("UNTEanswers.xlsx")

# Merge the 'prompt' and 'reference' columns
df['merged_content'] = df['prompt'] + " " + df['reference']

# Translate and store all text entries in a list
text_entries = []
for index, row in df.iterrows():
    # Original content
    merged_content = row['merged_content']
    text_entries.append(merged_content)

    # Translated content
    translated_content = translate_content(merged_content)
    if translated_content and translated_content != merged_content:
        text_entries.append(translated_content)

# Convert the list of text entries into a single string
excel_text = "\n".join(text_entries)
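# --- Illustrative sketch (added example, not part of the original pipeline) ---
# chunk_content (defined above) advances its window by chunk_size - overlap,
# so consecutive chunks share exactly `overlap` characters. A quick check
# with toy sizes:
_sample = "abcdefghijklmnopqrst"
_sample_chunks = chunk_content(_sample, chunk_size=10, overlap=3)
assert _sample_chunks[0] == "abcdefghij"
assert _sample_chunks[1][:3] == _sample_chunks[0][-3:]  # 3-character overlap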
= "\n".join(text_entries) # Process content from the Excel file for index, row in df.iterrows(): merged_content = row['merged_content'] # Chunk the original content en_chunks = chunk_content(merged_content) for chunk in en_chunks: chunked_web_doc.append({ "url": "UNTEanswers.xlsx", # Mark as coming from the Excel file "language": detect(merged_content), "chunk": chunk }) # Translate and chunk the content if necessary translated_content = translate_content(merged_content) if translated_content and translated_content != merged_content: translated_chunks = chunk_content(translated_content) for chunk in translated_chunks: chunked_web_doc.append({ "url": "UNTEanswers.xlsx", # Mark as coming from the Excel file "language": detect(translated_content), "chunk": chunk }) # Load the fetched content from the text file with open('fetched_contentt.txt', 'r', encoding='utf-8') as f: fetched_content = f.read() # Combine the text from the Excel file and the fetched content content = fetched_content + "\n" + excel_text # Optionally, save the combined content to a new file with open('merged_content.txt', 'w', encoding='utf-8') as f: f.write(content) web_contents = content.split("-" * 80 + "\n\n") for block in web_contents: if block.strip(): lines = block.strip().splitlines() url = "" title = "" en_content = "" fr_content = "" language = None for i, line in enumerate(lines): if line.startswith("URL:"): url = line.split("URL:")[1].strip() elif line.startswith("Title:"): title = line.split("Title:")[1].strip() elif line == "English Content:": language = "en" elif line == "French Content:": language = "fr" else: if language == "en": en_content += line + "\n" elif language == "fr": fr_content += line + "\n" if en_content.strip(): en_chunks = chunk_content(en_content.strip()) for chunk in en_chunks: chunked_web_doc.append({ "url": url, "language": "en", "chunk": chunk }) if fr_content.strip(): fr_chunks = chunk_content(fr_content.strip()) for chunk in fr_chunks: chunked_web_doc.append({ "url": url, "language": "fr", "chunk": chunk }) model_id = 'sentence-transformers/all-MiniLM-L6-v2' model_kwargs = {'device': 'cpu'} embeddings = HuggingFaceEmbeddings( model_name=model_id, model_kwargs=model_kwargs ) documents = [ Document(page_content=chunk['chunk'], metadata={"url": chunk['url'], "language": chunk['language']}) for chunk in chunked_web_doc ] chroma_db = Chroma.from_documents(documents=documents, collection_name='rag_web_db', embedding=embeddings, collection_metadata={"hnsw:space": "cosine"}, persist_directory="./web_db") similarity_threshold_retriever = chroma_db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 3, "score_threshold": 0.3}) llm = ChatOpenAI(model="gpt-4o-mini", temperature=0) ################ history_aware_retriever################### from langchain.chains import create_history_aware_retriever from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder contextualize_q_system_prompt = """Given a chat history and the latest user question \ which might reference context in the chat history, formulate a standalone question \ which can be understood without the chat history. 
################ history_aware_retriever ###################
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. \
Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

history_aware_retriever = create_history_aware_retriever(
    llm, similarity_threshold_retriever, contextualize_q_prompt
)

################ question_answer_chain #####################
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.

{context}"""

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

################ rag_chain #####################
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


def ask(question, chat_history):
    # Prepend a phrase to the question to keep retrieval focused on Moodle
    prepended_phrase = "using platform Moodle :"
    modified_question = prepended_phrase + question

    # Invoke the chain to get the response
    ai_message = rag_chain.invoke({"input": modified_question, "chat_history": chat_history})
    chat_history.append(("user", question))
    answer = ai_message["answer"]

    # Prepare document links if available
    document_links = []
    for doc in ai_message.get('context', []):
        if 'url' in doc.metadata:
            document_links.append(doc.metadata['url'])

    # Append the answer to the chat history (without sources)
    chat_history.append(("assistant", answer))

    # For display purposes, format the chat history as Gradio (user, bot) pairs;
    # the user slot renders on the right and the bot slot on the left
    display_chat_history = []
    for role, content in chat_history:
        if role == "user":
            display_chat_history.append((content, None))  # User question on the right
        else:
            display_chat_history.append((None, content))  # Assistant answer on the left

    # Add sources to the last assistant message for display purposes only
    if document_links:
        document_links_text = "\n".join(document_links)
        display_chat_history[-1] = (None, display_chat_history[-1][1] + f"\nSources: {document_links_text}")

    # Return display history for the UI, and the actual chat history for internal use
    return display_chat_history, chat_history, ""


# Initialize the Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot = gr.Chatbot()
    clear_button = gr.Button("Clear")
    question = gr.Textbox(placeholder="Ask me anything about Moodle...")
    chat_history = gr.State([])

    question.submit(ask, [question, chat_history], [chatbot, chat_history, question])
    clear_button.click(lambda: ([], [], ""), None, [chatbot, chat_history, question], queue=False)

demo.queue()
demo.launch(share=False)
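# --- Illustrative sketch (added example, not part of the original pipeline) ---
# demo.launch() blocks, so to exercise ask() without the UI, comment out the
# two lines above and call it directly; the sample question is hypothetical:
# _display, _history, _ = ask("How do I create a quiz?", [])
# print(_display[-1][1])  # last assistant answer, with sources appended if any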