|
import os |
|
import re |
|
import logging |
|
import requests |
|
import pandas as pd |
|
from bs4 import BeautifulSoup |
|
from langdetect import detect, DetectorFactory |
|
from langdetect.lang_detect_exception import LangDetectException |
|
import langid |
|
from deep_translator import GoogleTranslator |
|
import gradio as gr |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.vectorstores import Chroma |
|
from langchain.docstore.document import Document |
|
from langchain_community.vectorstores.utils import filter_complex_metadata |
|
from langchain_core.prompts import ChatPromptTemplate |
|
from langchain_core.pydantic_v1 import BaseModel, Field |
|
from langchain_openai import ChatOpenAI |
|
from langchain_core.runnables import RunnablePassthrough, RunnableLambda |
|
from langchain_core.output_parsers import StrOutputParser |
|
from operator import itemgetter |
|
from langchain_community.tools.tavily_search import TavilySearchResults |
|
from typing import List |
|
from typing_extensions import TypedDict |
|
from langgraph.graph import END, StateGraph |
|
from langchain_openai import OpenAIEmbeddings |
|
from langchain_community.document_loaders import UnstructuredURLLoader |
|
from langchain_community.vectorstores import FAISS |
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
from langchain.memory import ConversationBufferMemory |
|
from langchain.chains import create_retrieval_chain |
|
from langchain.chains.combine_documents import create_stuff_documents_chain |
|
from langchain.chains import create_history_aware_retriever |
|
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder |
|
from langchain_core.messages import HumanMessage |
|
|
|
|
|
logging.basicConfig(level=logging.DEBUG)

# SECURITY: the OpenAI key used to be hard-coded on this line; a secret
# committed to source must be considered leaked and revoked. Read it from
# the environment instead, mirroring the HF_API_TOKEN handling below.
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is None:
    raise ValueError("OPENAI_API_KEY environment variable not set")
os.environ["OPENAI_API_KEY"] = openai_api_key

# Hugging Face token is required for the sentence-transformers embeddings.
hf_api_token = os.getenv('HF_API_TOKEN')

if hf_api_token is None:
    raise ValueError("HF_API_TOKEN environment variable not set")

# Make langdetect deterministic across runs (it is probabilistic by default).
DetectorFactory.seed = 0
|
|
|
|
|
def translate_content(text):
    """Translate *text* between French and English.

    French input is translated to English, English input to French; any
    other detected language is returned unchanged.

    Args:
        text: Text whose language is auto-detected with langdetect.

    Returns:
        The translated text, or the original text when the detected
        language is neither 'fr' nor 'en', or when detection/translation
        fails (best-effort: errors are logged, never raised).
    """
    try:
        detected_lang = detect(text)
        if detected_lang == 'fr':
            return GoogleTranslator(source='fr', target='en').translate(text)
        elif detected_lang == 'en':
            return GoogleTranslator(source='en', target='fr').translate(text)
        else:
            return text
    except Exception as e:
        # Best-effort fallback: langdetect raises on empty/ambiguous input
        # and the translator needs network access. Log through the logging
        # module (configured above) instead of print().
        logging.warning("Error detecting language or translating: %s", e)
        return text
|
|
|
|
|
def chunk_content(content, chunk_size=1250, overlap=250):
    """Split *content* into overlapping character chunks.

    Args:
        content: The string to split.
        chunk_size: Maximum length of each chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of substrings covering *content*; consecutive chunks start
        ``chunk_size - overlap`` characters apart. Empty input yields [].

    Raises:
        ValueError: If ``overlap >= chunk_size`` — the scan step would be
            <= 0, which made the original while-loop spin forever.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")
    return [content[start:start + chunk_size]
            for start in range(0, len(content), step)]
|
|
|
|
|
# Accumulates {"url", "language", "chunk"} dicts from every source (the
# Excel Q&A sheet here, scraped web pages further below) for embedding.
chunked_web_doc = []

# The Excel sheet holds Q&A pairs; merge prompt + reference into one
# searchable passage per row.
df = pd.read_excel("UNTEanswers.xlsx")

df['merged_content'] = df['prompt'] + " " + df['reference']

# Plain-text entries (original + translation) for merged_content.txt.
text_entries = []

# Single pass over the rows. The previous version looped twice and called
# translate_content() once per loop, doubling the translation network
# traffic; it also re-ran detect() for every chunk instead of once per row.
for index, row in df.iterrows():
    merged_content = row['merged_content']
    text_entries.append(merged_content)

    # Chunk the original-language content; detect the language once per row.
    source_lang = detect(merged_content)
    for chunk in chunk_content(merged_content):
        chunked_web_doc.append({
            "url": "UNTEanswers.xlsx",
            "language": source_lang,
            "chunk": chunk
        })

    # Also index the FR<->EN translation so queries in either language can
    # match; translate_content returns its input unchanged on failure, in
    # which case we skip the duplicate.
    translated_content = translate_content(merged_content)
    if translated_content and translated_content != merged_content:
        text_entries.append(translated_content)
        translated_lang = detect(translated_content)
        for chunk in chunk_content(translated_content):
            chunked_web_doc.append({
                "url": "UNTEanswers.xlsx",
                "language": translated_lang,
                "chunk": chunk
            })

excel_text = "\n".join(text_entries)
|
|
|
|
|
# Combine the previously scraped web dump with the Excel-derived text,
# persist the merged corpus for inspection, then split it back into
# per-page blocks (pages are separated by an 80-dash rule plus blank line).
with open('fetched_contentt.txt', 'r', encoding='utf-8') as source_file:
    fetched_content = source_file.read()

content = fetched_content + "\n" + excel_text

with open('merged_content.txt', 'w', encoding='utf-8') as merged_file:
    merged_file.write(content)

page_separator = "-" * 80 + "\n\n"
web_contents = content.split(page_separator)
|
|
|
# Parse each page block of the merged corpus. A block looks like:
#   URL: <url>
#   Title: <title>
#   English Content:
#   <lines...>
#   French Content:
#   <lines...>
# Lines inside each language section are chunked and added to
# chunked_web_doc with their source URL and language.
for block in web_contents:
    if not block.strip():
        continue

    url = ""
    title = ""           # parsed but currently unused downstream
    language = None      # which content section we are currently inside
    sections = {"en": [], "fr": []}   # accumulated content lines per language

    for line in block.strip().splitlines():
        if line.startswith("URL:"):
            # removeprefix keeps the full remainder; the previous
            # split("URL:")[1] truncated the value if "URL:" recurred in it.
            url = line.removeprefix("URL:").strip()
        elif line.startswith("Title:"):
            title = line.removeprefix("Title:").strip()
        elif line == "English Content:":
            language = "en"
        elif line == "French Content:":
            language = "fr"
        elif language is not None:
            # Collect lines and join once below — avoids quadratic
            # string concatenation on large pages.
            sections[language].append(line)

    # Emit English chunks first, then French (same order as before).
    for lang, section_lines in sections.items():
        text = "\n".join(section_lines).strip()
        if text:
            for chunk in chunk_content(text):
                chunked_web_doc.append({
                    "url": url,
                    "language": lang,
                    "chunk": chunk
                })
|
|
|
# Embed chunks locally with a small sentence-transformers model on CPU.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)

# Wrap every chunk in a LangChain Document, carrying its source URL and
# language as metadata so answers can cite their sources later.
documents = []
for entry in chunked_web_doc:
    documents.append(
        Document(
            page_content=entry['chunk'],
            metadata={"url": entry['url'], "language": entry['language']},
        )
    )

# Persist the vectors in a local Chroma collection using cosine distance.
chroma_db = Chroma.from_documents(
    documents=documents,
    collection_name='rag_web_db',
    embedding=embeddings,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="./web_db",
)

# Retrieve at most 3 chunks per query, dropping anything scoring below 0.3.
similarity_threshold_retriever = chroma_db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.3},
)

# Deterministic chat model (temperature 0) used for both question
# reformulation and answering.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
|
|
|
|
|
|
|
|
|
|
|
# NOTE: create_history_aware_retriever, ChatPromptTemplate and
# MessagesPlaceholder are already imported at the top of the file; the
# redundant mid-file re-imports that used to sit here were removed.

# Prompt that rewrites the latest user turn into a self-contained question
# so retrieval does not depend on earlier chat turns.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# Retriever that first reformulates the question with the LLM, then queries
# the vector store with the standalone version.
history_aware_retriever = create_history_aware_retriever(
    llm, similarity_threshold_retriever, contextualize_q_prompt
)
|
|
|
|
|
|
|
|
|
|
|
# NOTE: create_retrieval_chain and create_stuff_documents_chain are already
# imported at the top of the file; the redundant mid-file re-imports that
# used to sit here were removed.

# Answering prompt: answer strictly from the retrieved {context}, admit
# ignorance otherwise, and stay within three sentences.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\
{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# "Stuff" strategy: all retrieved documents are inserted into {context}.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
|
|
|
|
|
|
|
|
|
|
|
# Full RAG pipeline: question reformulation -> retrieval -> stuffed QA.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

# NOTE(review): this module-level list is shadowed by the gr.State created
# in the Gradio UI below, so it appears unused at runtime — confirm before
# removing.
chat_history = []
|
|
|
def ask(question, chat_history):
    """Handle one chat turn for the Gradio UI.

    Args:
        question: The user's raw question text.
        chat_history: List of (role, content) tuples accumulated so far
            (roles "user" / "assistant"); mutated in place.

    Returns:
        A (display_history, chat_history, textbox_value) triple: the
        chatbot-formatted message pairs, the updated state, and "" so the
        input box is cleared.
    """
    # Bias retrieval/answering toward the Moodle platform without showing
    # the prefix to the user or storing it in the history.
    prepended_phrase = "using platform Moodle :"
    modified_question = prepended_phrase + question

    ai_message = rag_chain.invoke({"input": modified_question, "chat_history": chat_history})
    chat_history.append(("user", question))

    answer = ai_message["answer"]

    # Collect source URLs from the retrieved documents, de-duplicated while
    # preserving retrieval order.
    document_links = []
    for doc in ai_message.get('context', []):
        url = doc.metadata.get('url')
        if url and url not in document_links:
            document_links.append(url)

    chat_history.append(("assistant", answer))

    # Gradio's Chatbot expects (user_message, bot_message) pairs; the
    # previous version had the slots swapped, so user turns rendered as bot
    # turns and vice versa.
    display_chat_history = []
    for role, content in chat_history:
        if role == "user":
            display_chat_history.append((content, None))
        else:
            display_chat_history.append((None, content))

    # Append the source list to the assistant's latest reply (bot slot).
    if document_links:
        document_links_text = "\n".join(document_links)
        display_chat_history[-1] = (None, display_chat_history[-1][1] + f"\nSources: {document_links_text}")

    return display_chat_history, chat_history, ""
|
|
|
|
|
|
|
|
|
|
|
# Minimal Gradio front-end: a chatbot pane, a clear button, and a question
# box that submits on Enter. History lives in a gr.State so each browser
# session gets its own conversation.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    chatbot = gr.Chatbot()
    clear_button = gr.Button("Clear")

    question = gr.Textbox(placeholder="Ask me anything about Moodle...")
    chat_history = gr.State([])

    def _reset_conversation():
        # Empty chatbot display, empty history state, cleared textbox.
        return [], [], ""

    question.submit(ask, [question, chat_history], [chatbot, chat_history, question])
    clear_button.click(_reset_conversation, None, [chatbot, chat_history, question], queue=False)

demo.queue()
demo.launch(share=False)
|
|