Spaces:

tatts
/

UNTE_ASSISTANT

App Files Files Community

UNTE_ASSISTANT / app.py

tatts

Update app.py

3c4f785 verified about 2 months ago

raw

history blame

10.3 kB

	import os
	import re
	import logging
	import requests
	import pandas as pd
	from bs4 import BeautifulSoup
	from langdetect import detect, DetectorFactory
	from langdetect.lang_detect_exception import LangDetectException
	import langid
	from deep_translator import GoogleTranslator
	import gradio as gr
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import Chroma
	from langchain.docstore.document import Document
	from langchain_community.vectorstores.utils import filter_complex_metadata
	from langchain_core.prompts import ChatPromptTemplate
	from langchain_core.pydantic_v1 import BaseModel, Field
	from langchain_openai import ChatOpenAI
	from langchain_core.runnables import RunnablePassthrough, RunnableLambda
	from langchain_core.output_parsers import StrOutputParser
	from operator import itemgetter
	from langchain_community.tools.tavily_search import TavilySearchResults
	from typing import List
	from typing_extensions import TypedDict
	from langgraph.graph import END, StateGraph
	from langchain_openai import OpenAIEmbeddings
	from langchain_community.document_loaders import UnstructuredURLLoader
	from langchain_community.vectorstores import FAISS
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain.memory import ConversationBufferMemory
	from langchain.chains import create_retrieval_chain
	from langchain.chains.combine_documents import create_stuff_documents_chain
	from langchain.chains import create_history_aware_retriever
	from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
	from langchain_core.messages import HumanMessage

	# Setup logging
	logging.basicConfig(level=logging.DEBUG)

	# SECURITY: API keys must never be committed to source control. A literal
	# OpenAI key used to be hard-coded on this line and was published together
	# with the file, so it must be treated as compromised: revoke it and supply a
	# fresh key through the OPENAI_API_KEY environment variable instead.
	# langchain_openai picks the key up from that same variable, so nothing has
	# to be written back into os.environ.
	OPENAI_API_TOKEN = os.getenv("OPENAI_API_KEY")

	# Fail fast at startup rather than on the first chat request
	if OPENAI_API_TOKEN is None:
	    raise ValueError("OPENAI_API_KEY environment variable not set")

	# Retrieve the secret token from environment variables
	hf_api_token = os.getenv('HF_API_TOKEN')

	# Ensure the token is not None
	if hf_api_token is None:
	    raise ValueError("HF_API_TOKEN environment variable not set")

	# Fixing random seed for reproducibility in langdetect
	DetectorFactory.seed = 0

	# Translate French text to English and English text to French
	def translate_content(text):
	    """Return ``text`` translated into the other supported language.

	    The language is detected with ``detect``. French input is translated to
	    English and English input to French; text in any other language is
	    returned unchanged. This is best-effort: if detection or translation
	    raises, the error is printed and the original ``text`` is returned.
	    """
	    # Detected language code -> language code to translate into
	    opposite_language = {'fr': 'en', 'en': 'fr'}
	    try:
	        source_lang = detect(text)
	        target_lang = opposite_language.get(source_lang)
	        if target_lang is None:
	            return text
	        translator = GoogleTranslator(source=source_lang, target=target_lang)
	        return translator.translate(text)
	    except Exception as e:
	        print(f"Error detecting language or translating: {e}")
	        return text

	# Function to chunk content
	def chunk_content(content, chunk_size=1250, overlap=250):
	    """Split ``content`` into overlapping fixed-size windows.

	    Windows start every ``chunk_size - overlap`` positions and are at most
	    ``chunk_size`` long, so the final windows may be shorter.

	    Args:
	        content: The sequence (normally a string) to slice.
	        chunk_size: Maximum length of each chunk.
	        overlap: Number of positions shared by consecutive chunks.

	    Returns:
	        A list of slices of ``content``; empty when ``content`` is empty.

	    Raises:
	        ValueError: If ``overlap >= chunk_size``. The window would then never
	            move forward, which previously made this function loop forever.
	    """
	    step = chunk_size - overlap
	    if step <= 0:
	        raise ValueError("overlap must be smaller than chunk_size")
	    return [content[start:start + chunk_size] for start in range(0, len(content), step)]

	# Initialize the list to store chunked documents
	chunked_web_doc = []

	# Load the Excel file
	df = pd.read_excel("UNTEanswers.xlsx")

	# Merge the 'prompt' and 'reference' columns. Empty cells are read as NaN and
	# NaN + str stays NaN, which would later break both detect() and
	# "\n".join(), so blanks are replaced by empty strings before concatenating.
	df['merged_content'] = (
	    df['prompt'].fillna("").astype(str) + " " + df['reference'].fillna("").astype(str)
	)


	def _detect_language(text):
	    """Return the language code detected for ``text``, or "unknown" on failure."""
	    try:
	        return detect(text)
	    except LangDetectException:
	        return "unknown"


	# Original and translated text of every row, in row order
	text_entries = []

	for index, row in df.iterrows():
	    merged_content = row['merged_content']
	    if not merged_content.strip():
	        # Nothing to index for a row whose cells are all blank
	        continue

	    # Translate once per row and reuse the result for both the flat text
	    # list and the chunk list (translation is a remote call).
	    translated_content = translate_content(merged_content)
	    variants = [merged_content]
	    if translated_content and translated_content != merged_content:
	        variants.append(translated_content)

	    for variant in variants:
	        text_entries.append(variant)
	        # Detect once per text, not once per chunk
	        variant_language = _detect_language(variant)
	        for chunk in chunk_content(variant):
	            chunked_web_doc.append({
	                "url": "UNTEanswers.xlsx",  # Mark as coming from the Excel file
	                "language": variant_language,
	                "chunk": chunk,
	            })

	# Convert the list of text entries into a single string
	excel_text = "\n".join(text_entries)

	# Read the previously fetched web content from disk
	with open('fetched_contentt.txt', 'r', encoding='utf-8') as f:
	    fetched_content = f.read()

	# Fetched web text first, then the Excel text, joined by a single newline
	content = "\n".join((fetched_content, excel_text))

	# Keep a copy of the combined text on disk
	with open('merged_content.txt', 'w', encoding='utf-8') as f:
	    f.write(content)


	def _parse_web_block(block):
	    """Parse one fetched-page block into ``(url, {"en": text, "fr": text})``.

	    A ``URL:`` line sets the URL, ``Title:`` lines are skipped, and the exact
	    lines ``English Content:`` / ``French Content:`` switch the language that
	    the following lines are collected under. Lines seen before any language
	    marker are ignored. Each collected text is stripped and is ``""`` when
	    the block has no content for that language.
	    """
	    url = ""
	    sections = {"en": [], "fr": []}
	    language = None
	    for line in block.strip().splitlines():
	        if line.startswith("URL:"):
	            # maxsplit=1 keeps URLs that themselves contain "URL:" intact
	            url = line.split("URL:", 1)[1].strip()
	        elif line.startswith("Title:"):
	            continue
	        elif line == "English Content:":
	            language = "en"
	        elif line == "French Content:":
	            language = "fr"
	        elif language is not None:
	            sections[language].append(line)
	    return url, {lang: "\n".join(lines).strip() for lang, lines in sections.items()}


	# Only the fetched web text uses the dashed-separator block format. The Excel
	# rows are already chunked separately; splitting the combined text instead
	# could fold them into the last web block under that page's URL.
	web_contents = fetched_content.split("-" * 80 + "\n\n")

	for block in web_contents:
	    if not block.strip():
	        continue
	    url, sections = _parse_web_block(block)
	    for language, section_text in sections.items():
	        if not section_text:
	            continue
	        for chunk in chunk_content(section_text):
	            chunked_web_doc.append({
	                "url": url,
	                "language": language,
	                "chunk": chunk,
	            })

	# Embedding model used to index the chunks, run on CPU
	model_id = 'sentence-transformers/all-MiniLM-L6-v2'
	model_kwargs = {'device': 'cpu'}
	embeddings = HuggingFaceEmbeddings(model_name=model_id, model_kwargs=model_kwargs)

	# Wrap every chunk in a Document, keeping its source URL and language as metadata
	documents = []
	for chunk in chunked_web_doc:
	    metadata = {"url": chunk['url'], "language": chunk['language']}
	    documents.append(Document(page_content=chunk['chunk'], metadata=metadata))

	# Vector store over all documents, persisted under ./web_db.
	# NOTE(review): because the collection is persisted, re-running this against
	# an existing ./web_db may add the same documents again - confirm.
	chroma_db = Chroma.from_documents(
	    documents=documents,
	    embedding=embeddings,
	    collection_name='rag_web_db',
	    collection_metadata={"hnsw:space": "cosine"},
	    persist_directory="./web_db",
	)

	# Retriever returning at most 3 documents that pass the 0.3 score threshold
	retriever_search_kwargs = {"k": 3, "score_threshold": 0.3}
	similarity_threshold_retriever = chroma_db.as_retriever(
	    search_type="similarity_score_threshold",
	    search_kwargs=retriever_search_kwargs,
	)

	# Chat model shared by the question-rewriting and answering chains
	llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)


	################ history_aware_retriever###################


	from langchain.chains import create_history_aware_retriever
	from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

	# System prompt asking the model to rewrite the latest question so that it
	# stands on its own. It is assembled from adjacent string literals instead of
	# a backslash-continued triple-quoted string: with line continuations, any
	# whitespace that starts the next source line becomes part of the prompt.
	contextualize_q_system_prompt = (
	    "Given a chat history and the latest user question "
	    "which might reference context in the chat history, formulate a standalone question "
	    "which can be understood without the chat history. Do NOT answer the question, "
	    "just reformulate it if needed and otherwise return it as is."
	)
	contextualize_q_prompt = ChatPromptTemplate.from_messages(
	    [
	        ("system", contextualize_q_system_prompt),
	        MessagesPlaceholder("chat_history"),
	        ("human", "{input}"),
	    ]
	)

	# Retriever that receives the chat history along with the new input
	history_aware_retriever = create_history_aware_retriever(
	    llm, similarity_threshold_retriever, contextualize_q_prompt
	)


	################ question_answer_chain#####################


	from langchain.chains import create_retrieval_chain
	from langchain.chains.combine_documents import create_stuff_documents_chain

	# System prompt for answering from the retrieved context. Adjacent string
	# literals keep source indentation out of the prompt, and the blank line
	# before {context} stops the retrieved text from being fused onto the end of
	# the last instruction sentence.
	qa_system_prompt = (
	    "You are an assistant for question-answering tasks. "
	    "Use the following pieces of retrieved context to answer the question. "
	    "If you don't know the answer, just say that you don't know. "
	    "Use three sentences maximum and keep the answer concise."
	    "\n\n"
	    "{context}"
	)
	qa_prompt = ChatPromptTemplate.from_messages(
	    [
	        ("system", qa_system_prompt),
	        MessagesPlaceholder("chat_history"),
	        ("human", "{input}"),
	    ]
	)
	question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)


	################ rag_chain#####################


	# Full pipeline: history-aware retrieval feeding the answering chain
	rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

	# Seed entries placed at the start of every conversation. This list is only
	# read, never mutated, so it is not a cross-session conversation log.
	chat_history = ['Moodle','course','un cours']
	import gradio as gr
	from langchain_core.messages import AIMessage


	def _history_to_messages(history):
	    """Convert the history passed in by the chat UI into LangChain messages.

	    Two shapes are handled: ``(user, assistant)`` pairs, and dicts with
	    ``role`` / ``content`` keys. Entries of any other shape, or whose content
	    is not a string, are skipped.
	    NOTE(review): these are the history formats Gradio's ChatInterface is
	    expected to pass - confirm against the installed Gradio version.
	    """
	    messages = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            content = turn.get("content")
	            if not isinstance(content, str):
	                continue
	            if turn.get("role") == "user":
	                messages.append(HumanMessage(content=content))
	            else:
	                messages.append(AIMessage(content=content))
	        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
	            user_text, assistant_text = turn
	            if isinstance(user_text, str):
	                messages.append(HumanMessage(content=user_text))
	            if isinstance(assistant_text, str):
	                messages.append(AIMessage(content=assistant_text))
	    return messages


	def ask(question, history):
	    """Answer ``question`` with the RAG chain and list the source URLs.

	    Args:
	        question: The latest user message.
	        history: Earlier turns of this conversation, as supplied by the UI.

	    Returns:
	        The answer text followed by a "Sources:" list of the distinct URLs
	        of the retrieved documents, or by "UNTE_ASSISTANTE" when no document
	        URL is available.
	    """
	    # Build the history per call from the session's own turns. The previous
	    # version appended to the module-level list, which mixed all users'
	    # conversations together and grew without bound.
	    conversation = [*chat_history, *_history_to_messages(history)]
	    ai_message = rag_chain.invoke({"input": question, "chat_history": conversation})

	    # Several retrieved chunks often come from the same page: keep each URL
	    # once, in retrieval order, and drop empty URLs.
	    document_links = list(dict.fromkeys(
	        doc.metadata['url']
	        for doc in ai_message.get('context') or []
	        if doc.metadata.get('url')
	    ))

	    # Format document links as part of the text output
	    if document_links:
	        document_links_text = "\n".join(document_links)
	        links_text = f"\n\nSources:\n{document_links_text}"
	    else:
	        links_text = "\n\nUNTE_ASSISTANTE"

	    # The chat UI displays whatever this function returns; without a return
	    # value the reply would be empty.
	    return ai_message['answer'] + links_text

	# Chat UI that routes every user message through ask()
	soft_theme = gr.themes.Soft()
	demo = gr.ChatInterface(fn=ask, title="UNTE ChatBot", theme=soft_theme)



	if __name__ == "__main__":
	    # Close any Gradio apps that are still running, then serve without a public share link
	    gr.close_all()
	    demo.launch(share=False)