import streamlit as st
from dotenv import load_dotenv
import os
import traceback

# PDF and NLP libraries
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util

# Embedding and vector store
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# LLM and conversational chain
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

# Custom HTML/CSS templates for the chat UI
from htmlTemplate import css, bot_template, user_template
# Load environment variables so the Groq API key is available to ChatGroq
load_dotenv()
if os.getenv('GROQ_API_KEY'):
    os.environ["GROQ_API_KEY"] = os.getenv('GROQ_API_KEY')
# LLM template for focused responses
llmtemplate = """You're an AI information specialist with a strong emphasis on extracting accurate information from markdown documents. Your expertise involves summarizing data succinctly while adhering to strict guidelines about neutrality and clarity.

Your task is to answer a specific question based on a provided markdown document. Here is the question you need to address:

{question}

Keep in mind the following instructions:
- Your response should be direct and factual, limited to 50 words and 2-3 sentences.
- Avoid using introductory phrases like "yes" or "no."
- Maintain an ethical and unbiased tone, steering clear of harmful or offensive content.
- If the document lacks relevant information, respond with "I cannot provide an answer based on the provided document."
- Do not fabricate information, include questions, or use confirmatory phrases.
- Remember not to prompt for additional information or ask any questions.

Ensure your response is strictly based on the content of the markdown document.
"""
def prepare_docs(pdf_docs):
    """Extract text from uploaded PDF documents"""
    docs = []
    metadata = []
    content = []

    for pdf in pdf_docs:
        pdf_reader = PyPDF2.PdfReader(pdf)
        for index, page in enumerate(pdf_reader.pages):
            doc_page = {
                'title': f"{pdf.name} page {index + 1}",
                'content': page.extract_text()
            }
            docs.append(doc_page)

    for doc in docs:
        content.append(doc["content"])
        metadata.append({"title": doc["title"]})
    return content, metadata
def get_text_chunks(content, metadata):
    """Split documents into manageable chunks"""
    # Token-based splitting; from_tiktoken_encoder requires the tiktoken package
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1024,
        chunk_overlap=256,
    )
    split_docs = text_splitter.create_documents(content, metadatas=metadata)
    print(f"Split documents into {len(split_docs)} passages")
    return split_docs
def ingest_into_vectordb(split_docs):
    """Create vector embeddings and store in FAISS"""
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'}
    )
    db = FAISS.from_documents(split_docs, embeddings)
    DB_FAISS_PATH = 'vectorstore/db_faiss'
    db.save_local(DB_FAISS_PATH)
    return db
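
# Sketch: the saved index can be reloaded later without re-embedding the PDFs.
# (allow_dangerous_deserialization is only accepted on newer LangChain
# releases; older versions take just the path and embeddings.)
#
#   db = FAISS.load_local('vectorstore/db_faiss', embeddings,
#                         allow_dangerous_deserialization=True)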
def get_conversation_chain(vectordb):
    """Create conversational retrieval chain"""
    llm = ChatGroq(model="llama3-70b-8192", temperature=0.25)
    retriever = vectordb.as_retriever()
    # output_key='answer' tells the memory which of the chain's outputs
    # (answer vs. source_documents) to store
    memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
        output_key='answer'
    )
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        return_source_documents=True
    )
    print("Conversational Chain created for the LLM using the vector store")
    return conversation_chain
def validate_answer_against_sources(response_answer, source_documents):
    """Validate the AI's response against the retrieved source documents"""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    similarity_threshold = 0.5
    source_texts = [doc.page_content for doc in source_documents]

    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
    source_embeddings = model.encode(source_texts, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)

    # The answer is considered grounded if it is similar enough to any source chunk
    return any(score.item() > similarity_threshold for score in cosine_scores[0])
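
# NOTE: validate_answer_against_sources is never called in this app. One
# possible hookup inside handle_userinput (sketch, not wired in), using the
# source documents returned because return_source_documents=True:
#
#   if not validate_answer_against_sources(response['answer'],
#                                          response['source_documents']):
#       st.warning("The answer may not be grounded in the uploaded documents.")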
def handle_userinput(user_question):
    """Process user input and display chat history"""
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    # Messages alternate user/bot, so even indices are the user's turns
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
def main():
    """Main Streamlit application"""
    load_dotenv()
    st.set_page_config(
        page_title="PDF Insights AI",
        page_icon=":books:",
        layout="wide"
    )
    st.write(css, unsafe_allow_html=True)

    # Welcome section
    st.title("📚 PDF Insights AI")
    st.markdown("""
    ### Unlock the Knowledge in Your PDFs
    - 🤖 AI-powered document analysis
    - 💬 Ask questions about your uploaded documents
    - 📄 Support for multiple PDF files
    """)
    # Initialize session state
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    # File upload section
    with st.sidebar:
        st.header("📤 Upload Documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here",
            type=['pdf'],
            accept_multiple_files=True,
            help="Upload PDF files to analyze. Max file size: 200MB"
        )

        # File validation: report oversized files, then filter them out
        # (removing items while iterating over the same list skips elements)
        if pdf_docs:
            for doc in pdf_docs:
                if doc.size > 200 * 1024 * 1024:  # 200 MB
                    st.error(f"File {doc.name} is too large. Maximum file size is 200MB.")
            pdf_docs = [doc for doc in pdf_docs if doc.size <= 200 * 1024 * 1024]
        if st.button("Process Documents", type="primary"):
            if not pdf_docs:
                st.warning("Please upload at least one PDF file.")
            else:
                with st.spinner("Processing your documents..."):
                    try:
                        # Process documents: extract text, chunk, embed,
                        # and build the conversational chain
                        content, metadata = prepare_docs(pdf_docs)
                        split_docs = get_text_chunks(content, metadata)
                        vectorstore = ingest_into_vectordb(split_docs)
                        st.session_state.conversation = get_conversation_chain(vectorstore)

                        st.success("Documents processed successfully! You can now ask questions.")
                    except Exception as e:
                        st.error(f"An error occurred while processing documents: {str(e)}")
                        # Log the full traceback to the server console for debugging
                        print(traceback.format_exc())
    # Question input section
    user_question = st.text_input(
        "💬 Ask a question about your documents",
        placeholder="What insights can you provide from these documents?"
    )

    if user_question:
        if st.session_state.conversation is None:
            st.warning("Please upload and process documents first.")
        else:
            handle_userinput(user_question)


if __name__ == '__main__':
    main()