import streamlit as st
from dotenv import load_dotenv
import os
import traceback
# PDF and NLP Libraries
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
# Embedding and Vector Store
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
# LLM and Conversational Chain
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
# Custom Templates
from htmlTemplate import css, bot_template, user_template
# Load environment variables
# os.getenv returns None when the variable is unset, and assigning None into
# os.environ raises TypeError at import time, so only re-export the key when
# it is actually present.
# NOTE(review): load_dotenv is imported but no call to it is visible here —
# confirm whether a .env file is expected to be loaded before this point.
_groq_api_key = os.getenv("GROQ_API_KEY")
if _groq_api_key is not None:
    os.environ["GROQ_API_KEY"] = _groq_api_key
# LLM Template for focused responses
# The template exposes a single ``{question}`` placeholder, placed right after
# the sentence that introduces the question, and is closed explicitly so the
# string literal does not run into the code that follows.
llmtemplate = """You're an AI information specialist with a strong emphasis on extracting accurate information from markdown documents. Your expertise involves summarizing data succinctly while adhering to strict guidelines about neutrality and clarity.
Your task is to answer a specific question based on a provided markdown document. Here is the question you need to address:
{question}
Keep in mind the following instructions:
- Your response should be direct and factual, limited to 50 words and 2-3 sentences.
- Avoid using introductory phrases like "yes" or "no."
- Maintain an ethical and unbiased tone, steering clear of harmful or offensive content.
- If the document lacks relevant information, respond with "I cannot provide an answer based on the provided document."
- Do not fabricate information, include questions, or use confirmatory phrases.
- Remember not to prompt for additional information or ask any questions.
Ensure your response is strictly based on the content of the markdown document.
"""
def prepare_docs(pdf_docs):
    """Extract page text and per-page titles from uploaded PDF documents.

    Args:
        pdf_docs: Iterable of PDF sources that ``PyPDF2.PdfReader`` accepts.
            ``None`` or an empty iterable yields two empty lists.

    Returns:
        A ``(content, metadata)`` tuple of equal-length lists. ``content``
        holds the extracted text of each page; ``metadata`` holds the
        matching ``{"title": "<name> page <n>"}`` dict for that page.
    """
    content = []
    metadata = []
    for pdf in pdf_docs or []:
        pdf_reader = PyPDF2.PdfReader(pdf)
        # Sources without a ``name`` attribute still get a usable page title.
        source_name = getattr(pdf, "name", "document")
        # Iterate the page objects directly (1-based for the title) instead
        # of re-indexing ``pdf_reader.pages`` on every iteration.
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Keep the entry even when no text comes back so that content and
            # metadata stay aligned page-for-page.
            content.append(page.extract_text() or "")
            metadata.append({"title": f"{source_name} page {page_number}"})
    return content, metadata
def get_text_chunks(content, metadata):
"""Split documents into manageable chunks"""
# NOTE(review): the argument list and closing parenthesis of the call below
# are not visible here — confirm the splitter settings against the full file.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
# `content` is passed as the texts and `metadata` as the matching metadatas.
split_docs = text_splitter.create_documents(content, metadatas=metadata)
# Report how many passages the splitter produced.
print(f"Split documents into {len(split_docs)} passages")
return split_docs
def ingest_into_vectordb(split_docs):
"""Create vector embeddings and store in FAISS"""
# NOTE(review): the arguments and closing parenthesis of this constructor
# call are not visible here — confirm the embedding settings in the full file.
embeddings = HuggingFaceEmbeddings(
# Build the FAISS store from the split documents using those embeddings.
db = FAISS.from_documents(split_docs, embeddings)
# NOTE(review): DB_FAISS_PATH is assigned but no use of it is visible in this
# function — TODO confirm whether the store is meant to be saved to this path.
DB_FAISS_PATH = 'vectorstore/db_faiss'
return db
def get_conversation_chain(vectordb):
"""Create conversational retrieval chain"""
# Groq-hosted chat model used to answer questions.
llm = ChatGroq(model="llama3-70b-8192", temperature=0.25)
# Retriever view over the vector store passed in by the caller.
retriever = vectordb.as_retriever()
# NOTE(review): the arguments and closing parentheses of the two calls below
# are not visible here — confirm the memory settings and how llm, retriever
# and memory are wired into the chain against the full file.
memory = ConversationBufferMemory(
conversation_chain = ConversationalRetrievalChain.from_llm(
print("Conversational Chain created for the LLM using the vector store")
return conversation_chain
def validate_answer_against_sources(response_answer, source_documents,
                                    similarity_threshold=0.5):
    """Check whether an answer is semantically close to any source document.

    Args:
        response_answer: Answer text produced by the model.
        source_documents: Iterable of objects exposing ``page_content``.
        similarity_threshold: Cosine similarity a source must exceed for the
            answer to count as supported. Defaults to ``0.5``.

    Returns:
        ``True`` if at least one source's cosine similarity with the answer is
        strictly greater than ``similarity_threshold``; ``False`` otherwise,
        including when the answer or the source list is empty.
    """
    source_texts = [doc.page_content for doc in source_documents or []]
    # Nothing to compare: return before loading the model or encoding an
    # empty batch.
    if not response_answer or not source_texts:
        return False

    # NOTE(review): the model is constructed on every call; consider caching
    # it if this function is invoked per question.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    answer_embedding = model.encode(response_answer, convert_to_tensor=True)
    source_embeddings = model.encode(source_texts, convert_to_tensor=True)
    # Row 0 holds the answer's similarity against each source text.
    cosine_scores = util.pytorch_cos_sim(answer_embedding, source_embeddings)
    return any(score.item() > similarity_threshold for score in cosine_scores[0])
def handle_userinput(user_question):
    """Send a question to the conversation chain and render the chat history.

    The chain stored in ``st.session_state.conversation`` is called with the
    question, its returned ``chat_history`` is stored in session state, and
    every message is rendered through an HTML template.

    Args:
        user_question: Text entered by the user.
    """
    # Local import keeps the file's top-level import block untouched.
    import html

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        # Even positions use the user template, odd positions the bot one
        # (assumes the history alternates user/bot — TODO confirm).
        template = user_template if i % 2 == 0 else bot_template
        # The text is injected into raw HTML, so escape it first: both the
        # user's question and the model output are untrusted input.
        safe_text = html.escape(message.content)
        st.write(template.replace("{{MSG}}", safe_text), unsafe_allow_html=True)
def main():
"""Main Streamlit application"""
# NOTE(review): the call that receives this keyword argument (presumably
# st.set_page_config) is not visible here — confirm against the full file.
page_title="PDF Insights AI",
# Inject the shared CSS imported from htmlTemplate.
st.write(css, unsafe_allow_html=True)
# Welcome section
st.title("π PDF Insights AI")
# NOTE(review): the next four lines read like the body of a markdown string
# whose surrounding call is not visible here — TODO confirm.
### Unlock the Knowledge in Your PDFs
- π€ AI-powered document analysis
- π¬ Ask questions about your uploaded documents
- π Support for multiple PDF files
# Initialize session state
if "conversation" not in st.session_state:
st.session_state.conversation = None
if "chat_history" not in st.session_state:
st.session_state.chat_history = []
# File upload section
with st.sidebar:
st.header("π€ Upload Documents")
# NOTE(review): the remaining arguments and the closing parenthesis of this
# call are not visible here — confirm against the full file.
pdf_docs = st.file_uploader(
"Upload your PDFs here",
help="Upload PDF files to analyze. Max file size: 200MB"
# File validation
if pdf_docs:
for doc in pdf_docs:
if doc.size > 200 * 1024 * 1024: # 200 MB
# NOTE(review): the f-string placeholder below is empty — presumably it
# should name the offending file; verify.
st.error(f"File {} is too large. Maximum file size is 200MB.")
if st.button("Process Documents", type="primary"):
if not pdf_docs:
st.warning("Please upload at least one PDF file.")
# NOTE(review): no else/return separating the warning above from the
# processing below, and no `try:` matching the later `except`, is visible
# here — confirm against the full file.
with st.spinner("Processing your documents..."):
# Process documents
content, metadata = prepare_docs(pdf_docs)
split_docs = get_text_chunks(content, metadata)
vectorstore = ingest_into_vectordb(split_docs)
# Keep the chain in session state so later questions can reuse it.
st.session_state.conversation = get_conversation_chain(vectorstore)
st.success("Documents processed successfully! You can now ask questions.")
except Exception as e:
st.error(f"An error occurred while processing documents: {str(e)}")
# Question input section
# NOTE(review): the closing parenthesis of this call is not visible here.
user_question = st.text_input(
"π Ask a question about your documents",
placeholder="What insights can you provide from these documents?"
if user_question:
if st.session_state.conversation is None:
st.warning("Please upload and process documents first.")
# NOTE(review): no call to handle_userinput is visible in this function —
# confirm where a submitted question is actually processed.
# Run the app only when this file is executed as a script.
if __name__ == '__main__':
    main()