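"""Streamlit app for chatting with a PDF using an extractive QA model."""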
import streamlit as st
from transformers import pipeline
import pdfplumber
# Set page config
st.set_page_config(
page_title="PDF AI Chat",
    page_icon="📚",
layout="wide"
)
# Custom CSS for better styling
st.markdown("""
<style>
.chat-container {
display: flex;
flex-direction: column;
gap: 20px;
padding: 20px;
height: calc(100vh - 200px);
overflow-y: auto;
}
.message-container {
display: flex;
flex-direction: column;
gap: 10px;
padding: 15px;
border-radius: 10px;
max-width: 90%;
}
.user-message {
background-color: #2b313e;
color: white;
align-self: flex-end;
}
.assistant-message {
background-color: #f0f2f6;
color: black;
align-self: flex-start;
}
.source-info {
font-size: 0.8em;
color: #666;
border-top: 1px solid #ddd;
margin-top: 10px;
padding-top: 10px;
}
.context-box {
background-color: #f8f9fa;
border-left: 3px solid #1f77b4;
padding: 10px;
margin-top: 10px;
font-size: 0.9em;
}
.chat-input {
position: fixed;
bottom: 0;
left: 0;
right: 0;
padding: 20px;
background: white;
border-top: 1px solid #ddd;
}
</style>
""", unsafe_allow_html=True)
@st.cache_resource
def load_qa_model():
return pipeline(
"question-answering",
model="deepset/roberta-base-squad2",
tokenizer="deepset/roberta-base-squad2"
)
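# Split each PDF page into paragraph-level chunks, recording page and
# paragraph numbers so answers can cite their source location.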
def process_pdf(pdf_file):
text_data = []
with pdfplumber.open(pdf_file) as pdf:
for page_num, page in enumerate(pdf.pages, 1):
text = page.extract_text()
if text:
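                # pdfplumber usually joins lines with a single '\n', so this
                # blank-line split may yield one chunk per page for some PDFs.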
paragraphs = text.split('\n\n')
for para_num, paragraph in enumerate(paragraphs, 1):
if paragraph.strip():
text_data.append({
'text': paragraph.strip(),
'page': page_num,
'paragraph': para_num,
'context': paragraph.strip()
})
return text_data
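# Run the QA model over every chunk, keeping the single highest-scoring
# answer plus any contexts that clear a relevance threshold.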
def find_best_answer(question, text_data, qa_model):
best_answer = None
max_score = 0
relevant_context = []
for chunk in text_data:
try:
result = qa_model(
question=question,
context=chunk['text'],
max_answer_len=100
)
if result['score'] > max_score:
max_score = result['score']
best_answer = {
'answer': result['answer'],
'confidence': result['score'],
'page': chunk['page'],
'paragraph': chunk['paragraph'],
'context': chunk['context']
}
            # Collect contexts that clear the relevance threshold, with scores
            if result['score'] > 0.1:
                relevant_context.append((result['score'], chunk['context']))
        except Exception:
            # Skip chunks the model cannot handle
            continue
    # Return the three highest-scoring contexts
    relevant_context.sort(key=lambda item: item[0], reverse=True)
    return best_answer, [context for _, context in relevant_context[:3]]
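# Streamlit entry point: wires together upload, PDF processing, and the chat UI.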
def main():
    st.title("📚 Advanced PDF Question Answering")
# Initialize session state
if 'messages' not in st.session_state:
st.session_state.messages = []
if 'pdf_data' not in st.session_state:
st.session_state.pdf_data = None
# Load QA model
try:
qa_model = load_qa_model()
except Exception as e:
st.error(f"Error loading model: {str(e)}")
return
# File upload
pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])
if pdf_file and not st.session_state.pdf_data:
with st.spinner("Processing PDF..."):
try:
st.session_state.pdf_data = process_pdf(pdf_file)
st.success("PDF processed successfully! You can now ask questions.")
except Exception as e:
st.error(f"Error processing PDF: {str(e)}")
return
# Chat interface
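    # Note: each st.markdown call renders in its own element, so this opening
    # <div> does not actually wrap the message markup below; the per-message
    # classes still pick up the CSS injected above.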
st.markdown('<div class="chat-container">', unsafe_allow_html=True)
# Display chat history
for message in st.session_state.messages:
if message["role"] == "user":
st.markdown(f"""
<div class="message-container user-message">
{message["content"]}
</div>
""", unsafe_allow_html=True)
else:
st.markdown(f"""
<div class="message-container assistant-message">
<div>{message["content"]}</div>
<div class="source-info">
Source: Page {message["metadata"]["page"]},
Paragraph {message["metadata"]["paragraph"]}
(Confidence: {message["metadata"]["confidence"]:.1%})
</div>
<div class="context-box">
{message["metadata"]["context"]}
</div>
</div>
""", unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)
# Question input
if st.session_state.pdf_data:
question = st.text_input("Ask a question about the document:", key="question_input")
        # Guard against re-answering the same question: the text input keeps
        # its value across the st.rerun() call below.
        if question and question != st.session_state.get("last_question"):
            st.session_state.last_question = question
            # Add user question to chat history
            st.session_state.messages.append({"role": "user", "content": question})
# Generate answer
with st.spinner("Finding answer..."):
                answer, _ = find_best_answer(
question,
st.session_state.pdf_data,
qa_model
)
if answer:
# Add assistant response to chat history
st.session_state.messages.append({
"role": "assistant",
"content": answer["answer"],
"metadata": {
"page": answer["page"],
"paragraph": answer["paragraph"],
"confidence": answer["confidence"],
"context": answer["context"]
}
})
# Force refresh
st.rerun()
else:
st.error("Sorry, I couldn't find a relevant answer in the document.")
else:
st.markdown("""
### Instructions:
1. Upload a PDF document using the file uploader above
2. Wait for the document to be processed
3. Start asking questions about the content
4. Get detailed answers with source information and context
### Features:
- Natural conversation interface
- Source tracking with page numbers
- Confidence scores
- Relevant context display
- Multiple question support
""")
if __name__ == "__main__":
main()