|
import streamlit as st |
|
from transformers import pipeline |
|
import pdfplumber |
|
import torch |
|
from PyPDF2 import PdfReader |
|
import re |
|
|
|
|
|
# Page chrome: wide layout gives the chat transcript the full viewport width.
# NOTE(review): page_icon "π" looks like a mojibake'd emoji (bad encoding on a
# previous save) — confirm the intended icon.
st.set_page_config(
    layout="wide",
    page_icon="π",
    page_title="PDF AI Chat",
)
|
|
|
|
|
# Inline stylesheet for the chat UI: user/assistant bubbles, the per-answer
# source-attribution footer, the quoted-context box, and a fixed input bar.
_CHAT_CSS = """
<style>
.chat-container {
    display: flex;
    flex-direction: column;
    gap: 20px;
    padding: 20px;
    height: calc(100vh - 200px);
    overflow-y: auto;
}
.message-container {
    display: flex;
    flex-direction: column;
    gap: 10px;
    padding: 15px;
    border-radius: 10px;
    max-width: 90%;
}
.user-message {
    background-color: #2b313e;
    color: white;
    align-self: flex-end;
}
.assistant-message {
    background-color: #f0f2f6;
    color: black;
    align-self: flex-start;
}
.source-info {
    font-size: 0.8em;
    color: #666;
    border-top: 1px solid #ddd;
    margin-top: 10px;
    padding-top: 10px;
}
.context-box {
    background-color: #f8f9fa;
    border-left: 3px solid #1f77b4;
    padding: 10px;
    margin-top: 10px;
    font-size: 0.9em;
}
.chat-input {
    position: fixed;
    bottom: 0;
    left: 0;
    right: 0;
    padding: 20px;
    background: white;
    border-top: 1px solid #ddd;
}
</style>
"""

st.markdown(_CHAT_CSS, unsafe_allow_html=True)
|
|
|
@st.cache_resource
def load_qa_model():
    """Build the extractive question-answering pipeline.

    Cached via ``st.cache_resource`` so the model is downloaded and loaded
    only once per server process, not on every Streamlit rerun.
    """
    checkpoint = "deepset/roberta-base-squad2"
    qa_pipeline = pipeline(
        "question-answering",
        model=checkpoint,
        tokenizer=checkpoint,
    )
    return qa_pipeline
|
|
|
def _page_paragraphs(text, page_num):
    """Split one page's extracted text into non-empty paragraph records.

    Paragraph numbers count every '\n\n'-delimited segment (including
    blank ones) so numbering matches the raw page layout.
    """
    records = []
    for para_num, paragraph in enumerate(text.split('\n\n'), 1):
        cleaned = paragraph.strip()  # strip once; reused for test + both fields
        if cleaned:
            records.append({
                'text': cleaned,
                'page': page_num,
                'paragraph': para_num,
                # 'context' mirrors 'text' so callers can show the source
                # passage verbatim alongside an extracted answer.
                'context': cleaned,
            })
    return records


def process_pdf(pdf_file):
    """Extract text from a PDF into paragraph-level chunks.

    Args:
        pdf_file: path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        list of dicts with keys 'text', 'page', 'paragraph', 'context';
        pages with no extractable text contribute nothing.
    """
    text_data = []
    with pdfplumber.open(pdf_file) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            text = page.extract_text()
            if text:  # extract_text() returns None for image-only pages
                text_data.extend(_page_paragraphs(text, page_num))
    return text_data
|
|
|
def find_best_answer(question, text_data, qa_model, *,
                     relevance_threshold=0.1, max_contexts=3):
    """Run extractive QA over every chunk and keep the top-scoring answer.

    Args:
        question: the user's question string.
        text_data: chunk dicts with 'text', 'page', 'paragraph', 'context'
            (as produced by process_pdf).
        qa_model: callable accepting question=, context=, max_answer_len=
            keywords and returning a dict with 'answer' and 'score'.
        relevance_threshold: keyword-only; chunks scoring above this are
            collected as supporting context (default 0.1, as before).
        max_contexts: keyword-only; cap on returned contexts (default 3).

    Returns:
        (best_answer, contexts) where best_answer is a dict with 'answer',
        'confidence', 'page', 'paragraph', 'context' — or None if no chunk
        produced a score above 0 — and contexts is a list of up to
        ``max_contexts`` source passages.
    """
    best_answer = None
    max_score = 0
    relevant_context = []

    for chunk in text_data:
        # Keep the try narrow: only the model call is best-effort; a bug in
        # the bookkeeping below should surface, not be silently swallowed.
        try:
            result = qa_model(
                question=question,
                context=chunk['text'],
                max_answer_len=100
            )
        except Exception:
            continue  # skip chunks the model cannot handle

        if result['score'] > max_score:
            max_score = result['score']
            best_answer = {
                'answer': result['answer'],
                'confidence': result['score'],
                'page': chunk['page'],
                'paragraph': chunk['paragraph'],
                'context': chunk['context'],
            }

        if result['score'] > relevance_threshold:
            relevant_context.append(chunk['context'])

    return best_answer, relevant_context[:max_contexts]
|
|
|
def main():
    """Streamlit entry point: upload a PDF, then ask questions in a chat UI.

    Session state keys:
        messages      -- chat transcript; user entries have 'role'/'content',
                         assistant entries additionally carry 'metadata'.
        pdf_data      -- paragraph chunks from process_pdf(), or None until
                         a PDF has been processed.
        last_question -- most recently processed question; guards against
                         re-processing on reruns (see below).
    """
    # NOTE(review): "π" in the title looks like a garbled emoji — confirm.
    st.title("π Advanced PDF Question Answering")

    # Initialise session state on first run.
    if 'messages' not in st.session_state:
        st.session_state.messages = []
    if 'pdf_data' not in st.session_state:
        st.session_state.pdf_data = None

    try:
        qa_model = load_qa_model()
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return

    pdf_file = st.file_uploader("Upload PDF Document", type=['pdf'])

    # Process the upload exactly once. Checking `is None` (not truthiness)
    # means a PDF that yields zero chunks is not re-processed on every rerun.
    if pdf_file and st.session_state.pdf_data is None:
        with st.spinner("Processing PDF..."):
            try:
                st.session_state.pdf_data = process_pdf(pdf_file)
                st.success("PDF processed successfully! You can now ask questions.")
            except Exception as e:
                st.error(f"Error processing PDF: {str(e)}")
                return

    # Render the chat transcript.
    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
    for message in st.session_state.messages:
        if message["role"] == "user":
            st.markdown(f"""
                <div class="message-container user-message">
                    {message["content"]}
                </div>
            """, unsafe_allow_html=True)
        else:
            st.markdown(f"""
                <div class="message-container assistant-message">
                    <div>{message["content"]}</div>
                    <div class="source-info">
                        Source: Page {message["metadata"]["page"]},
                        Paragraph {message["metadata"]["paragraph"]}
                        (Confidence: {message["metadata"]["confidence"]:.1%})
                    </div>
                    <div class="context-box">
                        {message["metadata"]["context"]}
                    </div>
                </div>
            """, unsafe_allow_html=True)
    st.markdown('</div>', unsafe_allow_html=True)

    if st.session_state.pdf_data:
        question = st.text_input("Ask a question about the document:", key="question_input")

        # BUG FIX: st.text_input keeps its value across reruns, so after
        # st.rerun() the same question used to be appended and answered
        # again on every pass — an infinite rerun loop. Only process a
        # question we have not already answered.
        if question and question != st.session_state.get("last_question"):
            st.session_state.last_question = question
            st.session_state.messages.append({"role": "user", "content": question})

            with st.spinner("Finding answer..."):
                answer, relevant_contexts = find_best_answer(
                    question,
                    st.session_state.pdf_data,
                    qa_model
                )

            if answer:
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": answer["answer"],
                    "metadata": {
                        "page": answer["page"],
                        "paragraph": answer["paragraph"],
                        "confidence": answer["confidence"],
                        "context": answer["context"]
                    }
                })
                # Rerun so the new messages render in the transcript above.
                st.rerun()
            else:
                st.error("Sorry, I couldn't find a relevant answer in the document.")
    else:
        st.markdown("""
        ### Instructions:
        1. Upload a PDF document using the file uploader above
        2. Wait for the document to be processed
        3. Start asking questions about the content
        4. Get detailed answers with source information and context

        ### Features:
        - Natural conversation interface
        - Source tracking with page numbers
        - Confidence scores
        - Relevant context display
        - Multiple question support
        """)
|
|
|
# Standard script entry guard: run the app only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()