# PDF Question-Answer chatbot — Streamlit app (deployed as a Hugging Face Space).
import fitz  # PyMuPDF
import streamlit as st
import torch
from transformers import RagRetriever, RagSequenceForGeneration, RagTokenizer
# Load the RAG model components once; st.cache_resource keeps them alive
# across Streamlit reruns instead of re-instantiating the model per widget
# interaction.
@st.cache_resource
def _load_rag_components():
    """Return (tokenizer, retriever, model) for facebook/rag-sequence-nq."""
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
    # NOTE(review): index_name="exact" + use_dummy_dataset avoids downloading
    # the full wiki_dpr index (tens of GB) — confirm this matches deployment
    # intent.
    retriever = RagRetriever.from_pretrained(
        "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
    )
    # Bug fix: the retriever was previously loaded but never attached to the
    # model, so generation ran without its retrieval component.
    model = RagSequenceForGeneration.from_pretrained(
        "facebook/rag-sequence-nq", retriever=retriever
    )
    model.eval()  # inference only
    return tokenizer, retriever, model


tokenizer, retriever, model = _load_rag_components()
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: a Streamlit ``UploadedFile`` (file-like object) or a
            filesystem path.

    Returns:
        The concatenated text of all pages as a single string.
    """
    # Bug fix: st.file_uploader yields a file-like object, not a path, and
    # fitz.open(<UploadedFile>) raises. Open from the raw bytes instead;
    # keep the path branch for plain string/Path callers.
    if hasattr(pdf_file, "read"):
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        doc = fitz.open(pdf_file)
    try:
        # join() avoids quadratic += concatenation on large documents.
        return "".join(page.get_text() for page in doc)
    finally:
        # Bug fix: the document was never closed (leaked file/buffer handle).
        doc.close()
# Function to handle question answering
def answer_question(question, pdf_text):
    """Generate an answer to *question*, conditioning on *pdf_text*.

    Args:
        question: natural-language question string.
        pdf_text: full extracted text of the uploaded PDF, used as the
            generation context.

    Returns:
        The decoded answer string.
    """
    # Prepare the question input (truncated to the encoder's maximum length).
    inputs = tokenizer(question, return_tensors="pt", truncation=True)
    # Tokenize the PDF text as the context. Bug fix: without truncation any
    # non-trivial PDF exceeds the model's positional limit and generation
    # raises at runtime.
    pdf_inputs = tokenizer(
        pdf_text, return_tensors="pt", truncation=True, max_length=512
    )
    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            context_input_ids=pdf_inputs["input_ids"],
            context_attention_mask=pdf_inputs["attention_mask"],
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# ---- Streamlit UI ----
st.title("PDF Question-Answer Chatbot")
st.write("Upload a PDF file and ask questions based on its content.")

# Upload widget: restricted to PDF files.
uploaded_pdf = st.file_uploader("Upload PDF", type=["pdf"])

if uploaded_pdf is not None:
    # Pull the full text out of the uploaded document.
    pdf_text = extract_text_from_pdf(uploaded_pdf)
    st.success("PDF loaded successfully!")

    # Free-text question box; empty string is falsy, so nothing runs until
    # the user actually types a question.
    question = st.text_input("Ask a question:")
    if question:
        with st.spinner("Finding answer..."):
            try:
                answer = answer_question(question, pdf_text)
                st.write("### Answer:")
                st.write(answer)
            except Exception as e:
                # Surface the failure in the UI rather than crashing the app.
                st.error(f"Error occurred: {str(e)}")