import streamlit as st
import PyPDF2
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

# Load PDF and extract text
def load_pdf(uploaded_file):
    reader = PyPDF2.PdfReader(uploaded_file)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:  # Skip pages where no text could be extracted (e.g. scanned images)
            text += page_text + "\n"
    return text
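
# NOTE: Very long PDFs can produce more text than a model can consume at once.
# A minimal sketch of a helper for splitting the extracted text into word-based
# chunks; the 200-word chunk size is an illustrative assumption, not part of the
# original app.
def chunk_text(text, chunk_size=200):
    """Split text into chunks of roughly `chunk_size` words."""
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]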

# Initialize RAG model
def initialize_rag_model():
    # Load the tokenizer
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
    # Use the small dummy index for testing purposes; a real deployment would
    # index the documents it needs to search
    retriever = RagRetriever.from_pretrained(
        "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
    )
    # Attach the retriever so that model.generate() can fetch supporting passages
    model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
    return tokenizer, retriever, model
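
# NOTE: Streamlit reruns the whole script on every interaction, so reloading the
# model for each question is slow. A minimal sketch of caching the loaded objects
# across reruns, assuming Streamlit >= 1.18 (which provides st.cache_resource):
@st.cache_resource
def get_rag_model():
    # Loaded once per process and reused on subsequent reruns
    return initialize_rag_model()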

# Process user query
def generate_answer(query, context, tokenizer, retriever, model):
    # Tokenize the input question for the question encoder
    inputs = tokenizer(query, return_tensors="pt")
    # Generate the answer; because the retriever is attached to the model,
    # generate() retrieves supporting passages from the retriever's index.
    # Note: the dummy index does not contain the uploaded PDF, so `context`
    # is not used here; building an index over the PDF text is a further step.
    outputs = model.generate(input_ids=inputs["input_ids"])
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return answer[0]
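
# NOTE: As a simpler alternative that does answer from the uploaded PDF (a sketch,
# not the approach this app takes), an extractive question-answering pipeline can
# pick the answer span directly out of the extracted text. The model name below is
# an illustrative assumption.
def generate_answer_extractive(query, context):
    from transformers import pipeline
    # Load an extractive QA model lazily so it is only downloaded if this path is used
    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
    # The pipeline returns the most likely answer span found inside `context`
    return qa_pipeline(question=query, context=context)["answer"]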

# Streamlit UI
st.title("PDF Question-Answer Chatbot")

uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    text = load_pdf(uploaded_file)
    st.write("PDF loaded successfully. You can now ask questions.")

    # Initialize the RAG model
    tokenizer, retriever, model = initialize_rag_model()

    user_query = st.text_input("Ask a question about the PDF:")
    if user_query:
        answer = generate_answer(user_query, text, tokenizer, retriever, model)
        st.write(f"Answer: {answer}")  # Display the answer