"""Streamlit chatbot that answers questions about an uploaded PDF using a RAG model."""

import PyPDF2
import streamlit as st
import torch
from transformers import RagRetriever, RagTokenForGeneration, RagTokenizer


def load_pdf(uploaded_file):
    """Extract and concatenate the text of every page in an uploaded PDF.

    Args:
        uploaded_file: A file-like object (Streamlit's UploadedFile) containing a PDF.

    Returns:
        The extracted text, one newline appended after each page that yielded text.
    """
    reader = PyPDF2.PdfReader(uploaded_file)
    pages = []
    for page in reader.pages:
        # Extract once per page — the original called extract_text() twice
        # (once in the condition, once in the append), doubling the work.
        page_text = page.extract_text()
        if page_text:  # Skip pages with no extractable text (e.g. scanned images)
            pages.append(page_text)
    # join() avoids the quadratic cost of repeated string +=.
    return "\n".join(pages) + "\n" if pages else ""


@st.cache_resource
def initialize_rag_model():
    """Load the RAG tokenizer, retriever, and model.

    Cached with @st.cache_resource so the (large) model is loaded once per
    server process instead of on every Streamlit rerun triggered by widget
    interaction.

    Returns:
        A (tokenizer, retriever, model) tuple for "facebook/rag-token-nq".
    """
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
    retriever = RagRetriever.from_pretrained(
        "facebook/rag-token-nq", index_name="legacy", use_dummy_dataset=True
    )
    model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")
    return tokenizer, retriever, model


def generate_answer(query, context, tokenizer, retriever, model):
    """Generate an answer to *query* grounded on *context* (the PDF text).

    Bug fix: the original called ``retriever(context, return_tensors="pt")``,
    which is not a valid RagRetriever invocation (it takes question ids /
    hidden states, not raw text) and raised at runtime; it also passed
    ``context_input_ids`` to ``generate`` without the required
    ``context_attention_mask`` and ``doc_scores``. Here we bypass retrieval
    and feed the PDF text directly as the single "retrieved" document, which
    matches the app's intent of answering from the uploaded PDF.

    Args:
        query: The user's question.
        context: Text extracted from the PDF to ground the answer on.
        tokenizer: RagTokenizer (wraps a question-encoder and a generator tokenizer).
        retriever: Kept for interface compatibility; unused since the PDF text
            itself serves as the retrieved document.
        model: RagTokenForGeneration instance.

    Returns:
        The generated answer string.
    """
    # Encode the question with the question-encoder side of the tokenizer.
    question = tokenizer.question_encoder(query, return_tensors="pt")
    # RAG formats each retrieved document for the generator as
    # "<doc text> // <question>"; truncate so long PDFs fit the generator input.
    doc = tokenizer.generator(
        context + " // " + query,
        return_tensors="pt",
        truncation=True,
        max_length=300,
    )
    outputs = model.generate(
        input_ids=question["input_ids"],
        context_input_ids=doc["input_ids"],
        context_attention_mask=doc["attention_mask"],
        doc_scores=torch.ones(1, 1),  # single document, uniform score
        n_docs=1,
    )
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return answer[0]


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.title("PDF Question-Answer Chatbot")

uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    text = load_pdf(uploaded_file)
    st.write("PDF loaded successfully. You can now ask questions.")

    # Cached: cheap on reruns after the first load.
    tokenizer, retriever, model = initialize_rag_model()

    user_query = st.text_input("Ask a question about the PDF:")
    if user_query:
        answer = generate_answer(user_query, text, tokenizer, retriever, model)
        st.write(f"Answer: {answer}")  # Display the answer