Spaces:

tarrasyed19472007
/

Ragpdfbot

Sleeping

App Files Files Community

tarrasyed19472007 committed on Oct 28, 2024

Commit

42feee4

verified ·

1 Parent(s): 9ed2bab

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -39

app.py CHANGED Viewed

@@ -1,52 +1,66 @@
 import streamlit as st
-import PyPDF2
-import numpy as np
 from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
 import faiss
-# Function to load the PDF file and extract text
-def load_pdf(file):
-    reader = PyPDF2.PdfReader(file)
     text = ""
-    for page in reader.pages:
-        text += page.extract_text()
     return text
-# Initialize the RAG model
-def initialize_rag_model():
-    # Load the tokenizer
-    tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
-    # Load the retriever
-    retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="legacy", use_dummy_dataset=True)
-    # Load the model
-    model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq")
-    return tokenizer, retriever, model
-# Function to generate answers
-def generate_answer(question, text, tokenizer, retriever, model):
-    inputs = tokenizer([question], return_tensors="pt")
-    # Here we are creating a dummy dataset from the PDF text
-    input_ids = tokenizer(text, return_tensors="pt").input_ids
-    # Get the retrieved documents
-    doc_scores, retrieved_doc_indices = retriever(input_ids, return_tensors="pt").values()
-    # Generate answer
-    outputs = model.generate(inputs["input_ids"], doc_scores=doc_scores)
-    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
     return answer
-# Streamlit app layout
-st.title("PDF Question-Answering Chatbot")
-uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
-if uploaded_file is not None:
-    text = load_pdf(uploaded_file)
-    st.text_area("Extracted Text", text, height=300)
-    question = st.text_input("Ask a question about the content:")
-    if st.button("Get Answer"):
-        if question:
-            try:
-                tokenizer, retriever, model = initialize_rag_model()
-                answer = ge

 import streamlit as st
+import fitz  # PyMuPDF
 from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
+import numpy as np
 import faiss
+import torch
+# Load the RAG model components once per server process. Streamlit re-runs
+# this script from top to bottom on every widget interaction, so without
+# caching the three from_pretrained() calls would be repeated on each rerun.
+@st.cache_resource
+def _load_rag_components():
+    """Load the RAG tokenizer, retriever and generator.
+
+    Returns:
+        A ``(tokenizer, retriever, model)`` tuple, each loaded from the
+        "facebook/rag-sequence-nq" checkpoint.
+    """
+    rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
+    # NOTE(review): no index_name / use_dummy_dataset argument is passed, so
+    # the checkpoint's default retrieval index is used - confirm that this is
+    # intended and fits the resources of the deployment.
+    rag_retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq")
+    rag_model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq")
+    return rag_tokenizer, rag_retriever, rag_model
+# Keep the original module-level names so the rest of the script is unchanged.
+tokenizer, retriever, model = _load_rag_components()
+# Function to extract text from PDF
+def extract_text_from_pdf(pdf_file):
+    """Return the concatenated text of every page of a PDF.
+
+    Args:
+        pdf_file: Either a filesystem path, or a file-like object such as the
+            value returned by ``st.file_uploader``.
+
+    Returns:
+        The text of all pages joined in page order ("" if no page has text).
+    """
+    # An uploaded file is an in-memory buffer rather than a path on disk, so
+    # its bytes are handed to PyMuPDF through ``stream=`` with an explicit
+    # filetype instead of being passed as a filename.
+    if hasattr(pdf_file, "getvalue"):
+        # getvalue() returns the whole buffer regardless of the current read
+        # position, so calling this again on the same upload keeps working.
+        doc = fitz.open(stream=pdf_file.getvalue(), filetype="pdf")
+    elif hasattr(pdf_file, "read"):
+        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+    else:
+        doc = fitz.open(pdf_file)
+    # The context manager closes the document even if a page fails to parse.
+    with doc:
+        return "".join(page.get_text() for page in doc)
+# Function to handle question answering
+def answer_question(question, pdf_text):
+    """Answer ``question`` using passages retrieved for ``pdf_text``.
+
+    Uses the module-level ``tokenizer``, ``retriever`` and ``model`` objects.
+
+    Args:
+        question: The question to answer.
+        pdf_text: Text of the PDF that the answer should be based on.
+
+    Returns:
+        The first generated sequence, decoded with special tokens skipped.
+    """
+    # Tokenize the question
+    # NOTE(review): `inputs` is not read again anywhere in this function.
+    inputs = tokenizer(question, return_tensors="pt")
+    # Retrieve documents based on the PDF text
+    # NOTE(review): confirm that the installed RagRetriever really provides
+    # get_document_embeddings() and set_retriever_doc_embeddings() - TODO
+    # verify against the transformers API reference.
+    doc_embeds = retriever.get_document_embeddings(pdf_text)
+    retriever.set_retriever_doc_embeddings(doc_embeds)
+    # Get the top k documents for the question
+    k = 5
+    # NOTE(review): the retriever is called with the raw question string;
+    # verify that its __call__ accepts text rather than encoded inputs.
+    retrieved_docs = retriever(question, n_docs=k)
+    # Prepare the context for the model
+    # NOTE(review): assumes the retriever result has a "document_texts" entry
+    # holding a list of strings - TODO confirm.
+    context = retrieved_docs["document_texts"]
+    context = " ".join(context)
+    # Generate the answer
+    # NOTE(review): `context` is passed as the second positional argument of
+    # prepare_seq2seq_batch; check what that parameter means in the tokenizer.
+    input_dict = tokenizer.prepare_seq2seq_batch(question, context, return_tensors="pt")
+    outputs = model.generate(**input_dict)
+    # Only the first generated sequence is decoded and returned.
+    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return answer
+# Streamlit app: upload a PDF, then answer questions about its text.
+st.title("PDF Question-Answer Chatbot")
+st.write("Upload a PDF file and ask questions based on its content.")
+# File uploader
+pdf_file = st.file_uploader("Upload PDF", type=["pdf"])
+if pdf_file is not None:
+    # Extract text from the PDF
+    pdf_text = extract_text_from_pdf(pdf_file)
+    if not pdf_text.strip():
+        # Nothing to answer from (e.g. a PDF without a text layer); report it
+        # instead of announcing success and querying the model with no text.
+        st.warning("No extractable text was found in this PDF.")
+    else:
+        st.success("PDF loaded successfully!")
+        # Question input
+        question = st.text_input("Ask a question:")
+        # Whitespace-only input is treated the same as no question at all.
+        if question.strip():
+            with st.spinner("Finding answer..."):
+                answer = answer_question(question, pdf_text)
+                st.write("### Answer:")
+                st.write(answer)