Spaces:

Shankarm08
/

pdfreader

Sleeping

App Files Files Community

Shankarm08 committed on Oct 5, 2024

Commit

bc28c5c

verified ·

1 Parent(s): 33015fa

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -31

app.py CHANGED Viewed

@@ -3,22 +3,13 @@ import torch
 from transformers import BertTokenizer, BertModel
 import pdfplumber
-# Load the pre-trained BERT model and tokenizer outside the function for efficiency
 model_name = "bert-base-uncased"
 tokenizer = BertTokenizer.from_pretrained(model_name)
 model = BertModel.from_pretrained(model_name)
-# Define a function to extract text from a PDF
-def extract_text_from_pdf(pdf_file):
-    with pdfplumber.open(pdf_file) as pdf:
-        text = ""
-        for page in pdf.pages:
-            text += page.extract_text()
-    return text
-# Define a function to classify the extracted text
-def classify_text(text):
-    # Preprocess the input text
     inputs = tokenizer.encode_plus(
         text,
         add_special_tokens=True,
@@ -26,29 +17,44 @@ def classify_text(text):
         return_attention_mask=True,
         return_tensors='pt'
     )
-    # Use the pre-trained BERT model to extract features from the input text
     outputs = model(**inputs)
-    # Extract the features
-    features = outputs.last_hidden_state[:, 0, :]
-    return features.tolist()
-# Streamlit app setup
-st.title("PDF Text Classification")
-st.write("Upload a PDF file to classify its text using BERT")
-# File uploader for PDFs
-pdf_file = st.file_uploader("Choose a PDF file", type="pdf")
-if pdf_file is not None:
-    # Extract text from the uploaded PDF
-    extracted_text = extract_text_from_pdf(pdf_file)
-    st.write("Extracted Text:")
-    st.write(extracted_text)
-    # Classify the extracted text
-    if st.button("Classify"):
-        features = classify_text(extracted_text)
-        st.json({"features": features})  # Display the features in JSON format

 from transformers import BertTokenizer, BertModel
 import pdfplumber
+# Load the pre-trained BERT model and tokenizer once
 model_name = "bert-base-uncased"
 tokenizer = BertTokenizer.from_pretrained(model_name)
 model = BertModel.from_pretrained(model_name)
+# Function to get BERT embeddings
+def get_embeddings(text):
+    """Return a BERT embedding for ``text`` as a NumPy array.
+
+    The text is encoded with the module-level ``tokenizer`` (special tokens
+    added, attention mask requested, PyTorch tensors returned) and passed
+    through the module-level ``model``.
+
+    Args:
+        text: The string to embed.
+
+    Returns:
+        ``outputs.last_hidden_state[:, 0, :]`` detached from the autograd
+        graph and converted with ``.numpy()``, i.e. the hidden state at
+        sequence position 0 for each row of the batch.
+    """
+    # NOTE(review): this diff view appears to elide one line of the
+    # encode_plus(...) call at the hunk boundary; no truncation argument is
+    # visible here, so very long inputs may exceed the model's maximum
+    # sequence length -- confirm against the full file.
     inputs = tokenizer.encode_plus(
         text,
         add_special_tokens=True,
         return_attention_mask=True,
         return_tensors='pt'
     )
     outputs = model(**inputs)
+    # Position 0 on the sequence axis is the first token of the encoded input
+    # (presumably [CLS], since special tokens are added -- verify).
+    return outputs.last_hidden_state[:, 0, :].detach().numpy()
+# Extract text from PDF
+def extract_text_from_pdf(pdf_file):
+    """Return the concatenated text of every page in a PDF.
+
+    Args:
+        pdf_file: Whatever ``pdfplumber.open`` accepts; the app passes the
+            object returned by ``st.file_uploader``.
+
+    Returns:
+        The text of each page followed by a newline, joined in page order.
+        A page with no extractable text contributes only its newline.
+    """
+    with pdfplumber.open(pdf_file) as pdf:
+        # extract_text() may give None (some pdfplumber versions) or "" for a
+        # page without a text layer, e.g. a scanned image; coalesce to "" so
+        # the concatenation below cannot raise TypeError.
+        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
+    # Join once instead of growing a string with repeated +=.
+    return "".join(page_texts)
+# Per-run state: the extracted PDF text and its embedding.
+# pdf_embeddings stays None until a PDF with usable text has been processed.
+pdf_text = ""
+pdf_embeddings = None
+# Streamlit app
+st.title("PDF Chatbot using BERT")
+# PDF file upload
+pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
+if pdf_file is not None:
+    pdf_text = extract_text_from_pdf(pdf_file)
+    # extract_text_from_pdf appends a newline per page, so a PDF without any
+    # text layer still yields a non-empty string; test the stripped text so
+    # such a file is not embedded and reported as loaded.
+    if pdf_text.strip():
+        pdf_embeddings = get_embeddings(pdf_text)
+        st.success("PDF loaded successfully!")
+    else:
+        st.warning("No extractable text was found in this PDF.")
+# User input for chatbot
+user_input = st.text_input("Ask a question about the PDF:")
+if st.button("Get Response"):
+    if pdf_file is None:
+        st.warning("Please upload a PDF file first.")
+    elif pdf_embeddings is None:
+        # A file was uploaded but produced no usable text.
+        st.warning("Cannot answer: the uploaded PDF has no extractable text.")
+    elif not user_input.strip():
+        # Avoid running the model on an empty question.
+        st.warning("Please enter a question.")
+    else:
+        # Get embeddings for user input.
+        # NOTE: currently unused; kept as the input for the similarity
+        # matching that is still to be implemented below.
+        user_embeddings = get_embeddings(user_input)
+        # For demonstration, simply return the PDF text.
+        # Implement similarity matching logic here as needed.
+        st.write("### Response:")
+        st.write(pdf_text)  # For simplicity, returning all text