pdfreader / app.py
Shankarm08's picture
Update app.py
bc28c5c verified
raw
history blame
1.75 kB
import streamlit as st
import torch
from transformers import BertTokenizer, BertModel
import pdfplumber
# Load the pre-trained BERT model and tokenizer once
model_name = "bert-base-uncased"

# Streamlit re-executes this script from top to bottom on every widget
# interaction, so a bare from_pretrained() call here would run again on each
# rerun. Caching the loader makes "once" actually hold. Streamlit releases
# without st.cache_resource fall back to an uncached call (the old behaviour).
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_bert(name):
    """Return the ``(tokenizer, model)`` pair for the checkpoint ``name``."""
    return BertTokenizer.from_pretrained(name), BertModel.from_pretrained(name)


tokenizer, model = _load_bert(model_name)
# Function to get BERT embeddings
def get_embeddings(text):
    """Return the BERT embedding of the first token position for ``text``.

    The text is tokenized with the module-level ``tokenizer`` and passed
    through the module-level ``model``. Input longer than 512 tokens is
    truncated; without truncation the model raises on sequences that exceed
    its maximum position index.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array taken from ``last_hidden_state[:, 0, :]``, i.e. one row
        per input sequence (a single row here) holding the hidden state of
        the first token.
    """
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,  # enforce max_length instead of only warning
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()
# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of all pages in ``pdf_file``.

    Each page's text is followed by a newline so that pages stay separated.
    A page that yields no text contributes only that newline.

    Args:
        pdf_file: A path or file-like object that ``pdfplumber.open`` accepts
            (the app passes the object returned by ``st.file_uploader``).

    Returns:
        The extracted text as a single string; empty if the PDF has no pages.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() can return None for a page without a text layer
        # (observed in some pdfplumber releases); treat that as empty text
        # rather than failing on None + "\n".
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )
# Store the PDF text and embeddings
pdf_text = ""
pdf_embeddings = None

# Streamlit app
st.title("PDF Chatbot using BERT")

# PDF file upload
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if pdf_file is not None:
    pdf_text = extract_text_from_pdf(pdf_file)
    if pdf_text.strip():
        # Not consumed yet; kept for the similarity matching noted below.
        pdf_embeddings = get_embeddings(pdf_text)
        st.success("PDF loaded successfully!")
    else:
        # Extraction produced only page-separator newlines (for example a
        # scanned PDF): do not report success for a document with no text.
        pdf_text = ""
        st.warning("No extractable text was found in this PDF.")

# User input for chatbot
user_input = st.text_input("Ask a question about the PDF:")
if st.button("Get Response"):
    if pdf_file is None:
        st.warning("Please upload a PDF file first.")
    elif not pdf_text:
        st.warning("The uploaded PDF contains no text to answer from.")
    elif not user_input.strip():
        st.warning("Please enter a question.")
    else:
        # Get embeddings for user input
        user_embeddings = get_embeddings(user_input)
        # For demonstration, simply return the PDF text.
        # Implement similarity matching logic here as needed.
        st.write("### Response:")
        st.write(pdf_text)  # For simplicity, returning all text