# Spaces: Sleeping
import streamlit as st | |
import torch | |
from transformers import BertTokenizer, BertModel | |
import pdfplumber | |
# Load the pre-trained BERT model and tokenizer once.
# Streamlit re-executes this script on every widget interaction, so the
# loading goes through a cached factory; otherwise the expensive
# from_pretrained() calls would be repeated on each rerun.
model_name = "bert-base-uncased"


@st.cache_resource
def _load_bert(name):
    """Return the ``(tokenizer, model)`` pair for the checkpoint *name*.

    The pair is cached by Streamlit, so later script runs reuse the same
    objects instead of loading the weights again.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(name)
    bert_model = BertModel.from_pretrained(name)
    # The model is only used for inference; make evaluation mode explicit.
    bert_model.eval()
    return bert_tokenizer, bert_model


tokenizer, model = _load_bert(model_name)
# Function to get BERT embeddings
def get_embeddings(text):
    """Return the embedding of the first ([CLS]) token of *text*.

    Args:
        text: The string to embed. It is tokenized with special tokens
            added and explicitly truncated to at most 512 tokens, so any
            content beyond that limit does not influence the result.

    Returns:
        A NumPy array holding ``last_hidden_state[:, 0, :]``, i.e. the
        final hidden state at position 0 for each encoded sequence.
    """
    # Calling the tokenizer directly is the supported replacement for the
    # deprecated encode_plus(); truncation=True makes the max_length limit
    # explicit instead of relying on a warned fallback.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()
# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file* as a single string.

    Args:
        pdf_file: A path or file-like object passed straight to
            ``pdfplumber.open``.

    Returns:
        The page texts concatenated in page order, each followed by a
        newline for separation. A page that yields no text contributes
        only its newline.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() can yield None for a page without extractable text
        # (e.g. image-only pages on some pdfplumber releases); fall back to
        # an empty string so the concatenation never raises TypeError.
        return "".join(
            f"{page.extract_text() or ''}\n" for page in pdf.pages
        )
# Store the PDF text and embeddings
pdf_text = ""
pdf_embeddings = None

# Streamlit app
st.title("PDF Chatbot using BERT")

# PDF file upload
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if pdf_file is not None:
    pdf_text = extract_text_from_pdf(pdf_file)
    pdf_embeddings = get_embeddings(pdf_text)
    st.success("PDF loaded successfully!")

# User input for chatbot
user_input = st.text_input("Ask a question about the PDF:")
if st.button("Get Response"):
    if pdf_file is None:
        # Key the warning on the upload itself, not on the extracted text,
        # so an uploaded-but-empty PDF is not reported as "not uploaded".
        st.warning("Please upload a PDF file first.")
    elif not pdf_text.strip():
        # The file was read but produced only whitespace; say so instead of
        # rendering an empty response.
        st.warning("No extractable text was found in the uploaded PDF.")
    else:
        # Get embeddings for user input.
        # NOTE: not used yet; kept as the hook for similarity matching.
        user_embeddings = get_embeddings(user_input)
        # For demonstration, simply return the PDF text.
        # Implement similarity matching logic here as needed.
        st.write("### Response:")
        st.write(pdf_text)  # For simplicity, returning all text