import streamlit as st
import torch
from transformers import BertTokenizer, BertModel
import pdfplumber
# Load the pre-trained BERT model and tokenizer once
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
# Function to get BERT embeddings
def get_embeddings(text):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,  # BERT accepts at most 512 tokens, so truncate longer inputs
        return_attention_mask=True,
        return_tensors='pt'
    )
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)
    # Use the [CLS] token embedding as a fixed-size representation of the text
    return outputs.last_hidden_state[:, 0, :].numpy()
# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        text = ""
        for page in pdf.pages:
            # extract_text() may return None for pages without extractable text
            text += (page.extract_text() or "") + "\n"  # Add newline for better separation
    return text
# Store the PDF text and embeddings
pdf_text = ""
pdf_embeddings = None
# Streamlit app
st.title("PDF Chatbot using BERT")
# PDF file upload
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if pdf_file:
    pdf_text = extract_text_from_pdf(pdf_file)
    pdf_embeddings = get_embeddings(pdf_text)
    st.success("PDF loaded successfully!")
# User input for chatbot
user_input = st.text_input("Ask a question about the PDF:")
if st.button("Get Response"):
    if pdf_text == "":
        st.warning("Please upload a PDF file first.")
    else:
        # Get embeddings for user input
        user_embeddings = get_embeddings(user_input)
        # For demonstration, simply return the PDF text.
        # Implement similarity matching logic here as needed
        # (see the illustrative sketch at the end of this file).
        st.write("### Response:")
        st.write(pdf_text)  # For simplicity, returning all text
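# ---------------------------------------------------------------------------
# Illustrative sketch only: one possible way to fill in the similarity
# matching step referenced above. This is an assumption about how retrieval
# could work, not part of the original app. It splits the PDF text into
# chunks, embeds each chunk with get_embeddings(), and ranks chunks by cosine
# similarity to the question embedding. The helper names below
# (split_into_chunks, find_most_similar_chunk) are hypothetical.
# ---------------------------------------------------------------------------
import numpy as np

def split_into_chunks(text, max_chars=1000):
    # Naive chunking on line breaks, capped at roughly max_chars per chunk
    chunks, current = [], ""
    for paragraph in text.split("\n"):
        if len(current) + len(paragraph) > max_chars and current:
            chunks.append(current)
            current = ""
        current += paragraph + "\n"
    if current.strip():
        chunks.append(current)
    return chunks

def find_most_similar_chunk(question_embedding, chunks):
    # Embed each chunk and score it by cosine similarity to the question
    best_chunk, best_score = "", -1.0
    q = question_embedding[0]
    for chunk in chunks:
        c = get_embeddings(chunk)[0]
        score = np.dot(q, c) / (np.linalg.norm(q) * np.linalg.norm(c) + 1e-8)
        if score > best_score:
            best_chunk, best_score = chunk, score
    return best_chunk

# In the "Get Response" branch above, st.write(pdf_text) could then be replaced with
# st.write(find_most_similar_chunk(user_embeddings, split_into_chunks(pdf_text))).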