Spaces:

Shankarm08
/

pdfreader

Sleeping

App Files Files Community

pdfreader / app.py

Shankarm08

Update app.py

bc28c5c verified 9 months ago

raw

history blame

1.75 kB

	import hashlib

	import pdfplumber
	import streamlit as st
	import torch
	from transformers import BertTokenizer, BertModel

	# Load the pre-trained BERT tokenizer and model.
	# Streamlit re-executes this script from the top on every widget interaction,
	# so the loader is wrapped in st.cache_resource: the weights are instantiated
	# once per server process and reused by later reruns and sessions.
	model_name = "bert-base-uncased"


	@st.cache_resource
	def _load_bert(name):
	    """Return the ``(tokenizer, model)`` pair for the checkpoint *name*."""
	    return BertTokenizer.from_pretrained(name), BertModel.from_pretrained(name)


	tokenizer, model = _load_bert(model_name)

	def get_embeddings(text):
	    """Return the BERT embedding of *text* as a NumPy array.

	    The text is tokenized with the module-level ``tokenizer`` and passed
	    through the module-level ``model``. The hidden state at sequence
	    position 0 (the leading special token) is used as the embedding.

	    Args:
	        text: The string to embed. Input longer than 512 tokens is truncated.

	    Returns:
	        ``last_hidden_state[:, 0, :]`` converted to a NumPy array.
	    """
	    inputs = tokenizer.encode_plus(
	        text,
	        add_special_tokens=True,
	        max_length=512,
	        # max_length alone does not shorten the input. Without truncation a
	        # full PDF's worth of tokens exceeds BERT's 512-position limit and
	        # the forward pass fails.
	        truncation=True,
	        return_attention_mask=True,
	        return_tensors='pt',
	    )
	    # Inference only: skip autograd bookkeeping, which also makes an explicit
	    # detach() before numpy() unnecessary.
	    with torch.no_grad():
	        outputs = model(**inputs)
	    return outputs.last_hidden_state[:, 0, :].numpy()

	def extract_text_from_pdf(pdf_file):
	    """Return the text of every page in *pdf_file*, one newline after each page.

	    Args:
	        pdf_file: Whatever ``pdfplumber.open`` accepts; this app passes the
	            object returned by ``st.file_uploader``.

	    Returns:
	        A single string with each page's extracted text followed by a
	        newline. A page that yields no text contributes only the newline.
	    """
	    with pdfplumber.open(pdf_file) as pdf:
	        # extract_text() may return None for a page without a text layer
	        # (e.g. a scanned image); fall back to "" so the concatenation
	        # cannot raise TypeError.
	        return "".join(f"{page.extract_text() or ''}\n" for page in pdf.pages)

	# Defaults used when no PDF has been uploaded in the current run.
	pdf_text = ""
	pdf_embeddings = None

	# Streamlit app
	st.title("PDF Chatbot using BERT")

	# PDF file upload
	pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])

	if pdf_file is not None:
	    # Streamlit reruns this whole script on every interaction (submitting a
	    # question, clicking the button). Cache the parsed text and embeddings in
	    # session_state, keyed by a hash of the file's bytes, so a given upload is
	    # parsed and embedded once instead of on every rerun.
	    pdf_digest = hashlib.sha256(pdf_file.getvalue()).hexdigest()
	    if st.session_state.get("pdf_digest") != pdf_digest:
	        extracted = extract_text_from_pdf(pdf_file)
	        st.session_state["pdf_text"] = extracted
	        st.session_state["pdf_embeddings"] = get_embeddings(extracted)
	        # Stored last so a failed parse/embed is retried on the next rerun.
	        st.session_state["pdf_digest"] = pdf_digest
	    pdf_text = st.session_state["pdf_text"]
	    pdf_embeddings = st.session_state["pdf_embeddings"]
	    st.success("PDF loaded successfully!")

	# User input for chatbot
	user_input = st.text_input("Ask a question about the PDF:")

	if st.button("Get Response"):
	    if not pdf_text:
	        st.warning("Please upload a PDF file first.")
	    else:
	        # Get embeddings for user input (not used yet; kept as the hook for
	        # the similarity matching described below).
	        user_embeddings = get_embeddings(user_input)

	        # For demonstration, simply return the PDF text.
	        # Implement similarity matching logic here as needed.
	        st.write("### Response:")
	        st.write(pdf_text)  # For simplicity, returning all text