Spaces:

Shankarm08
/

pdfreader

Sleeping

File size: 1,750 Bytes

c5608b5
7340eaf
 
ceb87d2
7340eaf
bc28c5c
c5608b5
 
 
7340eaf
bc28c5c
 
7340eaf
c5608b5
7340eaf
 
 
 
 
 
bc28c5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7340eaf
bc28c5c
 
7340eaf
bc28c5c
 
 
 
ceb87d2
bc28c5c
 
c5608b5
bc28c5c
 
 
 
 
 
ceb87d2
bc28c5c
 
 
 
7340eaf

import streamlit as st
import torch
from transformers import BertTokenizer, BertModel
import pdfplumber

# Load the pre-trained BERT model and tokenizer once.
# Streamlit re-executes this script from the top on every user interaction,
# so plain module-level loading would reload the weights on each rerun;
# st.cache_resource keeps a single shared instance alive across reruns.
model_name = "bert-base-uncased"


@st.cache_resource
def _load_bert(name):
    """Return the ``(tokenizer, model)`` pair for the pretrained checkpoint *name*."""
    return BertTokenizer.from_pretrained(name), BertModel.from_pretrained(name)


tokenizer, model = _load_bert(model_name)

# Function to get BERT embeddings
def get_embeddings(text):
    """Return the BERT embedding of the first ([CLS]) token for *text*.

    The text is tokenized with special tokens added and truncated to at most
    512 tokens before being passed through the module-level ``model``.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding the last-layer hidden state of the first token,
        with a leading batch dimension of 1.
    """
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        # Without truncation, max_length is not enforced and long inputs
        # (e.g. a whole PDF) overflow the model's position embeddings.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()

# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file*, one page per line group.

    Args:
        pdf_file: A path or file-like object accepted by ``pdfplumber.open``.

    Returns:
        The concatenated page texts, each followed by a newline. Pages that
        yield no text contribute only the newline.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may return None for a page without a text layer
        # (e.g. a scanned image) in some pdfplumber versions; fall back to ""
        # so the concatenation never raises TypeError.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

# Store the PDF text and embeddings
pdf_text = ""
pdf_embeddings = None

# Streamlit app
st.title("PDF Chatbot using BERT")

# PDF file upload
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if pdf_file is not None:
    pdf_text = extract_text_from_pdf(pdf_file)
    if pdf_text.strip():
        pdf_embeddings = get_embeddings(pdf_text)
        st.success("PDF loaded successfully!")
    else:
        # Only whitespace came back (e.g. a scanned PDF): treat it as having
        # no usable text instead of reporting success and embedding nothing.
        pdf_text = ""
        st.warning("No extractable text was found in this PDF.")

# User input for chatbot
user_input = st.text_input("Ask a question about the PDF:")

if st.button("Get Response"):
    if pdf_file is None:
        st.warning("Please upload a PDF file first.")
    elif not pdf_text:
        st.warning("This PDF has no extractable text to answer from.")
    elif not user_input.strip():
        # Avoid running BERT on an empty question.
        st.warning("Please enter a question.")
    else:
        # Get embeddings for user input
        user_embeddings = get_embeddings(user_input)

        # For demonstration, simply return the PDF text.
        # Implement similarity matching logic here as needed.
        st.write("### Response:")
        st.write(pdf_text)  # For simplicity, returning all text