File size: 1,750 Bytes
c5608b5
7340eaf
 
ceb87d2
7340eaf
bc28c5c
c5608b5
 
 
7340eaf
bc28c5c
 
7340eaf
c5608b5
7340eaf
 
 
 
 
 
bc28c5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7340eaf
bc28c5c
 
7340eaf
bc28c5c
 
 
 
ceb87d2
bc28c5c
 
c5608b5
bc28c5c
 
 
 
 
 
ceb87d2
bc28c5c
 
 
 
7340eaf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import streamlit as st
import torch
from transformers import BertTokenizer, BertModel
import pdfplumber

# Load the pre-trained BERT model and tokenizer once
model_name = "bert-base-uncased"


def _load_bert(name):
    """Return the ``(tokenizer, model)`` pair for the checkpoint *name*."""
    return BertTokenizer.from_pretrained(name), BertModel.from_pretrained(name)


# Streamlit re-executes this script on every widget interaction, so a bare
# module-level load would be repeated on each rerun. Cache the loaded objects
# when this Streamlit release offers ``st.cache_resource``; otherwise fall
# back to the plain, uncached load.
_cache_resource = getattr(st, "cache_resource", None)
if _cache_resource is not None:
    _load_bert = _cache_resource(_load_bert)

tokenizer, model = _load_bert(model_name)

# Function to get BERT embeddings
def get_embeddings(text):
    """Return the BERT embedding of *text* as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (special tokens
    added, at most 512 tokens) and passed through the module-level ``model``.
    The hidden state at sequence position 0 of the last layer is returned.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding ``last_hidden_state[:, 0, :]``, i.e. one vector
        per input sequence.
    """
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        # Request truncation explicitly: passing max_length alone relies on
        # the tokenizer's legacy implicit truncation and triggers a warning.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().numpy()

# Extract text from PDF
def extract_text_from_pdf(pdf_file) -> str:
    """Return the text of every page of a PDF, one newline after each page.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts; the app passes the
            object returned by ``st.file_uploader``.

    Returns:
        The concatenated page texts. A page for which ``extract_text()``
        yields nothing contributes only its trailing newline.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may return None for a page without text; coerce it to
        # "" so the concatenation cannot raise TypeError.
        return "".join(
            (page.extract_text() or "") + "\n"  # newline separates pages
            for page in pdf.pages
        )

# Store the PDF text and embeddings
pdf_text = ""
pdf_embeddings = None

# Streamlit app
st.title("PDF Chatbot using BERT")

# PDF file upload
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if pdf_file is not None:
    pdf_text = extract_text_from_pdf(pdf_file)
    if pdf_text.strip():
        pdf_embeddings = get_embeddings(pdf_text)
        st.success("PDF loaded successfully!")
    else:
        # Extraction produced only whitespace: do not report success or embed
        # an empty document.
        pdf_text = ""
        st.warning("No extractable text was found in this PDF.")

# User input for chatbot
user_input = st.text_input("Ask a question about the PDF:")

if st.button("Get Response"):
    if pdf_file is None:
        st.warning("Please upload a PDF file first.")
    elif not pdf_text:
        # A file was uploaded but yielded no text; say so instead of asking
        # the user to upload again.
        st.warning("The uploaded PDF contains no extractable text.")
    elif not user_input.strip():
        st.warning("Please enter a question.")
    else:
        # Get embeddings for user input (not used yet; kept as the hook for
        # the similarity matching mentioned below).
        user_embeddings = get_embeddings(user_input)

        # For demonstration, simply return the PDF text.
        # Implement similarity matching logic here as needed.
        st.write("### Response:")
        st.write(pdf_text)  # For simplicity, returning all text