Spaces:
Sleeping
Sleeping
File size: 3,740 Bytes
c5608b5 7340eaf 9e487ab 7340eaf ceb87d2 9e487ab 7340eaf bc28c5c c5608b5 7340eaf bc28c5c 96f0bc8 a472326 7340eaf c5608b5 7340eaf a472326 7340eaf a472326 96f0bc8 9e487ab 96f0bc8 9e487ab bc28c5c 51b8479 bc28c5c 9e487ab bc28c5c 7340eaf bc28c5c 7340eaf 9e487ab bc28c5c 51b8479 ceb87d2 bc28c5c c5608b5 bc28c5c 9e487ab bc28c5c 9e487ab bc28c5c 96f0bc8 9e487ab 96f0bc8 9e487ab 96f0bc8 7340eaf 51b8479 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import streamlit as st
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
import pdfplumber
from sklearn.metrics.pairwise import cosine_similarity
# Load the pre-trained BERT model and tokenizer once per server process.
# Streamlit re-executes this script on every widget interaction, so the load
# is wrapped in st.cache_resource to reuse the same objects across reruns
# instead of re-instantiating the model each time.
model_name = "bert-base-uncased"


@st.cache_resource
def _load_bert(name):
    """Return the (tokenizer, model) pair for the pre-trained checkpoint *name*."""
    return BertTokenizer.from_pretrained(name), BertModel.from_pretrained(name)


tokenizer, model = _load_bert(model_name)
# Function to get BERT embeddings
def get_embeddings(text):
    """Embed *text* with the module-level BERT tokenizer and model.

    The text is tokenized with special tokens added, truncated to at most
    512 tokens, and passed through the model with gradient tracking disabled.

    Returns:
        A NumPy array holding the last hidden state at sequence position 0
        for each row of the batch.

    Raises:
        ValueError: If *text* is empty or whitespace-only, or if the model
            output has no ``last_hidden_state`` attribute.
    """
    if not text.strip():
        raise ValueError("Input text is empty.")

    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,  # longer inputs are cut down to this many tokens
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        result = model(**encoded)

    if not hasattr(result, 'last_hidden_state'):
        raise ValueError("Model output does not contain 'last_hidden_state'.")

    # Keep only the first token position, then hand back a CPU NumPy array.
    first_token_state = result.last_hidden_state[:, 0, :]
    return first_token_state.detach().cpu().numpy()
# Extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in *pdf_file*, each followed by a newline.

    Pages for which ``extract_text()`` yields nothing are left out of the
    result, and a Streamlit warning is shown for each such page.
    """
    chunks = []
    with pdfplumber.open(pdf_file) as document:
        for page in document.pages:
            content = page.extract_text()
            if not content:
                st.warning("No extractable text found on a page.")
                continue
            # Trailing newline keeps consecutive pages separated.
            chunks.append(content + "\n")
    return "".join(chunks)
# Split text into sentences for better matching
def split_text_into_sentences(text):
    """Split *text* into non-empty, whitespace-trimmed lines.

    The text is split on newline characters; adjust the splitting rule here
    if real sentence segmentation is needed. Blank and whitespace-only lines
    are dropped because get_embeddings raises ValueError on empty input, and
    the extracted PDF text always ends with a newline, which would otherwise
    produce a trailing empty entry.

    Returns:
        A list of stripped, non-empty strings (empty list for blank input).
    """
    trimmed = (line.strip() for line in text.split('\n'))
    return [line for line in trimmed if line]
# Streamlit app
st.title("PDF Chatbot using BERT")

# PDF file upload
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])

# State for the uploaded PDF. pdf_sentences is initialised here so the button
# handler below can test it even when no PDF was uploaded or processing failed.
pdf_text = ""
pdf_sentences = []
pdf_embeddings = None

if pdf_file:
    pdf_text = extract_text_from_pdf(pdf_file)
    # Check if the extracted text is empty
    if not pdf_text.strip():
        st.error("The extracted PDF text is empty. Please upload a PDF with extractable text.")
    else:
        try:
            # Drop blank entries: get_embeddings raises ValueError on empty text.
            candidates = [
                sentence
                for sentence in split_text_into_sentences(pdf_text)
                if sentence.strip()
            ]
            # Each get_embeddings call returns a 2-D array with one row, so
            # stacking vertically gives the 2-D (n_sentences, hidden) matrix
            # that cosine_similarity expects.
            embeddings = np.vstack([get_embeddings(sentence) for sentence in candidates])
        except Exception as e:
            st.error(f"Error while processing PDF: {e}")
        else:
            # Publish the results only when every sentence was embedded, so
            # the sentence list and the embedding matrix stay aligned.
            pdf_sentences = candidates
            pdf_embeddings = embeddings
            st.success("PDF loaded successfully!")

# User input for chatbot
user_input = st.text_input("Ask a question about the PDF:")

if st.button("Get Response"):
    if not pdf_sentences or pdf_embeddings is None:
        st.warning("Please upload a PDF file first.")
    elif not user_input.strip():
        st.warning("Please enter a question.")
    else:
        try:
            # Reshape to a single row for the cosine similarity calculation.
            user_embeddings = get_embeddings(user_input).reshape(1, -1)
            # Similarity of the question against every PDF sentence: shape (1, n).
            similarities = cosine_similarity(user_embeddings, pdf_embeddings)
            best_match_index = int(np.argmax(similarities))
            # Display the most relevant sentence
            st.write("### Response:")
            st.write(pdf_sentences[best_match_index])
        except Exception as e:
            st.error(f"Error while processing user input: {e}")
|