pdfreader / app.py
Shankarm08's picture
Update app.py
c5608b5 verified
raw
history blame
1.64 kB
import streamlit as st
import torch
from transformers import BertTokenizer, BertModel
import pdfplumber
# Load the pre-trained BERT model and tokenizer once per server process.
# Streamlit re-executes this script from the top on every widget interaction,
# so plain module-level loading would repeat on each rerun; st.cache_resource
# keeps the loaded objects alive across reruns and sessions.
model_name = "bert-base-uncased"


@st.cache_resource
def _load_bert(name):
    """Return the ``(tokenizer, model)`` pair for the checkpoint *name*."""
    return BertTokenizer.from_pretrained(name), BertModel.from_pretrained(name)


tokenizer, model = _load_bert(model_name)
# Define a function to extract text from a PDF
def extract_text_from_pdf(pdf_file) -> str:
    """Return the text of every page in *pdf_file* as a single string.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts; this app passes the
            object returned by ``st.file_uploader``.

    Returns:
        The per-page texts joined with newlines. Pages that yield no text
        contribute nothing, so the result is ``""`` when no page produces
        any text.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may give None instead of "" for a page without any
        # characters (seen with older pdfplumber releases); coerce it so the
        # join below never sees a non-string.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Separate pages with a newline so the last word of one page does not
    # fuse with the first word of the next.
    return "\n".join(page_text for page_text in page_texts if page_text)
# Define a function to turn the extracted text into a BERT feature vector
def classify_text(text: str) -> list:
    """Return BERT features for *text* as a nested Python list.

    Despite the name, no classification head is applied: the function
    encodes *text* with the module-level ``tokenizer``, runs the module-level
    ``model`` and returns the hidden state at sequence position 0 (the
    special token the tokenizer prepends).

    Args:
        text: The text to encode. Input longer than 512 tokens is truncated.

    Returns:
        ``outputs.last_hidden_state[:, 0, :]`` converted with ``tolist()``,
        i.e. a list holding one feature vector (a list of floats) for the
        single input sequence.
    """
    # Preprocess the input text
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        # max_length on its own does not shorten the input; without explicit
        # truncation a long PDF produces more tokens than BERT's 512 position
        # embeddings and the forward pass fails.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    # Keep only the first-token vector of the sequence
    features = outputs.last_hidden_state[:, 0, :]
    return features.tolist()
# Streamlit app setup
st.title("PDF Text Classification")
st.write("Upload a PDF file to classify its text using BERT")

# File uploader for PDFs; the widget returns None until a file is chosen
pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

if pdf_file is not None:
    # Extract text from the uploaded PDF
    extracted_text = extract_text_from_pdf(pdf_file)

    if extracted_text.strip():
        st.write("Extracted Text:")
        st.write(extracted_text)

        # Run BERT only on the rerun triggered by the button click
        if st.button("Classify"):
            features = classify_text(extracted_text)
            st.json({"features": features})  # Display the features in JSON format
    else:
        # Nothing to show or encode (e.g. the PDF yielded no text), so tell
        # the user instead of offering to classify an empty string.
        st.warning("No extractable text was found in this PDF.")