Spaces:
Sleeping
Sleeping
import streamlit as st | |
import torch | |
from transformers import BertTokenizer, BertModel | |
import pdfplumber | |
# Load the pre-trained BERT model and tokenizer once per server process.
# Streamlit re-executes this script from top to bottom on every user
# interaction, so a plain module-level load would be repeated on each rerun.
# st.cache_resource keeps the loaded objects and returns the same instances
# on later reruns (they are shared, not copied, so treat them as read-only).
model_name = "bert-base-uncased"


@st.cache_resource
def _load_bert(name):
    """Return the ``(tokenizer, model)`` pair for the checkpoint *name*."""
    return BertTokenizer.from_pretrained(name), BertModel.from_pretrained(name)


tokenizer, model = _load_bert(model_name)
# Define a function to extract text from a PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts; it is passed through
            unchanged (the app passes the object returned by
            ``st.file_uploader``).

    Returns:
        str: The text of all pages that yielded any text, joined with
        newlines. An empty string if no page yielded text.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may return None (or "") for a page without
        # extractable text; "or ''" keeps such pages from raising TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next; pages that produced no text are dropped.
    return "\n".join(chunk for chunk in page_texts if chunk)
# Define a function to classify the extracted text
def classify_text(text):
    """Encode *text* with BERT and return the first-token feature vector.

    Despite the name, no class label is predicted: the function returns the
    hidden state at sequence position 0 of the model's last layer (the
    position of the special token the tokenizer prepends).

    Args:
        text: The string to encode. Input longer than 512 tokens is
            truncated to the first 512.

    Returns:
        list: A nested list with one row per input sequence (a single row
        here), each row being that sequence's feature vector as floats.
    """
    # Preprocess the input text. max_length only takes effect together with
    # truncation=True; without it a long document is tokenized in full and
    # the model fails on sequences longer than its 512 positions.
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    # Keep the hidden state at position 0 of every sequence in the batch.
    features = outputs.last_hidden_state[:, 0, :]
    return features.tolist()
# Streamlit app setup
st.title("PDF Text Classification")
st.write("Upload a PDF file to classify its text using BERT")

# File uploader for PDFs
pdf_file = st.file_uploader("Choose a PDF file", type="pdf")

if pdf_file is not None:
    # Extract text from the uploaded PDF
    extracted_text = extract_text_from_pdf(pdf_file)

    if not extracted_text.strip():
        # Nothing usable came out of the PDF: say so instead of rendering an
        # empty area and offering to run the model on an empty string.
        st.warning("No text could be extracted from this PDF.")
    else:
        st.write("Extracted Text:")
        st.write(extracted_text)

        # Classify the extracted text only when the user asks for it
        if st.button("Classify"):
            features = classify_text(extracted_text)
            st.json({"features": features})  # Display the features in JSON format