Spaces:

Shankarm08
/

pdfreader

Sleeping

pdfreader / app.py

Update app.py

c5608b5 verified 8 months ago

1.64 kB

	import streamlit as st
	import torch
	from transformers import BertTokenizer, BertModel
	import pdfplumber

	# Streamlit re-executes this whole script on every widget interaction, so a
	# plain module-level load would re-instantiate BERT on each rerun.
	# st.cache_resource keeps one tokenizer/model pair per checkpoint name alive
	# across reruns instead.
	# NOTE(review): st.cache_resource needs Streamlit >= 1.18 - confirm the
	# version pinned for this Space.
	@st.cache_resource
	def _load_bert(name):
	    """Return the ``(tokenizer, model)`` pair for the pretrained checkpoint *name*."""
	    return BertTokenizer.from_pretrained(name), BertModel.from_pretrained(name)


	model_name = "bert-base-uncased"
	tokenizer, model = _load_bert(model_name)

	# Define a function to extract text from a PDF
	def extract_text_from_pdf(pdf_file):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        pdf_file: The PDF to read, in any form ``pdfplumber.open`` accepts.

	    Returns:
	        A single string holding each page's text, joined with no separator.
	        A page for which ``extract_text()`` yields nothing contributes an
	        empty string.
	    """
	    with pdfplumber.open(pdf_file) as pdf:
	        # extract_text() can come back as None for a page with no text layer
	        # (e.g. a scanned image); `or ""` keeps such a page from raising
	        # TypeError during concatenation.
	        return "".join(page.extract_text() or "" for page in pdf.pages)

	# Define a function that turns the extracted text into a BERT feature vector (no classifier is applied)
	def classify_text(text):
	    """Return BERT features for *text*.

	    Despite the name, no classification head is applied: the function runs
	    the module-level ``model`` on the tokenized text and returns the final
	    hidden state at sequence position 0 (the ``[CLS]`` slot that the BERT
	    tokenizer prepends when ``add_special_tokens=True``).

	    Args:
	        text: The string to encode. Anything beyond 512 tokens is dropped.

	    Returns:
	        The selected hidden states converted with ``Tensor.tolist()``: a
	        nested list of floats with one inner list per encoded sequence
	        (a single one here, since one string is encoded).
	    """
	    inputs = tokenizer.encode_plus(
	        text,
	        add_special_tokens=True,
	        max_length=512,
	        # max_length alone does not shorten the input; without explicit
	        # truncation a long document would exceed BERT's 512 positions and
	        # fail inside the model.
	        truncation=True,
	        return_attention_mask=True,
	        return_tensors='pt',
	    )

	    # Inference only: skip building the autograd graph.
	    with torch.no_grad():
	        outputs = model(**inputs)

	    # Keep position 0 of every sequence in the batch.
	    features = outputs.last_hidden_state[:, 0, :]

	    return features.tolist()

	# ---- Streamlit page ----
	# Page heading plus a one-line instruction for the user.
	st.title("PDF Text Classification")
	st.write("Upload a PDF file to classify its text using BERT")

	# Upload widget limited to PDFs; the rest of the page only renders once
	# it has produced a file.
	pdf_file = st.file_uploader(label="Choose a PDF file", type="pdf")

	if pdf_file is not None:
	    # Show the text pulled from the upload under a short caption.
	    extracted_text = extract_text_from_pdf(pdf_file)
	    st.write("Extracted Text:")
	    st.write(extracted_text)

	    # The BERT pass runs only when the button reports a click.
	    if st.button(label="Classify"):
	        features = classify_text(extracted_text)
	        # Render the feature vector as JSON.
	        st.json(dict(features=features))