Spaces:
Sleeping
Sleeping
File size: 1,254 Bytes
f8883b1 e588742 f8883b1 e588742 f8883b1 f377843 f8883b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import streamlit as st
import PyPDF2
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
def pdf_to_text(pdf_file):
    """Extract the text of a PDF, falling back to OCR when no text is found.

    Args:
        pdf_file: Either a filesystem path to the PDF or a seekable binary
            file-like object containing it (for example the object returned
            by ``st.file_uploader``).

    Returns:
        The concatenated text of every page. If the embedded text is empty
        or whitespace-only, the pages are converted to images and the
        concatenated Tesseract OCR output is returned instead.
    """
    pdf = PyPDF2.PdfReader(pdf_file)

    # Guard against extract_text() yielding None so one page without text
    # cannot break the concatenation.
    text = ''.join(page.extract_text() or '' for page in pdf.pages)

    # Whitespace-only output carries no usable text, so treat it like an
    # empty result and go to OCR.
    if text.strip():
        return text

    if hasattr(pdf_file, 'read'):
        # convert_from_path() needs a filesystem path; an uploaded file is an
        # in-memory stream, so hand its raw bytes to convert_from_bytes().
        from pdf2image import convert_from_bytes

        pdf_file.seek(0)  # PdfReader has already consumed the stream
        images = convert_from_bytes(pdf_file.read())
    else:
        images = convert_from_path(pdf_file)

    return ''.join(pytesseract.image_to_string(image) for image in images)
def main():
    """Render the Streamlit page: upload a PDF and offer its text for download.

    Shows a title, a prompt and a PDF uploader. Once a file is uploaded its
    text is extracted with ``pdf_to_text`` and exposed through a download
    button as a ``.txt`` file named after the uploaded PDF.
    """
    import os

    st.title("PDF Text Extractor")
    st.write("Upload a PDF file to extract the text")
    pdf_file = st.file_uploader("Upload PDF file", type=["pdf"])
    if pdf_file is None:
        return

    text = pdf_to_text(pdf_file)

    # Swap only the final extension (whatever its case) and drop any directory
    # part; str.replace would rewrite every ".pdf" in the name and miss ".PDF".
    stem, _ = os.path.splitext(os.path.basename(pdf_file.name))
    txt_file = stem + '.txt'

    # Pass the text to the button in memory instead of writing it to the
    # server's working directory: no leftover files, no clashes between
    # concurrent sessions, and a fixed UTF-8 encoding.
    st.download_button(
        label="Download Extracted Text",
        data=text.encode("utf-8"),
        file_name=txt_file,
        mime="text/plain",
    )
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|