Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,478 Bytes
145d936 312add7 34c42f9 4d1d4d1 b697ac0 145d936 312add7 c1d7645 3bf066d b697ac0 312add7 c1d7645 b697ac0 312add7 c1d7645 b697ac0 145d936 b697ac0 145d936 c1d7645 145d936 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import spaces
import gradio as gr
from pypdf import PdfReader
import ocrmypdf
def extract_text_from_pdf(reader):
    """Concatenate the text of every non-empty page of a PDF.

    Args:
        reader: An object exposing ``pages``, an iterable of page objects
            that each provide ``extract_text()`` (e.g. ``pypdf.PdfReader``).

    Returns:
        The page texts joined into one string. Each page is preceded by a
        ``---- Page N ----`` header (``N`` is the zero-based page index)
        and followed by a blank line; pages without extractable text are
        skipped. Leading and trailing whitespace of the whole result is
        stripped, so the result is ``""`` when no page has text.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once and reuse the result instead of parsing the page twice.
        text = page.extract_text()
        # Truthiness covers both an empty string and a missing (None) result.
        if text:
            sections.append(f"---- Page {idx} ----\n{text}\n\n")
    return "".join(sections).strip()
@spaces.GPU
def convert(pdf_file):
    """Extract text and metadata from a PDF, falling back to OCR.

    The embedded text layer is read first. If the document contains images
    but yields fewer than 1000 characters of text, it is run through
    ``ocrmypdf`` and the text is re-extracted from the OCR output.

    Args:
        pdf_file: Filesystem path of the PDF to process.

    Returns:
        A ``(full_text, metadata)`` tuple. ``full_text`` is the string
        produced by ``extract_text_from_pdf``. ``metadata`` is a dict with
        the keys ``author``, ``creator``, ``producer``, ``subject`` and
        ``title``; every value is ``None`` when the PDF carries no document
        information.
    """
    # Below this many extracted characters the text layer is considered too
    # thin and OCR is attempted (only if the document has images).
    min_text_chars = 1000

    reader = PdfReader(pdf_file)

    # `reader.metadata` may be None for PDFs without an info dictionary;
    # getattr with a default keeps the same keys and yields None values.
    info = reader.metadata
    metadata = {
        key: getattr(info, key, None)
        for key in ("author", "creator", "producer", "subject", "title")
    }

    full_text = extract_text_from_pdf(reader)

    # Check the cheap length condition first so pages are only scanned for
    # images when OCR is actually a candidate.
    needs_ocr = len(full_text) < min_text_chars and any(
        len(page.images) > 0 for page in reader.pages
    )
    if needs_ocr:
        # Derive the output path from the extension only, so the output can
        # never coincide with the input (e.g. for ".PDF" or extension-less
        # names) and a ".pdf" elsewhere in the path is left alone.
        stem = pdf_file[:-4] if pdf_file.lower().endswith(".pdf") else pdf_file
        out_pdf_file = f"{stem}_ocr.pdf"
        ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)

        # Re-extract from the OCR'd file, not the original input.
        full_text = extract_text_from_pdf(PdfReader(out_pdf_file))

    return full_text, metadata
# Expose `convert` through a Gradio UI: one PDF upload as input, the
# extracted text and the metadata dict as outputs.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Upload PDF", type="filepath")],
    outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
)
demo.launch()
|