import gradio as gr
import fitz # PyMuPDF
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import numpy as np
import os
import base64
import io
import uuid
import tempfile
import time
import shutil
from pathlib import Path
import json
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True)
model = model.eval().cuda()
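# Note: device_map='cuda' and the .cuda() call above assume a CUDA-capable GPU;
# this script will not run as-is on CPU-only hardware.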
UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"
# Ensure directories exist
for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
    if not os.path.exists(folder):
        os.makedirs(folder)
def image_to_base64(image):
    """Encode a PIL image as a base64 PNG string (helper; not called elsewhere in this script)."""
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()
def pdf_to_images(pdf_path):
    """Render each page of a PDF to a PIL image."""
    images = []
    pdf_document = fitz.open(pdf_path)
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap()  # default resolution (72 dpi); pass a matrix for sharper OCR input
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        images.append(img)
    pdf_document.close()
    return images
def run_GOT(pdf_file):
    """OCR every page of the uploaded PDF with GOT-OCR2.0 and return (JSON, HTML) results."""
    unique_id = str(uuid.uuid4())
    pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
    shutil.copy(pdf_file, pdf_path)
    images = pdf_to_images(pdf_path)
    results = []
    html_content = ""
    try:
        for i, image in enumerate(images):
            image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
            image.save(image_path)
            result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
            logger.info(f"Processing page {i+1}...")
            res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
            # Read the rendered HTML content
            if os.path.exists(result_path):
                with open(result_path, 'r', encoding='utf-8') as f:
                    page_html_content = f.read()
                logger.info(f"HTML content for page {i+1} read successfully.")
            else:
                logger.error(f"HTML file for page {i+1} not found at {result_path}.")
                page_html_content = ""
            results.append({
                "page_number": i + 1,
                "text": res,
                "html": page_html_content
            })
            html_content += f"<h2>Page {i + 1}</h2>"
            html_content += page_html_content + "<br><hr><br>"  # Add a separator between pages
            # Remove per-page intermediates now that their content has been captured
            if os.path.exists(image_path):
                os.remove(image_path)
            if os.path.exists(result_path):
                os.remove(result_path)
    except Exception as e:
        logger.error(f"Error occurred: {str(e)}")
        return f"Error: {str(e)}", None
    finally:
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
    print(html_content)  # Debug: check the content of html_content
    return json.dumps(results, indent=4), html_content
def cleanup_old_files():
    """Delete uploaded/rendered files that are more than an hour old."""
    current_time = time.time()
    for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
        for file_path in Path(folder).glob('*'):
            if current_time - file_path.stat().st_mtime > 3600:  # 1 hour
                file_path.unlink()
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(type="filepath", label="Upload your PDF")
            submit_button = gr.Button("Submit")
        with gr.Column():
            ocr_result = gr.JSON(label="GOT output (JSON)")
            html_result = gr.HTML(label="GOT output (HTML)", sanitize=False)
    submit_button.click(
        run_GOT,
        inputs=[pdf_input],
        outputs=[ocr_result, html_result]
    )

if __name__ == "__main__":
    cleanup_old_files()
    demo.launch()
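# Example (hypothetical local path): the OCR pipeline can also be exercised directly,
# without the Gradio UI, by calling run_GOT on a PDF on disk:
#     json_output, html_output = run_GOT("sample.pdf")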