Spaces:
Sleeping
Sleeping
File size: 3,578 Bytes
e79d672 2b25a9c 8b78611 0058c86 0ff7c49 c4f4fdf 928e49e e79d672 3d967f5 8b78611 0c6e502 8b78611 c4f4fdf 3d967f5 c4f4fdf e79d672 2b25a9c 3d967f5 c4f4fdf 3d967f5 063bf5b 2b25a9c 3d967f5 063bf5b c4f4fdf 2b25a9c 2768e21 063bf5b 2768e21 063bf5b 2768e21 e6c69f5 063bf5b 3d967f5 2768e21 e6c69f5 3d967f5 063bf5b 3d967f5 2768e21 c4f4fdf 3d967f5 063bf5b c125111 0058c86 c4f4fdf e79d672 8b78611 3d967f5 0058c86 063bf5b 8b78611 e6c69f5 e654048 e6c69f5 8b78611 0058c86 3d967f5 e654048 3d967f5 c4f4fdf 063bf5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import gradio as gr
from pdf2image import convert_from_path
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import numpy as np
import os
import base64
import io
import uuid
import tempfile
import time
import shutil
from pathlib import Path
import json
# Load the GOT-OCR2.0 tokenizer and model from the Hugging Face Hub.
# trust_remote_code=True is required: GOT-OCR2_0 ships its own modeling code.
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
# device_map='cuda' places weights on the GPU; use_safetensors avoids pickle-based loading.
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, device_map='cuda', use_safetensors=True)
# Inference only: switch to eval mode and make sure everything is on the GPU.
model = model.eval().cuda()
# Working directories for uploaded PDFs and rendered OCR results.
UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"

# Ensure both working directories exist. exist_ok=True replaces the
# race-prone exists()-then-makedirs pattern with a single atomic call.
for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
    os.makedirs(folder, exist_ok=True)
def image_to_base64(image):
    """Encode an image as a base64 string of its PNG byte representation."""
    png_buffer = io.BytesIO()
    image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode()
def convert_pdf_to_images(pdf_path, output_folder):
    """Rasterize every page of a PDF into PNG files.

    Args:
        pdf_path: Path to the source PDF file.
        output_folder: Directory the page images are written to
            (created if missing).

    Returns:
        List of image file paths, one per page, in page order.
    """
    os.makedirs(output_folder, exist_ok=True)
    images = convert_from_path(pdf_path)
    image_paths = []
    for page_number, image in enumerate(images, start=1):
        image_path = os.path.join(output_folder, f"page_{page_number}.png")
        # Bug fix: the file carries a .png extension, so encode it as PNG —
        # the original wrote JPEG bytes into a .png-named file.
        image.save(image_path, 'PNG')
        image_paths.append(image_path)
        print(f"Saved {image_path}")
    return image_paths
def run_GOT(pdf_file):
    """OCR every page of an uploaded PDF with GOT-OCR2.0.

    Args:
        pdf_file: Filesystem path to the uploaded PDF (Gradio File with
            type="filepath").

    Returns:
        Tuple of (JSON string of per-page results, concatenated HTML), or
        (f"Error: ..." message, None) if any step fails.
    """
    unique_id = str(uuid.uuid4())
    pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
    # Render pages into a per-request subfolder: the original wrote
    # page_N.png directly into the shared UPLOAD_FOLDER, so concurrent
    # requests would overwrite each other's page images.
    pages_folder = os.path.join(UPLOAD_FOLDER, unique_id)
    shutil.copy(pdf_file, pdf_path)
    results = []
    try:
        # Inside try so the copied PDF is cleaned up even if conversion fails
        # (the original leaked it on a convert_from_path error).
        images = convert_pdf_to_images(pdf_path, pages_folder)
        for i, image_path in enumerate(images):
            result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
            res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
            # chat_crop renders formatted output to result_path; read it back.
            with open(result_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
            results.append({
                "page_number": i + 1,
                "text": res,
                "html": html_content
            })
            if os.path.exists(result_path):
                os.remove(result_path)
    except Exception as e:
        return f"Error: {str(e)}", None
    finally:
        # Always remove per-request artifacts, on success or failure.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
        shutil.rmtree(pages_folder, ignore_errors=True)
    html_output = "".join(result["html"] for result in results)
    return json.dumps(results, indent=4), html_output
def cleanup_old_files():
    """Delete files older than one hour from the upload and results folders."""
    cutoff = time.time() - 3600  # anything modified before this is stale
    for folder in (UPLOAD_FOLDER, RESULTS_FOLDER):
        for entry in Path(folder).glob('*'):
            if entry.stat().st_mtime < cutoff:
                entry.unlink()
# Gradio UI: PDF upload + submit button on the left, OCR output
# (raw JSON and rendered HTML) on the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(type="filepath", label="Upload your PDF")
            submit_button = gr.Button("Submit")
        with gr.Column():
            ocr_result = gr.JSON(label="GOT output")
            html_output = gr.HTML(label="Rendered HTML")
    # run_GOT returns (json_string, html) — one value per output component.
    submit_button.click(
        run_GOT,
        inputs=[pdf_input],
        outputs=[ocr_result, html_output]
    )

if __name__ == "__main__":
    # Purge stale working files (older than 1 hour) before serving.
    cleanup_old_files()
    demo.launch()