import gradio as gr
from pdf2image import convert_from_path
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import numpy as np
import os
import base64
import io
import uuid
import tempfile
import time
import shutil
from pathlib import Path
import json

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True,
                                  device_map='cuda', use_safetensors=True)
model = model.eval().cuda()

UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"

# Ensure working directories exist
for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
    if not os.path.exists(folder):
        os.makedirs(folder)


def image_to_base64(image):
    # Encode a PIL image as a base64 PNG string
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()


def convert_pdf_to_images(pdf_path, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Convert PDF pages to PIL images
    images = convert_from_path(pdf_path)

    # Save each page image to the output folder
    image_paths = []
    for i, image in enumerate(images):
        image_path = os.path.join(output_folder, f"page_{i + 1}.png")
        image.save(image_path, 'PNG')
        image_paths.append(image_path)
        print(f"Saved {image_path}")
    return image_paths


def run_GOT(pdf_file):
    # Copy the uploaded PDF to a unique path, then OCR each page with GOT-OCR2_0
    unique_id = str(uuid.uuid4())
    pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
    shutil.copy(pdf_file, pdf_path)

    images = convert_pdf_to_images(pdf_path, UPLOAD_FOLDER)
    results = []
    try:
        for i, image_path in enumerate(images):
            result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i + 1}.html")
            res = model.chat_crop(tokenizer, image_path, ocr_type='format',
                                  render=True, save_render_file=result_path)

            # Read the rendered HTML content
            with open(result_path, 'r') as f:
                html_content = f.read()

            results.append({
                "page_number": i + 1,
                "text": res,
                "html": html_content
            })

            # Clean up per-page intermediates
            if os.path.exists(image_path):
                os.remove(image_path)
            if os.path.exists(result_path):
                os.remove(result_path)
    except Exception as e:
        return f"Error: {str(e)}", None
    finally:
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

    html_output = "".join([result["html"] for result in results])
    print("HTML Output:", html_output)  # Debugging print statement
    return json.dumps(results, indent=4), html_output


def cleanup_old_files():
    # Remove uploads and results older than one hour
    current_time = time.time()
    for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
        for file_path in Path(folder).glob('*'):
            if current_time - file_path.stat().st_mtime > 3600:  # 1 hour
                file_path.unlink()


# Gradio UI: PDF upload on the left, JSON output and rendered HTML on the right
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(type="filepath", label="Upload your PDF")
            submit_button = gr.Button("Submit")
        with gr.Column():
            ocr_result = gr.JSON(label="GOT output")
            html_output = gr.HTML(label="Rendered HTML")

    submit_button.click(
        run_GOT,
        inputs=[pdf_input],
        outputs=[ocr_result, html_output]
    )

if __name__ == "__main__":
    cleanup_old_files()
    demo.launch()