File size: 3,578 Bytes
e79d672
2b25a9c
8b78611
0058c86
 
0ff7c49
c4f4fdf
 
 
 
 
 
 
928e49e
e79d672
3d967f5
8b78611
0c6e502
8b78611
c4f4fdf
 
 
 
3d967f5
c4f4fdf
 
 
 
 
 
 
 
e79d672
2b25a9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d967f5
 
c4f4fdf
3d967f5
 
063bf5b
2b25a9c
3d967f5
063bf5b
c4f4fdf
2b25a9c
2768e21
063bf5b
2768e21
063bf5b
2768e21
e6c69f5
 
063bf5b
3d967f5
 
2768e21
e6c69f5
3d967f5
063bf5b
3d967f5
 
2768e21
 
c4f4fdf
 
 
3d967f5
 
063bf5b
c125111
 
 
0058c86
c4f4fdf
 
 
 
 
 
e79d672
8b78611
 
 
3d967f5
0058c86
063bf5b
8b78611
e6c69f5
e654048
e6c69f5
8b78611
0058c86
3d967f5
e654048
3d967f5
 
c4f4fdf
 
063bf5b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
from pdf2image import convert_from_path
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import numpy as np
import os
import base64
import io
import uuid
import tempfile
import time
import shutil
from pathlib import Path
import json

# Load the 'ucaslcl/GOT-OCR2_0' tokenizer and model once, at import time, so
# every request handled by this process reuses the same instances.
# NOTE(review): trust_remote_code=True lets transformers run Python code
# shipped with the model repository — confirm that repository is trusted.
tokenizer = AutoTokenizer.from_pretrained(
    'ucaslcl/GOT-OCR2_0',
    trust_remote_code=True,
)
# Put the model in evaluation mode and on the CUDA device before first use.
model = AutoModel.from_pretrained(
    'ucaslcl/GOT-OCR2_0',
    trust_remote_code=True,
    device_map='cuda',
    use_safetensors=True,
).eval().cuda()

# Working directories used while processing a request: uploaded PDFs are
# copied into UPLOAD_FOLDER and rendered HTML is written to RESULTS_FOLDER.
UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"

# Create both directories up front. exist_ok=True makes this a single atomic
# "create if missing" step instead of a check-then-create sequence that can
# fail when another process creates the directory in between.
for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
    os.makedirs(folder, exist_ok=True)

def image_to_base64(image):
    """Return the PNG encoding of *image* as a base64 text string.

    Args:
        image: Object exposing ``save(file_obj, format=...)``, such as a
            PIL image.

    Returns:
        The base64 representation (as ``str``) of the bytes written by
        ``image.save`` in PNG format.
    """
    # Serialise into an in-memory stream so nothing touches the disk.
    with io.BytesIO() as png_stream:
        image.save(png_stream, format="PNG")
        png_bytes = png_stream.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode()

def convert_pdf_to_images(pdf_path, output_folder):
    """Rasterise every page of a PDF and save the pages as PNG files.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_folder: Directory that receives the page images; it is
            created if it does not exist yet.

    Returns:
        List of the saved image paths in page order, named ``page_1.png``,
        ``page_2.png``, ... inside ``output_folder``.
    """
    os.makedirs(output_folder, exist_ok=True)

    image_paths = []
    for page_number, image in enumerate(convert_from_path(pdf_path), start=1):
        image_path = os.path.join(output_folder, f"page_{page_number}.png")
        # Encode as PNG so the file content matches its .png extension; the
        # previous explicit 'JPEG' format wrote lossy JPEG data under a
        # .png name.
        image.save(image_path, 'PNG')
        image_paths.append(image_path)
        print(f"Saved {image_path}")
    return image_paths

def _remove_if_present(path):
    """Delete *path*, ignoring the case where it no longer exists."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def run_GOT(pdf_file):
    """Run GOT-OCR over every page of an uploaded PDF.

    The PDF is copied into ``UPLOAD_FOLDER`` under a unique name, converted
    to one image per page inside a private temporary directory, and each
    page image is passed to ``model.chat_crop`` with rendering enabled. The
    copied PDF, the page images and the rendered HTML files are all removed
    before returning, whether or not processing succeeded.

    Args:
        pdf_file: Filesystem path of the uploaded PDF (the ``gr.File`` input
            is declared with ``type="filepath"``). ``None`` or an empty
            value means nothing was uploaded.

    Returns:
        A ``(json_text, html)`` tuple. On success ``json_text`` is a JSON
        array with one object per page (``page_number``, ``text``, ``html``)
        and ``html`` is the concatenation of the rendered pages. On failure
        ``json_text`` is a JSON object ``{"error": message}`` and ``html``
        is ``None``.
    """
    # The first return value feeds a gr.JSON component, so errors are
    # reported as JSON text too, keeping the payload parseable.
    if not pdf_file:
        return json.dumps({"error": "No PDF file was provided."}), None

    unique_id = str(uuid.uuid4())
    pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
    results = []

    try:
        shutil.copy(pdf_file, pdf_path)

        # Page images get a per-request directory: the fixed page_N.png
        # names would otherwise collide between concurrent requests, and the
        # context manager removes every image even if a page fails.
        with tempfile.TemporaryDirectory(prefix=f"got_{unique_id}_") as page_dir:
            image_paths = convert_pdf_to_images(pdf_path, page_dir)
            for page_number, image_path in enumerate(image_paths, start=1):
                result_path = os.path.join(
                    RESULTS_FOLDER, f"{unique_id}_page_{page_number}.html"
                )
                try:
                    res = model.chat_crop(
                        tokenizer,
                        image_path,
                        ocr_type='format',
                        render=True,
                        save_render_file=result_path,
                    )

                    # Read the rendered HTML content
                    with open(result_path, 'r') as f:
                        html_content = f.read()
                finally:
                    # The HTML is only needed in memory; never leave the
                    # rendered file behind, even when reading it failed.
                    _remove_if_present(result_path)

                results.append({
                    "page_number": page_number,
                    "text": res,
                    "html": html_content
                })
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return json.dumps({"error": str(e)}), None
    finally:
        _remove_if_present(pdf_path)

    html_output = "".join([result["html"] for result in results])
    print("HTML Output:", html_output)  # Debugging print statement
    return json.dumps(results, indent=4), html_output

def cleanup_old_files(max_age_seconds=3600, folders=None):
    """Delete stale files from the working folders.

    Only regular files directly inside each folder are considered;
    sub-directories are left alone and folders that do not exist are
    skipped.

    Args:
        max_age_seconds: A file is deleted when its modification time is
            more than this many seconds in the past. Defaults to one hour.
        folders: Iterable of directory paths to sweep. Defaults to
            ``UPLOAD_FOLDER`` and ``RESULTS_FOLDER``.
    """
    if folders is None:
        folders = [UPLOAD_FOLDER, RESULTS_FOLDER]

    current_time = time.time()
    for folder in folders:
        root = Path(folder)
        if not root.is_dir():
            continue
        for file_path in root.glob('*'):
            try:
                # unlink() cannot remove directories, so restrict the sweep
                # to regular files.
                if not file_path.is_file():
                    continue
                if current_time - file_path.stat().st_mtime > max_age_seconds:
                    file_path.unlink()
            except FileNotFoundError:
                # The entry disappeared between listing and removal (for
                # example a request finished and cleaned up after itself).
                continue

# Build the Gradio UI: a single row with the upload controls in the first
# column and the OCR results in the second.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # type="filepath" means the handler receives a path to the
            # uploaded file.
            pdf_input = gr.File(type="filepath", label="Upload your PDF")
            submit_button = gr.Button("Submit")
        
        with gr.Column():
            # Per-page OCR results as JSON, and the rendered HTML of all pages.
            ocr_result = gr.JSON(label="GOT output")
            html_output = gr.HTML(label="Rendered HTML")

    # Clicking "Submit" calls run_GOT with the uploaded PDF; its two return
    # values populate ocr_result and html_output, in that order.
    submit_button.click(
        run_GOT,
        inputs=[pdf_input],
        outputs=[ocr_result, html_output]
    )

# Script entry point: sweep stale files left in the working folders by
# earlier runs, then start the Gradio app.
if __name__ == "__main__":
    cleanup_old_files()
    demo.launch()