import gradio as gr
import fitz  # PyMuPDF
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import numpy as np
import os
import base64
import io
import uuid
import tempfile
import time
import shutil
from pathlib import Path
import json
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the GOT-OCR2.0 tokenizer and model (requires a CUDA-capable GPU)
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True)
model = model.eval().cuda()

UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"

# Ensure directories exist
for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
    os.makedirs(folder, exist_ok=True)

def image_to_base64(image):
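    """Encode a PIL image as a base64 PNG string (helper; not called by the app below)."""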
    buffered = io.BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()

def pdf_to_images(pdf_path):
    """Render each page of a PDF to a PIL image using PyMuPDF."""
    images = []
    pdf_document = fitz.open(pdf_path)
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        images.append(img)
    pdf_document.close()
    return images

def run_GOT(pdf_file):
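    """OCR every page of the uploaded PDF with GOT-OCR2.0 and return
    (JSON string of per-page results, combined HTML for display).
    Intermediate page images and render files are deleted as each page finishes."""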
    unique_id = str(uuid.uuid4())
    pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
    shutil.copy(pdf_file, pdf_path)
    
    images = pdf_to_images(pdf_path)
    results = []
    html_content = ""
    
    try:
        for i, image in enumerate(images):
            image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
            image.save(image_path)
            
            result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
            
            logger.info(f"Processing page {i+1}...")
            res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
            
            # Read the rendered HTML content
            if os.path.exists(result_path):
                with open(result_path, 'r', encoding='utf-8') as f:
                    page_html_content = f.read()
                logger.info(f"HTML content for page {i+1} read successfully.")
            else:
                logger.error(f"HTML file for page {i+1} not found at {result_path}.")
                page_html_content = ""
            
            results.append({
                "page_number": i + 1,
                "text": res,
                "html": page_html_content
            })
            
            html_content += f"<h2>Page {i + 1}</h2>"
            html_content += page_html_content + "<br><hr><br>"  # Add a separator between pages
            
            if os.path.exists(image_path):
                os.remove(image_path)
            if os.path.exists(result_path):
                os.remove(result_path)
    except Exception as e:
        logger.error(f"Error occurred: {str(e)}")
        return f"Error: {str(e)}", None
    finally:
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
    
    print(html_content)  # Debug: Check the content of html_content
    return json.dumps(results, indent=4), html_content

def cleanup_old_files():
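    """Delete uploaded PDFs and rendered results older than one hour."""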
    current_time = time.time()
    for folder in [UPLOAD_FOLDER, RESULTS_FOLDER]:
        for file_path in Path(folder).glob('*'):
            if current_time - file_path.stat().st_mtime > 3600:  # 1 hour
                file_path.unlink()

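# Gradio UI: PDF upload and submit button on the left, JSON and rendered HTML output on the right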
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            pdf_input = gr.File(type="filepath", label="Upload your PDF")
            submit_button = gr.Button("Submit")
        
        with gr.Column():
            ocr_result = gr.JSON(label="GOT output (JSON)")
            html_result = gr.HTML(label="GOT output (HTML)", sanitize=False)
    
    submit_button.click(
        run_GOT,
        inputs=[pdf_input],
        outputs=[ocr_result, html_result]
    )

if __name__ == "__main__":
    cleanup_old_files()
    demo.launch()