Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -12,6 +12,11 @@ import time
|
|
12 |
import shutil
|
13 |
from pathlib import Path
|
14 |
import json
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
# Load tokenizer and model
|
17 |
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
|
@@ -45,40 +50,53 @@ def run_GOT(pdf_file):
|
|
45 |
unique_id = str(uuid.uuid4())
|
46 |
pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
|
47 |
shutil.copy(pdf_file, pdf_path)
|
48 |
-
|
49 |
images = pdf_to_images(pdf_path)
|
50 |
results = []
|
51 |
-
|
|
|
52 |
try:
|
53 |
for i, image in enumerate(images):
|
54 |
image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
|
55 |
image.save(image_path)
|
56 |
-
|
57 |
result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
|
58 |
-
|
|
|
59 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
60 |
-
|
61 |
# Read the rendered HTML content
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
65 |
results.append({
|
66 |
"page_number": i + 1,
|
67 |
"text": res,
|
68 |
-
"html":
|
69 |
})
|
70 |
-
|
|
|
|
|
|
|
71 |
if os.path.exists(image_path):
|
72 |
os.remove(image_path)
|
73 |
if os.path.exists(result_path):
|
74 |
os.remove(result_path)
|
75 |
except Exception as e:
|
|
|
76 |
return f"Error: {str(e)}", None
|
77 |
finally:
|
78 |
if os.path.exists(pdf_path):
|
79 |
os.remove(pdf_path)
|
80 |
-
|
81 |
-
|
|
|
|
|
82 |
|
83 |
def cleanup_old_files():
|
84 |
current_time = time.time()
|
@@ -92,16 +110,17 @@ with gr.Blocks() as demo:
|
|
92 |
with gr.Column():
|
93 |
pdf_input = gr.File(type="filepath", label="Upload your PDF")
|
94 |
submit_button = gr.Button("Submit")
|
95 |
-
|
96 |
with gr.Column():
|
97 |
-
ocr_result = gr.JSON(label="GOT output")
|
|
|
98 |
|
99 |
submit_button.click(
|
100 |
run_GOT,
|
101 |
inputs=[pdf_input],
|
102 |
-
outputs=[ocr_result]
|
103 |
)
|
104 |
|
105 |
if __name__ == "__main__":
|
106 |
cleanup_old_files()
|
107 |
-
demo.launch()
|
|
|
12 |
import shutil
|
13 |
from pathlib import Path
|
14 |
import json
|
15 |
+
import logging
|
16 |
+
|
17 |
+
# Set up logging
|
18 |
+
logging.basicConfig(level=logging.INFO)
|
19 |
+
logger = logging.getLogger(__name__)
|
20 |
|
21 |
# Load tokenizer and model
|
22 |
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
|
|
|
50 |
unique_id = str(uuid.uuid4())
|
51 |
pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
|
52 |
shutil.copy(pdf_file, pdf_path)
|
53 |
+
|
54 |
images = pdf_to_images(pdf_path)
|
55 |
results = []
|
56 |
+
html_content = "<html><body>"
|
57 |
+
|
58 |
try:
|
59 |
for i, image in enumerate(images):
|
60 |
image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
|
61 |
image.save(image_path)
|
62 |
+
|
63 |
result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
|
64 |
+
|
65 |
+
logger.info(f"Processing page {i+1}...")
|
66 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
67 |
+
|
68 |
# Read the rendered HTML content
|
69 |
+
if os.path.exists(result_path):
|
70 |
+
with open(result_path, 'r') as f:
|
71 |
+
page_html_content = f.read()
|
72 |
+
logger.info(f"HTML content for page {i+1} read successfully.")
|
73 |
+
else:
|
74 |
+
logger.error(f"HTML file for page {i+1} not found at {result_path}.")
|
75 |
+
page_html_content = ""
|
76 |
+
|
77 |
results.append({
|
78 |
"page_number": i + 1,
|
79 |
"text": res,
|
80 |
+
"html": page_html_content
|
81 |
})
|
82 |
+
|
83 |
+
html_content += f"<h2>Page {i + 1}</h2>"
|
84 |
+
html_content += page_html_content + "<br><hr><br>" # Add a separator between pages
|
85 |
+
|
86 |
if os.path.exists(image_path):
|
87 |
os.remove(image_path)
|
88 |
if os.path.exists(result_path):
|
89 |
os.remove(result_path)
|
90 |
except Exception as e:
|
91 |
+
logger.error(f"Error occurred: {str(e)}")
|
92 |
return f"Error: {str(e)}", None
|
93 |
finally:
|
94 |
if os.path.exists(pdf_path):
|
95 |
os.remove(pdf_path)
|
96 |
+
|
97 |
+
html_content += "</body></html>"
|
98 |
+
logger.info(f"Final HTML content: {html_content}") # Log the final HTML content for debugging
|
99 |
+
return json.dumps(results, indent=4), html_content
|
100 |
|
101 |
def cleanup_old_files():
|
102 |
current_time = time.time()
|
|
|
110 |
with gr.Column():
|
111 |
pdf_input = gr.File(type="filepath", label="Upload your PDF")
|
112 |
submit_button = gr.Button("Submit")
|
113 |
+
|
114 |
with gr.Column():
|
115 |
+
ocr_result = gr.JSON(label="GOT output (JSON)")
|
116 |
+
html_result = gr.HTML(label="GOT output (HTML)")
|
117 |
|
118 |
submit_button.click(
|
119 |
run_GOT,
|
120 |
inputs=[pdf_input],
|
121 |
+
outputs=[ocr_result, html_result]
|
122 |
)
|
123 |
|
124 |
if __name__ == "__main__":
|
125 |
cleanup_old_files()
|
126 |
+
demo.launch()
|