Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -50,39 +50,39 @@ def run_GOT(pdf_file):
|
|
50 |
unique_id = str(uuid.uuid4())
|
51 |
pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
|
52 |
shutil.copy(pdf_file, pdf_path)
|
53 |
-
|
54 |
images = pdf_to_images(pdf_path)
|
55 |
results = []
|
56 |
-
html_content = "
|
57 |
-
|
58 |
try:
|
59 |
for i, image in enumerate(images):
|
60 |
image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
|
61 |
image.save(image_path)
|
62 |
-
|
63 |
result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
|
64 |
-
|
65 |
logger.info(f"Processing page {i+1}...")
|
66 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
67 |
-
|
68 |
# Read the rendered HTML content
|
69 |
if os.path.exists(result_path):
|
70 |
-
with open(result_path, 'r') as f:
|
71 |
page_html_content = f.read()
|
72 |
logger.info(f"HTML content for page {i+1} read successfully.")
|
73 |
else:
|
74 |
logger.error(f"HTML file for page {i+1} not found at {result_path}.")
|
75 |
page_html_content = ""
|
76 |
-
|
77 |
results.append({
|
78 |
"page_number": i + 1,
|
79 |
"text": res,
|
80 |
"html": page_html_content
|
81 |
})
|
82 |
-
|
83 |
html_content += f"<h2>Page {i + 1}</h2>"
|
84 |
html_content += page_html_content + "<br><hr><br>" # Add a separator between pages
|
85 |
-
|
86 |
if os.path.exists(image_path):
|
87 |
os.remove(image_path)
|
88 |
if os.path.exists(result_path):
|
@@ -93,9 +93,8 @@ def run_GOT(pdf_file):
|
|
93 |
finally:
|
94 |
if os.path.exists(pdf_path):
|
95 |
os.remove(pdf_path)
|
96 |
-
|
97 |
-
html_content
|
98 |
-
logger.info(f"Final HTML content: {html_content}") # Log the final HTML content for debugging
|
99 |
return json.dumps(results, indent=4), html_content
|
100 |
|
101 |
def cleanup_old_files():
|
@@ -110,11 +109,11 @@ with gr.Blocks() as demo:
|
|
110 |
with gr.Column():
|
111 |
pdf_input = gr.File(type="filepath", label="Upload your PDF")
|
112 |
submit_button = gr.Button("Submit")
|
113 |
-
|
114 |
with gr.Column():
|
115 |
ocr_result = gr.JSON(label="GOT output (JSON)")
|
116 |
-
html_result = gr.HTML(label="GOT output (HTML)")
|
117 |
-
|
118 |
submit_button.click(
|
119 |
run_GOT,
|
120 |
inputs=[pdf_input],
|
@@ -123,4 +122,4 @@ with gr.Blocks() as demo:
|
|
123 |
|
124 |
if __name__ == "__main__":
|
125 |
cleanup_old_files()
|
126 |
-
demo.launch()
|
|
|
50 |
unique_id = str(uuid.uuid4())
|
51 |
pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
|
52 |
shutil.copy(pdf_file, pdf_path)
|
53 |
+
|
54 |
images = pdf_to_images(pdf_path)
|
55 |
results = []
|
56 |
+
html_content = ""
|
57 |
+
|
58 |
try:
|
59 |
for i, image in enumerate(images):
|
60 |
image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
|
61 |
image.save(image_path)
|
62 |
+
|
63 |
result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
|
64 |
+
|
65 |
logger.info(f"Processing page {i+1}...")
|
66 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
67 |
+
|
68 |
# Read the rendered HTML content
|
69 |
if os.path.exists(result_path):
|
70 |
+
with open(result_path, 'r', encoding='utf-8') as f:
|
71 |
page_html_content = f.read()
|
72 |
logger.info(f"HTML content for page {i+1} read successfully.")
|
73 |
else:
|
74 |
logger.error(f"HTML file for page {i+1} not found at {result_path}.")
|
75 |
page_html_content = ""
|
76 |
+
|
77 |
results.append({
|
78 |
"page_number": i + 1,
|
79 |
"text": res,
|
80 |
"html": page_html_content
|
81 |
})
|
82 |
+
|
83 |
html_content += f"<h2>Page {i + 1}</h2>"
|
84 |
html_content += page_html_content + "<br><hr><br>" # Add a separator between pages
|
85 |
+
|
86 |
if os.path.exists(image_path):
|
87 |
os.remove(image_path)
|
88 |
if os.path.exists(result_path):
|
|
|
93 |
finally:
|
94 |
if os.path.exists(pdf_path):
|
95 |
os.remove(pdf_path)
|
96 |
+
|
97 |
+
print(html_content) # Debug: Check the content of html_content
|
|
|
98 |
return json.dumps(results, indent=4), html_content
|
99 |
|
100 |
def cleanup_old_files():
|
|
|
109 |
with gr.Column():
|
110 |
pdf_input = gr.File(type="filepath", label="Upload your PDF")
|
111 |
submit_button = gr.Button("Submit")
|
112 |
+
|
113 |
with gr.Column():
|
114 |
ocr_result = gr.JSON(label="GOT output (JSON)")
|
115 |
+
html_result = gr.HTML(label="GOT output (HTML)", sanitize=False)
|
116 |
+
|
117 |
submit_button.click(
|
118 |
run_GOT,
|
119 |
inputs=[pdf_input],
|
|
|
122 |
|
123 |
if __name__ == "__main__":
|
124 |
cleanup_old_files()
|
125 |
+
demo.launch()
|