acharyaaditya26 committed on
Commit
063bf5b
·
verified ·
1 Parent(s): 3c5823e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -17
app.py CHANGED
@@ -50,39 +50,39 @@ def run_GOT(pdf_file):
50
  unique_id = str(uuid.uuid4())
51
  pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
52
  shutil.copy(pdf_file, pdf_path)
53
-
54
  images = pdf_to_images(pdf_path)
55
  results = []
56
- html_content = "<html><body>"
57
-
58
  try:
59
  for i, image in enumerate(images):
60
  image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
61
  image.save(image_path)
62
-
63
  result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
64
-
65
  logger.info(f"Processing page {i+1}...")
66
  res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
67
-
68
  # Read the rendered HTML content
69
  if os.path.exists(result_path):
70
- with open(result_path, 'r') as f:
71
  page_html_content = f.read()
72
  logger.info(f"HTML content for page {i+1} read successfully.")
73
  else:
74
  logger.error(f"HTML file for page {i+1} not found at {result_path}.")
75
  page_html_content = ""
76
-
77
  results.append({
78
  "page_number": i + 1,
79
  "text": res,
80
  "html": page_html_content
81
  })
82
-
83
  html_content += f"<h2>Page {i + 1}</h2>"
84
  html_content += page_html_content + "<br><hr><br>" # Add a separator between pages
85
-
86
  if os.path.exists(image_path):
87
  os.remove(image_path)
88
  if os.path.exists(result_path):
@@ -93,9 +93,8 @@ def run_GOT(pdf_file):
93
  finally:
94
  if os.path.exists(pdf_path):
95
  os.remove(pdf_path)
96
-
97
- html_content += "</body></html>"
98
- logger.info(f"Final HTML content: {html_content}") # Log the final HTML content for debugging
99
  return json.dumps(results, indent=4), html_content
100
 
101
  def cleanup_old_files():
@@ -110,11 +109,11 @@ with gr.Blocks() as demo:
110
  with gr.Column():
111
  pdf_input = gr.File(type="filepath", label="Upload your PDF")
112
  submit_button = gr.Button("Submit")
113
-
114
  with gr.Column():
115
  ocr_result = gr.JSON(label="GOT output (JSON)")
116
- html_result = gr.HTML(label="GOT output (HTML)")
117
-
118
  submit_button.click(
119
  run_GOT,
120
  inputs=[pdf_input],
@@ -123,4 +122,4 @@ with gr.Blocks() as demo:
123
 
124
  if __name__ == "__main__":
125
  cleanup_old_files()
126
- demo.launch()
 
50
  unique_id = str(uuid.uuid4())
51
  pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
52
  shutil.copy(pdf_file, pdf_path)
53
+
54
  images = pdf_to_images(pdf_path)
55
  results = []
56
+ html_content = ""
57
+
58
  try:
59
  for i, image in enumerate(images):
60
  image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
61
  image.save(image_path)
62
+
63
  result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
64
+
65
  logger.info(f"Processing page {i+1}...")
66
  res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
67
+
68
  # Read the rendered HTML content
69
  if os.path.exists(result_path):
70
+ with open(result_path, 'r', encoding='utf-8') as f:
71
  page_html_content = f.read()
72
  logger.info(f"HTML content for page {i+1} read successfully.")
73
  else:
74
  logger.error(f"HTML file for page {i+1} not found at {result_path}.")
75
  page_html_content = ""
76
+
77
  results.append({
78
  "page_number": i + 1,
79
  "text": res,
80
  "html": page_html_content
81
  })
82
+
83
  html_content += f"<h2>Page {i + 1}</h2>"
84
  html_content += page_html_content + "<br><hr><br>" # Add a separator between pages
85
+
86
  if os.path.exists(image_path):
87
  os.remove(image_path)
88
  if os.path.exists(result_path):
 
93
  finally:
94
  if os.path.exists(pdf_path):
95
  os.remove(pdf_path)
96
+
97
+ print(html_content) # Debug: Check the content of html_content
 
98
  return json.dumps(results, indent=4), html_content
99
 
100
  def cleanup_old_files():
 
109
  with gr.Column():
110
  pdf_input = gr.File(type="filepath", label="Upload your PDF")
111
  submit_button = gr.Button("Submit")
112
+
113
  with gr.Column():
114
  ocr_result = gr.JSON(label="GOT output (JSON)")
115
+ html_result = gr.HTML(label="GOT output (HTML)", sanitize=False)
116
+
117
  submit_button.click(
118
  run_GOT,
119
  inputs=[pdf_input],
 
122
 
123
  if __name__ == "__main__":
124
  cleanup_old_files()
125
+ demo.launch()