acharyaaditya26 committed on
Commit
3c5823e
·
verified ·
1 Parent(s): 6101e9e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -16
app.py CHANGED
@@ -12,6 +12,11 @@ import time
12
  import shutil
13
  from pathlib import Path
14
  import json
 
 
 
 
 
15
 
16
  # Load tokenizer and model
17
  tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
@@ -45,40 +50,53 @@ def run_GOT(pdf_file):
45
  unique_id = str(uuid.uuid4())
46
  pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
47
  shutil.copy(pdf_file, pdf_path)
48
-
49
  images = pdf_to_images(pdf_path)
50
  results = []
51
-
 
52
  try:
53
  for i, image in enumerate(images):
54
  image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
55
  image.save(image_path)
56
-
57
  result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
58
-
 
59
  res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
60
-
61
  # Read the rendered HTML content
62
- with open(result_path, 'r') as f:
63
- html_content = f.read()
64
-
 
 
 
 
 
65
  results.append({
66
  "page_number": i + 1,
67
  "text": res,
68
- "html": html_content
69
  })
70
-
 
 
 
71
  if os.path.exists(image_path):
72
  os.remove(image_path)
73
  if os.path.exists(result_path):
74
  os.remove(result_path)
75
  except Exception as e:
 
76
  return f"Error: {str(e)}", None
77
  finally:
78
  if os.path.exists(pdf_path):
79
  os.remove(pdf_path)
80
-
81
- return json.dumps(results, indent=4)
 
 
82
 
83
  def cleanup_old_files():
84
  current_time = time.time()
@@ -92,16 +110,17 @@ with gr.Blocks() as demo:
92
  with gr.Column():
93
  pdf_input = gr.File(type="filepath", label="Upload your PDF")
94
  submit_button = gr.Button("Submit")
95
-
96
  with gr.Column():
97
- ocr_result = gr.JSON(label="GOT output")
 
98
 
99
  submit_button.click(
100
  run_GOT,
101
  inputs=[pdf_input],
102
- outputs=[ocr_result]
103
  )
104
 
105
  if __name__ == "__main__":
106
  cleanup_old_files()
107
- demo.launch()
 
12
  import shutil
13
  from pathlib import Path
14
  import json
15
+ import logging
16
+
17
+ # Set up logging
18
+ logging.basicConfig(level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
 
21
  # Load tokenizer and model
22
  tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
 
50
  unique_id = str(uuid.uuid4())
51
  pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
52
  shutil.copy(pdf_file, pdf_path)
53
+
54
  images = pdf_to_images(pdf_path)
55
  results = []
56
+ html_content = "<html><body>"
57
+
58
  try:
59
  for i, image in enumerate(images):
60
  image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
61
  image.save(image_path)
62
+
63
  result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
64
+
65
+ logger.info(f"Processing page {i+1}...")
66
  res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
67
+
68
  # Read the rendered HTML content
69
+ if os.path.exists(result_path):
70
+ with open(result_path, 'r') as f:
71
+ page_html_content = f.read()
72
+ logger.info(f"HTML content for page {i+1} read successfully.")
73
+ else:
74
+ logger.error(f"HTML file for page {i+1} not found at {result_path}.")
75
+ page_html_content = ""
76
+
77
  results.append({
78
  "page_number": i + 1,
79
  "text": res,
80
+ "html": page_html_content
81
  })
82
+
83
+ html_content += f"<h2>Page {i + 1}</h2>"
84
+ html_content += page_html_content + "<br><hr><br>" # Add a separator between pages
85
+
86
  if os.path.exists(image_path):
87
  os.remove(image_path)
88
  if os.path.exists(result_path):
89
  os.remove(result_path)
90
  except Exception as e:
91
+ logger.error(f"Error occurred: {str(e)}")
92
  return f"Error: {str(e)}", None
93
  finally:
94
  if os.path.exists(pdf_path):
95
  os.remove(pdf_path)
96
+
97
+ html_content += "</body></html>"
98
+ logger.info(f"Final HTML content: {html_content}") # Log the final HTML content for debugging
99
+ return json.dumps(results, indent=4), html_content
100
 
101
  def cleanup_old_files():
102
  current_time = time.time()
 
110
  with gr.Column():
111
  pdf_input = gr.File(type="filepath", label="Upload your PDF")
112
  submit_button = gr.Button("Submit")
113
+
114
  with gr.Column():
115
+ ocr_result = gr.JSON(label="GOT output (JSON)")
116
+ html_result = gr.HTML(label="GOT output (HTML)")
117
 
118
  submit_button.click(
119
  run_GOT,
120
  inputs=[pdf_input],
121
+ outputs=[ocr_result, html_result]
122
  )
123
 
124
  if __name__ == "__main__":
125
  cleanup_old_files()
126
+ demo.launch()