acharyaaditya26 committed on
Commit
e6c69f5
·
verified ·
1 Parent(s): 063bf5b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -25
app.py CHANGED
@@ -12,11 +12,6 @@ import time
12
  import shutil
13
  from pathlib import Path
14
  import json
15
- import logging
16
-
17
- # Set up logging
18
- logging.basicConfig(level=logging.INFO)
19
- logger = logging.getLogger(__name__)
20
 
21
  # Load tokenizer and model
22
  tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
@@ -53,7 +48,6 @@ def run_GOT(pdf_file):
53
 
54
  images = pdf_to_images(pdf_path)
55
  results = []
56
- html_content = ""
57
 
58
  try:
59
  for i, image in enumerate(images):
@@ -62,40 +56,29 @@ def run_GOT(pdf_file):
62
 
63
  result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
64
 
65
- logger.info(f"Processing page {i+1}...")
66
  res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
67
 
68
  # Read the rendered HTML content
69
- if os.path.exists(result_path):
70
- with open(result_path, 'r', encoding='utf-8') as f:
71
- page_html_content = f.read()
72
- logger.info(f"HTML content for page {i+1} read successfully.")
73
- else:
74
- logger.error(f"HTML file for page {i+1} not found at {result_path}.")
75
- page_html_content = ""
76
 
77
  results.append({
78
  "page_number": i + 1,
79
  "text": res,
80
- "html": page_html_content
81
  })
82
 
83
- html_content += f"<h2>Page {i + 1}</h2>"
84
- html_content += page_html_content + "<br><hr><br>" # Add a separator between pages
85
-
86
  if os.path.exists(image_path):
87
  os.remove(image_path)
88
  if os.path.exists(result_path):
89
  os.remove(result_path)
90
  except Exception as e:
91
- logger.error(f"Error occurred: {str(e)}")
92
  return f"Error: {str(e)}", None
93
  finally:
94
  if os.path.exists(pdf_path):
95
  os.remove(pdf_path)
96
 
97
- print(html_content) # Debug: Check the content of html_content
98
- return json.dumps(results, indent=4), html_content
99
 
100
  def cleanup_old_files():
101
  current_time = time.time()
@@ -111,13 +94,12 @@ with gr.Blocks() as demo:
111
  submit_button = gr.Button("Submit")
112
 
113
  with gr.Column():
114
- ocr_result = gr.JSON(label="GOT output (JSON)")
115
- html_result = gr.HTML(label="GOT output (HTML)", sanitize=False)
116
-
117
  submit_button.click(
118
  run_GOT,
119
  inputs=[pdf_input],
120
- outputs=[ocr_result, html_result]
121
  )
122
 
123
  if __name__ == "__main__":
 
12
  import shutil
13
  from pathlib import Path
14
  import json
 
 
 
 
 
15
 
16
  # Load tokenizer and model
17
  tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
 
48
 
49
  images = pdf_to_images(pdf_path)
50
  results = []
 
51
 
52
  try:
53
  for i, image in enumerate(images):
 
56
 
57
  result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
58
 
 
59
  res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
60
 
61
  # Read the rendered HTML content
62
+ with open(result_path, 'r') as f:
63
+ html_content = f.read()
 
 
 
 
 
64
 
65
  results.append({
66
  "page_number": i + 1,
67
  "text": res,
68
+ "html": html_content
69
  })
70
 
 
 
 
71
  if os.path.exists(image_path):
72
  os.remove(image_path)
73
  if os.path.exists(result_path):
74
  os.remove(result_path)
75
  except Exception as e:
 
76
  return f"Error: {str(e)}", None
77
  finally:
78
  if os.path.exists(pdf_path):
79
  os.remove(pdf_path)
80
 
81
+ return json.dumps(results, indent=4)
 
82
 
83
  def cleanup_old_files():
84
  current_time = time.time()
 
94
  submit_button = gr.Button("Submit")
95
 
96
  with gr.Column():
97
+ ocr_result = gr.JSON(label="GOT output")
98
+
 
99
  submit_button.click(
100
  run_GOT,
101
  inputs=[pdf_input],
102
+ outputs=[ocr_result]
103
  )
104
 
105
  if __name__ == "__main__":