acharyaaditya26 committed on
Commit
82adbca
1 Parent(s): 75d2e17

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -17
app.py CHANGED
@@ -63,45 +63,45 @@ def run_GOT(pdf_file):
63
  unique_id = str(uuid.uuid4())
64
  pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
65
  shutil.copy(pdf_file, pdf_path)
66
-
67
  images = pdf_to_images(pdf_path)
68
  results = []
69
-
70
  try:
71
  for i, image in enumerate(images):
72
  image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
73
  image.save(image_path)
74
-
75
  result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
76
-
77
  res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
78
-
79
  # Read the rendered HTML content
80
  with open(result_path, 'r') as f:
81
  html_content = f.read()
82
-
83
  # Parse the HTML and ensure newlines are preserved
84
  soup = BeautifulSoup(html_content, 'html.parser')
85
-
86
  # Extract the text content and ensure newlines are preserved
87
- text_content = soup.find('div', id='content-text').get_text(separator='\n', strip=True)
88
-
89
  # Modify the HTML to include newlines in the script
90
  script_tag = soup.find('script', string=lambda x: 'const text =' in x)
91
  if script_tag:
92
  # Replace newlines with escaped newlines in the JavaScript string
93
- escaped_text = res.replace('\n', '\\n')
94
  script_tag.string = f"const text = \"{escaped_text}\""
95
-
96
  # Convert the modified BeautifulSoup object back to a string
97
  formatted_html_content = soup.prettify()
98
-
99
  results.append({
100
  "page_number": i + 1,
101
  "text": res,
102
  "html": formatted_html_content
103
  })
104
-
105
  if os.path.exists(image_path):
106
  os.remove(image_path)
107
  if os.path.exists(result_path):
@@ -111,7 +111,7 @@ def run_GOT(pdf_file):
111
  finally:
112
  if os.path.exists(pdf_path):
113
  os.remove(pdf_path)
114
-
115
  return json.dumps(results, indent=4), results
116
 
117
  def cleanup_old_files():
@@ -139,11 +139,11 @@ async def upload_file(file: UploadFile = File(...)):
139
  temp_pdf_path = os.path.join(temp_dir.name, file.filename)
140
  with open(temp_pdf_path, "wb") as buffer:
141
  buffer.write(await file.read())
142
-
143
  json_output, results = run_GOT(temp_pdf_path)
144
  temp_dir.cleanup()
145
-
146
  return results
147
 
148
  if __name__ == "__main__":
149
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
63
  unique_id = str(uuid.uuid4())
64
  pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
65
  shutil.copy(pdf_file, pdf_path)
66
+
67
  images = pdf_to_images(pdf_path)
68
  results = []
69
+
70
  try:
71
  for i, image in enumerate(images):
72
  image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
73
  image.save(image_path)
74
+
75
  result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
76
+
77
  res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
78
+
79
  # Read the rendered HTML content
80
  with open(result_path, 'r') as f:
81
  html_content = f.read()
82
+
83
  # Parse the HTML and ensure newlines are preserved
84
  soup = BeautifulSoup(html_content, 'html.parser')
85
+
86
  # Extract the text content, joining the text fragments with single spaces
87
+ text_content = soup.find('div', id='content-text').get_text(separator=' ', strip=True)
88
+
89
  # Modify the HTML so the text embedded in the script has its newlines replaced by spaces
90
  script_tag = soup.find('script', string=lambda x: 'const text =' in x)
91
  if script_tag:
92
  # Replace newlines with spaces in the JavaScript string
93
+ escaped_text = res.replace('\n', ' ')
94
  script_tag.string = f"const text = \"{escaped_text}\""
95
+
96
  # Convert the modified BeautifulSoup object back to a string
97
  formatted_html_content = soup.prettify()
98
+
99
  results.append({
100
  "page_number": i + 1,
101
  "text": res,
102
  "html": formatted_html_content
103
  })
104
+
105
  if os.path.exists(image_path):
106
  os.remove(image_path)
107
  if os.path.exists(result_path):
 
111
  finally:
112
  if os.path.exists(pdf_path):
113
  os.remove(pdf_path)
114
+
115
  return json.dumps(results, indent=4), results
116
 
117
  def cleanup_old_files():
 
139
  temp_pdf_path = os.path.join(temp_dir.name, file.filename)
140
  with open(temp_pdf_path, "wb") as buffer:
141
  buffer.write(await file.read())
142
+
143
  json_output, results = run_GOT(temp_pdf_path)
144
  temp_dir.cleanup()
145
+
146
  return results
147
 
148
  if __name__ == "__main__":
149
+ uvicorn.run(app, host="0.0.0.0", port=8000)