Spaces:
Sleeping
Sleeping
acharyaaditya26
committed on
Commit
•
82adbca
1
Parent(s):
75d2e17
Update app.py
Browse files
app.py
CHANGED
@@ -63,45 +63,45 @@ def run_GOT(pdf_file):
|
|
63 |
unique_id = str(uuid.uuid4())
|
64 |
pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
|
65 |
shutil.copy(pdf_file, pdf_path)
|
66 |
-
|
67 |
images = pdf_to_images(pdf_path)
|
68 |
results = []
|
69 |
-
|
70 |
try:
|
71 |
for i, image in enumerate(images):
|
72 |
image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
|
73 |
image.save(image_path)
|
74 |
-
|
75 |
result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
|
76 |
-
|
77 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
78 |
-
|
79 |
# Read the rendered HTML content
|
80 |
with open(result_path, 'r') as f:
|
81 |
html_content = f.read()
|
82 |
-
|
83 |
# Parse the HTML and ensure newlines are preserved
|
84 |
soup = BeautifulSoup(html_content, 'html.parser')
|
85 |
-
|
86 |
# Extract the text content and ensure newlines are preserved
|
87 |
-
text_content = soup.find('div', id='content-text').get_text(separator='\n', strip=True)
|
88 |
-
|
89 |
# Modify the HTML to include newlines in the script
|
90 |
script_tag = soup.find('script', string=lambda x: 'const text =' in x)
|
91 |
if script_tag:
|
92 |
# Replace newlines with escaped newlines in the JavaScript string
|
93 |
-
escaped_text = res.replace('\n', '\\n')
|
94 |
script_tag.string = f"const text = \"{escaped_text}\""
|
95 |
-
|
96 |
# Convert the modified BeautifulSoup object back to a string
|
97 |
formatted_html_content = soup.prettify()
|
98 |
-
|
99 |
results.append({
|
100 |
"page_number": i + 1,
|
101 |
"text": res,
|
102 |
"html": formatted_html_content
|
103 |
})
|
104 |
-
|
105 |
if os.path.exists(image_path):
|
106 |
os.remove(image_path)
|
107 |
if os.path.exists(result_path):
|
@@ -111,7 +111,7 @@ def run_GOT(pdf_file):
|
|
111 |
finally:
|
112 |
if os.path.exists(pdf_path):
|
113 |
os.remove(pdf_path)
|
114 |
-
|
115 |
return json.dumps(results, indent=4), results
|
116 |
|
117 |
def cleanup_old_files():
|
@@ -139,11 +139,11 @@ async def upload_file(file: UploadFile = File(...)):
|
|
139 |
temp_pdf_path = os.path.join(temp_dir.name, file.filename)
|
140 |
with open(temp_pdf_path, "wb") as buffer:
|
141 |
buffer.write(await file.read())
|
142 |
-
|
143 |
json_output, results = run_GOT(temp_pdf_path)
|
144 |
temp_dir.cleanup()
|
145 |
-
|
146 |
return results
|
147 |
|
148 |
if __name__ == "__main__":
|
149 |
-
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
63 |
unique_id = str(uuid.uuid4())
|
64 |
pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
|
65 |
shutil.copy(pdf_file, pdf_path)
|
66 |
+
|
67 |
images = pdf_to_images(pdf_path)
|
68 |
results = []
|
69 |
+
|
70 |
try:
|
71 |
for i, image in enumerate(images):
|
72 |
image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
|
73 |
image.save(image_path)
|
74 |
+
|
75 |
result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
|
76 |
+
|
77 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
78 |
+
|
79 |
# Read the rendered HTML content
|
80 |
with open(result_path, 'r') as f:
|
81 |
html_content = f.read()
|
82 |
+
|
83 |
# Parse the HTML and ensure newlines are preserved
|
84 |
soup = BeautifulSoup(html_content, 'html.parser')
|
85 |
+
|
86 |
# Extract the text content and ensure newlines are preserved
|
87 |
+
text_content = soup.find('div', id='content-text').get_text(separator=' ', strip=True)
|
88 |
+
|
89 |
# Modify the HTML to include newlines in the script
|
90 |
script_tag = soup.find('script', string=lambda x: 'const text =' in x)
|
91 |
if script_tag:
|
92 |
# Replace newlines with escaped newlines in the JavaScript string
|
93 |
+
escaped_text = res.replace('\n', ' ')
|
94 |
script_tag.string = f"const text = \"{escaped_text}\""
|
95 |
+
|
96 |
# Convert the modified BeautifulSoup object back to a string
|
97 |
formatted_html_content = soup.prettify()
|
98 |
+
|
99 |
results.append({
|
100 |
"page_number": i + 1,
|
101 |
"text": res,
|
102 |
"html": formatted_html_content
|
103 |
})
|
104 |
+
|
105 |
if os.path.exists(image_path):
|
106 |
os.remove(image_path)
|
107 |
if os.path.exists(result_path):
|
|
|
111 |
finally:
|
112 |
if os.path.exists(pdf_path):
|
113 |
os.remove(pdf_path)
|
114 |
+
|
115 |
return json.dumps(results, indent=4), results
|
116 |
|
117 |
def cleanup_old_files():
|
|
|
139 |
temp_pdf_path = os.path.join(temp_dir.name, file.filename)
|
140 |
with open(temp_pdf_path, "wb") as buffer:
|
141 |
buffer.write(await file.read())
|
142 |
+
|
143 |
json_output, results = run_GOT(temp_pdf_path)
|
144 |
temp_dir.cleanup()
|
145 |
+
|
146 |
return results
|
147 |
|
148 |
if __name__ == "__main__":
|
149 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|