Spaces:
Sleeping
Sleeping
acharyaaditya26
committed on
Commit
•
dd6ac97
1
Parent(s):
05f8689
Update app.py
Browse files
app.py
CHANGED
@@ -18,6 +18,7 @@ from pathlib import Path
|
|
18 |
import json
|
19 |
from starlette.requests import Request
|
20 |
import uvicorn
|
|
|
21 |
|
22 |
app = FastAPI()
|
23 |
|
@@ -79,10 +80,26 @@ def run_GOT(pdf_file):
|
|
79 |
with open(result_path, 'r') as f:
|
80 |
html_content = f.read()
|
81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
results.append({
|
83 |
"page_number": i + 1,
|
84 |
"text": res,
|
85 |
-
"html":
|
86 |
})
|
87 |
|
88 |
if os.path.exists(image_path):
|
|
|
18 |
import json
|
19 |
from starlette.requests import Request
|
20 |
import uvicorn
|
21 |
+
from bs4 import BeautifulSoup
|
22 |
|
23 |
app = FastAPI()
|
24 |
|
|
|
80 |
with open(result_path, 'r') as f:
|
81 |
html_content = f.read()
|
82 |
|
83 |
+
# Parse the HTML and ensure newlines are preserved
|
84 |
+
soup = BeautifulSoup(html_content, 'html.parser')
|
85 |
+
|
86 |
+
# Extract the text content and ensure newlines are preserved
|
87 |
+
text_content = soup.find('div', id='content-text').get_text(separator='\n', strip=True)
|
88 |
+
|
89 |
+
# Modify the HTML to include newlines in the script
|
90 |
+
script_tag = soup.find('script', string=lambda x: 'const text =' in x)
|
91 |
+
if script_tag:
|
92 |
+
# Replace newlines with escaped newlines in the JavaScript string
|
93 |
+
escaped_text = res.replace('\n', '\\n')
|
94 |
+
script_tag.string = f"const text = \"{escaped_text}\""
|
95 |
+
|
96 |
+
# Convert the modified BeautifulSoup object back to a string
|
97 |
+
formatted_html_content = soup.prettify()
|
98 |
+
|
99 |
results.append({
|
100 |
"page_number": i + 1,
|
101 |
"text": res,
|
102 |
+
"html": formatted_html_content
|
103 |
})
|
104 |
|
105 |
if os.path.exists(image_path):
|