Spaces:
Sleeping
Sleeping
acharyaaditya26
committed on
Commit
•
664cbfd
1
Parent(s):
e76c9d1
Update app.py
Browse files
app.py
CHANGED
@@ -76,30 +76,9 @@ def run_GOT(pdf_file):
|
|
76 |
|
77 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
78 |
|
79 |
-
# Read the rendered HTML content
|
80 |
-
with open(result_path, 'r') as f:
|
81 |
-
html_content = f.read()
|
82 |
-
|
83 |
-
# Parse the HTML and ensure newlines are preserved
|
84 |
-
soup = BeautifulSoup(html_content, 'html.parser')
|
85 |
-
|
86 |
-
# Extract the text content and ensure newlines are preserved
|
87 |
-
text_content = soup.find('div', id='content-text').get_text(separator=' ', strip=True)
|
88 |
-
|
89 |
-
# Modify the HTML to include newlines in the script
|
90 |
-
script_tag = soup.find('script', string=lambda x: 'const text =' in x)
|
91 |
-
if script_tag:
|
92 |
-
# Replace newlines with escaped newlines in the JavaScript string
|
93 |
-
escaped_text = res.replace('\n', ' ')
|
94 |
-
script_tag.string = f"const text = \"{escaped_text}\""
|
95 |
-
|
96 |
-
# Convert the modified BeautifulSoup object back to a string
|
97 |
-
formatted_html_content = soup.prettify()
|
98 |
-
|
99 |
results.append({
|
100 |
"page_number": i + 1,
|
101 |
-
"text": res
|
102 |
-
"html": formatted_html_content
|
103 |
})
|
104 |
|
105 |
if os.path.exists(image_path):
|
|
|
76 |
|
77 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
results.append({
|
80 |
"page_number": i + 1,
|
81 |
+
"text": res # Directly use the output from model.chat_crop
|
|
|
82 |
})
|
83 |
|
84 |
if os.path.exists(image_path):
|