acharyaaditya26 committed on
Commit
dd6ac97
1 Parent(s): 05f8689

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -1
app.py CHANGED
@@ -18,6 +18,7 @@ from pathlib import Path
18
  import json
19
  from starlette.requests import Request
20
  import uvicorn
 
21
 
22
  app = FastAPI()
23
 
@@ -79,10 +80,26 @@ def run_GOT(pdf_file):
79
  with open(result_path, 'r') as f:
80
  html_content = f.read()
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  results.append({
83
  "page_number": i + 1,
84
  "text": res,
85
- "html": html_content
86
  })
87
 
88
  if os.path.exists(image_path):
 
18
  import json
19
  from starlette.requests import Request
20
  import uvicorn
21
+ from bs4 import BeautifulSoup
22
 
23
  app = FastAPI()
24
 
 
80
  with open(result_path, 'r') as f:
81
  html_content = f.read()
82
 
83
+ # Parse the HTML so the content div and the inline script can be located
84
+ soup = BeautifulSoup(html_content, 'html.parser')
85
+
86
+ # Extract the text content and ensure newlines are preserved
87
+ text_content = soup.find('div', id='content-text').get_text(separator='\n', strip=True)
88
+
89
+ # Modify the HTML to include newlines in the script
90
+ script_tag = soup.find('script', string=lambda x: 'const text =' in x)
91
+ if script_tag:
92
+ # Replace newlines with escaped newlines in the JavaScript string
93
+ escaped_text = res.replace('\n', '\\n')
94
+ script_tag.string = f"const text = \"{escaped_text}\""
95
+
96
+ # Convert the modified BeautifulSoup object back to a string
97
+ formatted_html_content = soup.prettify()
98
+
99
  results.append({
100
  "page_number": i + 1,
101
  "text": res,
102
+ "html": formatted_html_content
103
  })
104
 
105
  if os.path.exists(image_path):