got_ocr_test

Sleeping

acharyaaditya26 committed on Dec 3, 2024

Commit

dd6ac97

verified ·

1 Parent(s): 05f8689

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -18,6 +18,7 @@ from pathlib import Path
 import json
 from starlette.requests import Request
 import uvicorn
 app = FastAPI()
@@ -79,10 +80,26 @@ def run_GOT(pdf_file):
             with open(result_path, 'r') as f:
                 html_content = f.read()
             results.append({
                 "page_number": i + 1,
                 "text": res,
-                "html": html_content
             })
             if os.path.exists(image_path):

 import json
 from starlette.requests import Request
 import uvicorn
+from bs4 import BeautifulSoup
 app = FastAPI()
             with open(result_path, 'r') as f:
                 html_content = f.read()
+            # Parse the HTML and ensure newlines are preserved
+            soup = BeautifulSoup(html_content, 'html.parser')
+            # Extract the text content and ensure newlines are preserved
+            text_content = soup.find('div', id='content-text').get_text(separator='\n', strip=True)
+            # Modify the HTML to include newlines in the script
+            script_tag = soup.find('script', string=lambda x: 'const text =' in x)
+            if script_tag:
+                # Replace newlines with escaped newlines in the JavaScript string
+                escaped_text = res.replace('\n', '\\n')
+                script_tag.string = f"const text = \"{escaped_text}\""
+            # Convert the modified BeautifulSoup object back to a string
+            formatted_html_content = soup.prettify()
             results.append({
                 "page_number": i + 1,
                 "text": res,
+                "html": formatted_html_content
             })
             if os.path.exists(image_path):