Spaces:
Sleeping
Sleeping
acharyaaditya26
committed on
Commit
•
82adbca
1
Parent(s):
75d2e17
Update app.py
Browse files
app.py
CHANGED
@@ -63,45 +63,45 @@ def run_GOT(pdf_file):
|
|
63 |
unique_id = str(uuid.uuid4())
|
64 |
pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
|
65 |
shutil.copy(pdf_file, pdf_path)
|
66 |
-
|
67 |
images = pdf_to_images(pdf_path)
|
68 |
results = []
|
69 |
-
|
70 |
try:
|
71 |
for i, image in enumerate(images):
|
72 |
image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
|
73 |
image.save(image_path)
|
74 |
-
|
75 |
result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
|
76 |
-
|
77 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
78 |
-
|
79 |
# Read the rendered HTML content
|
80 |
with open(result_path, 'r') as f:
|
81 |
html_content = f.read()
|
82 |
-
|
83 |
# Parse the HTML and ensure newlines are preserved
|
84 |
soup = BeautifulSoup(html_content, 'html.parser')
|
85 |
-
|
86 |
# Extract the text content and ensure newlines are preserved
|
87 |
-
text_content = soup.find('div', id='content-text').get_text(separator='\n', strip=True)
|
88 |
-
|
89 |
# Modify the HTML to include newlines in the script
|
90 |
script_tag = soup.find('script', string=lambda x: 'const text =' in x)
|
91 |
if script_tag:
|
92 |
# Replace newlines with escaped newlines in the JavaScript string
|
93 |
-
escaped_text = res.replace('\n', '\\n')
|
94 |
script_tag.string = f"const text = \"{escaped_text}\""
|
95 |
-
|
96 |
# Convert the modified BeautifulSoup object back to a string
|
97 |
formatted_html_content = soup.prettify()
|
98 |
-
|
99 |
results.append({
|
100 |
"page_number": i + 1,
|
101 |
"text": res,
|
102 |
"html": formatted_html_content
|
103 |
})
|
104 |
-
|
105 |
if os.path.exists(image_path):
|
106 |
os.remove(image_path)
|
107 |
if os.path.exists(result_path):
|
@@ -111,7 +111,7 @@ def run_GOT(pdf_file):
|
|
111 |
finally:
|
112 |
if os.path.exists(pdf_path):
|
113 |
os.remove(pdf_path)
|
114 |
-
|
115 |
return json.dumps(results, indent=4), results
|
116 |
|
117 |
def cleanup_old_files():
|
@@ -139,11 +139,11 @@ async def upload_file(file: UploadFile = File(...)):
|
|
139 |
temp_pdf_path = os.path.join(temp_dir.name, file.filename)
|
140 |
with open(temp_pdf_path, "wb") as buffer:
|
141 |
buffer.write(await file.read())
|
142 |
-
|
143 |
json_output, results = run_GOT(temp_pdf_path)
|
144 |
temp_dir.cleanup()
|
145 |
-
|
146 |
return results
|
147 |
|
148 |
if __name__ == "__main__":
|
149 |
-
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
63 |
unique_id = str(uuid.uuid4())
|
64 |
pdf_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}.pdf")
|
65 |
shutil.copy(pdf_file, pdf_path)
|
66 |
+
|
67 |
images = pdf_to_images(pdf_path)
|
68 |
results = []
|
69 |
+
|
70 |
try:
|
71 |
for i, image in enumerate(images):
|
72 |
image_path = os.path.join(UPLOAD_FOLDER, f"{unique_id}_page_{i+1}.png")
|
73 |
image.save(image_path)
|
74 |
+
|
75 |
result_path = os.path.join(RESULTS_FOLDER, f"{unique_id}_page_{i+1}.html")
|
76 |
+
|
77 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
78 |
+
|
79 |
# Read the rendered HTML content
|
80 |
with open(result_path, 'r') as f:
|
81 |
html_content = f.read()
|
82 |
+
|
83 |
# Parse the HTML and ensure newlines are preserved
|
84 |
soup = BeautifulSoup(html_content, 'html.parser')
|
85 |
+
|
86 |
# Extract the text content and ensure newlines are preserved
|
87 |
+
text_content = soup.find('div', id='content-text').get_text(separator=' ', strip=True)
|
88 |
+
|
89 |
# Modify the HTML to include newlines in the script
|
90 |
script_tag = soup.find('script', string=lambda x: 'const text =' in x)
|
91 |
if script_tag:
|
92 |
# Replace newlines with escaped newlines in the JavaScript string
|
93 |
+
escaped_text = res.replace('\n', ' ')
|
94 |
script_tag.string = f"const text = \"{escaped_text}\""
|
95 |
+
|
96 |
# Convert the modified BeautifulSoup object back to a string
|
97 |
formatted_html_content = soup.prettify()
|
98 |
+
|
99 |
results.append({
|
100 |
"page_number": i + 1,
|
101 |
"text": res,
|
102 |
"html": formatted_html_content
|
103 |
})
|
104 |
+
|
105 |
if os.path.exists(image_path):
|
106 |
os.remove(image_path)
|
107 |
if os.path.exists(result_path):
|
|
|
111 |
finally:
|
112 |
if os.path.exists(pdf_path):
|
113 |
os.remove(pdf_path)
|
114 |
+
|
115 |
return json.dumps(results, indent=4), results
|
116 |
|
117 |
def cleanup_old_files():
|
|
|
139 |
temp_pdf_path = os.path.join(temp_dir.name, file.filename)
|
140 |
with open(temp_pdf_path, "wb") as buffer:
|
141 |
buffer.write(await file.read())
|
142 |
+
|
143 |
json_output, results = run_GOT(temp_pdf_path)
|
144 |
temp_dir.cleanup()
|
145 |
+
|
146 |
return results
|
147 |
|
148 |
if __name__ == "__main__":
|
149 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|