got_ocr_test

Sleeping

acharyaaditya26 committed on 25 days ago

Commit

664cbfd

•

1 Parent(s): e76c9d1

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -76,30 +76,9 @@ def run_GOT(pdf_file):
             res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
-            # Read the rendered HTML content
-            with open(result_path, 'r') as f:
-                html_content = f.read()
-            # Parse the HTML and ensure newlines are preserved
-            soup = BeautifulSoup(html_content, 'html.parser')
-            # Extract the text content and ensure newlines are preserved
-            text_content = soup.find('div', id='content-text').get_text(separator=' ', strip=True)
-            # Modify the HTML to include newlines in the script
-            script_tag = soup.find('script', string=lambda x: 'const text =' in x)
-            if script_tag:
-                # Replace newlines with escaped newlines in the JavaScript string
-                escaped_text = res.replace('\n', ' ')
-                script_tag.string = f"const text = \"{escaped_text}\""
-            # Convert the modified BeautifulSoup object back to a string
-            formatted_html_content = soup.prettify()
             results.append({
                 "page_number": i + 1,
-                "text": res.replace('\n', ' '),  # Remove newlines from the text
-                "html": formatted_html_content
             })
             if os.path.exists(image_path):

             res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
             results.append({
                 "page_number": i + 1,
+                "text": res  # Directly use the output from model.chat_crop
             })
             if os.path.exists(image_path):