acharyaaditya26 committed on
Commit
664cbfd
1 Parent(s): e76c9d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -22
app.py CHANGED
@@ -76,30 +76,9 @@ def run_GOT(pdf_file):
76
 
77
  res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
78
 
79
- # Read the rendered HTML content
80
- with open(result_path, 'r') as f:
81
- html_content = f.read()
82
-
83
- # Parse the HTML and ensure newlines are preserved
84
- soup = BeautifulSoup(html_content, 'html.parser')
85
-
86
- # Extract the text content and ensure newlines are preserved
87
- text_content = soup.find('div', id='content-text').get_text(separator=' ', strip=True)
88
-
89
- # Modify the HTML to include newlines in the script
90
- script_tag = soup.find('script', string=lambda x: 'const text =' in x)
91
- if script_tag:
92
- # Replace newlines with escaped newlines in the JavaScript string
93
- escaped_text = res.replace('\n', ' ')
94
- script_tag.string = f"const text = \"{escaped_text}\""
95
-
96
- # Convert the modified BeautifulSoup object back to a string
97
- formatted_html_content = soup.prettify()
98
-
99
  results.append({
100
  "page_number": i + 1,
101
- "text": res.replace('\n', ' '), # Remove newlines from the text
102
- "html": formatted_html_content
103
  })
104
 
105
  if os.path.exists(image_path):
 
76
 
77
  res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  results.append({
80
  "page_number": i + 1,
81
+ "text": res # Directly use the output from model.chat_crop
 
82
  })
83
 
84
  if os.path.exists(image_path):