anmolsahai committed on
Commit
28cc4a3
1 Parent(s): 507a7c5
Files changed (1) hide show
  1. app.py +2 -2
app.py CHANGED
@@ -16,8 +16,8 @@ def pdf_to_text_with_layout(pdf_file):
16
  return "\n".join(text)
17
 
18
  def clean_text(text):
19
- # Remove non-XML-compatible characters
20
- return re.sub(r'[^\x09\x0A\x0D\x20-\xD7FF\xE000-\xFFFD\x10000-\x10FFFF]', '', text)
21
 
22
  def text_to_word_with_formatting(text, word_path):
23
  doc = Document()
 
16
  return "\n".join(text)
17
 
18
  def clean_text(text):
19
+ # Remove non-ASCII and control characters
20
+ return ''.join(c for c in text if c.isprintable() and ord(c) < 65536)
21
 
22
  def text_to_word_with_formatting(text, word_path):
23
  doc = Document()