Tonic committed on
Commit
d1a7b5b
·
unverified ·
1 Parent(s): eeaf186

improve latex parsing

Browse files
Files changed (1) hide show
  1. app.py +23 -14
app.py CHANGED
@@ -133,34 +133,43 @@ def update_inputs(task):
133
  ]
134
 
135
  def parse_latex_output(res):
136
- lines = res.split('\n')
 
137
  parsed_lines = []
138
  in_latex = False
139
-
 
140
  for line in lines:
141
- line = line.strip()
142
- if not line:
 
 
 
143
  continue
 
 
144
 
145
- latex_patterns = [r'\{', r'\}', r'\[', r'\]', r'\\', r'\$', r'_', r'^']
146
  contains_latex = any(re.search(pattern, line) for pattern in latex_patterns)
147
 
148
- is_key_value = ':' in line and not line.startswith('{') and not line.endswith('}')
149
-
150
- if contains_latex or is_key_value:
151
  if not in_latex:
152
- parsed_lines.append('$$')
153
  in_latex = True
154
- parsed_lines.append(line)
 
155
  else:
156
  if in_latex:
157
- parsed_lines.append('$$')
 
158
  in_latex = False
 
159
  parsed_lines.append(line)
 
160
  if in_latex:
161
- parsed_lines.append('$$')
162
-
163
- return '\n'.join(parsed_lines)
 
164
 
165
  def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
166
  res, html_content, unique_id = process_image(image, task, ocr_type, ocr_box, ocr_color)
 
133
  ]
134
 
135
  def parse_latex_output(res):
136
+ # Split the input, preserving newlines and empty lines
137
+ lines = re.split(r'(\$\$.*?\$\$)', res, flags=re.DOTALL)
138
  parsed_lines = []
139
  in_latex = False
140
+ latex_buffer = []
141
+
142
  for line in lines:
143
+ if line == '\n':
144
+ if in_latex:
145
+ latex_buffer.append(line)
146
+ else:
147
+ parsed_lines.append(line)
148
  continue
149
+
150
+ line = line.strip()
151
 
152
+ latex_patterns = [r'\{', r'\}', r'\[', r'\]', r'\\', r'\$', r'_', r'^', r'"']
153
  contains_latex = any(re.search(pattern, line) for pattern in latex_patterns)
154
 
155
+ if contains_latex:
 
 
156
  if not in_latex:
 
157
  in_latex = True
158
+ latex_buffer = ['$$']
159
+ latex_buffer.append(line)
160
  else:
161
  if in_latex:
162
+ latex_buffer.append('$$')
163
+ parsed_lines.extend(latex_buffer)
164
  in_latex = False
165
+ latex_buffer = []
166
  parsed_lines.append(line)
167
+
168
  if in_latex:
169
+ latex_buffer.append('$$')
170
+ parsed_lines.extend(latex_buffer)
171
+
172
+ return '$$\n$$'.join(parsed_lines)
173
 
174
  def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
175
  res, html_content, unique_id = process_image(image, task, ocr_type, ocr_box, ocr_color)