kuroiikimono committed on
Commit
e3e8dac
·
verified ·
1 Parent(s): c48b00b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -3
app.py CHANGED
@@ -176,15 +176,41 @@ This translation app is useful for people who want to translate something or wan
176
  book = [] # PDF text data pool
177
  progressbar1 = st.empty()
178
  my_bar1 = progressbar1.progress(0)
179
- #for index, page in enumerate(doc.pages):
 
 
180
  for index, page in enumerate(doc):
181
  #page_text = page.extract_text()
182
- page_text = page.get_text(sort=True)
183
- book.append((index, page_text))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  done = int(((index + 1) / page_count) * 100)
185
  my_bar1.progress(done,
186
  text=f"Reading Page Number : {index + 1}")
187
  doc.close()
 
188
  st.session_state.book = book
189
  my_bar1.empty()
190
  if os.path.isfile(
 
176
  book = [] # PDF text data pool
177
  progressbar1 = st.empty()
178
  my_bar1 = progressbar1.progress(0)
179
+
180
+ from bs4 import BeautifulSoup
181
+ xml_line_blocks = st.empty()
182
  for index, page in enumerate(doc):
183
  #page_text = page.extract_text()
184
+ #page_text = page.get_text(sort=True)
185
+ blocks = page.get_text("xml")
186
+ soup = BeautifulSoup(blocks,'lxml-xml')
187
+ page_text2 = ""
188
+
189
+ for tag0 in soup.find_all("block"):
190
+ temp_y_posi = 0.0
191
+ for tag1 in tag0.find_all("line"):
192
+ for tag2 in tag1.find_all("font"):
193
+ for tag3 in tag2.find_all("char"):
194
+ y_posi = tag3.get("y")
195
+ if y_posi != temp_y_posi:
196
+ page_text2 += "\n"
197
+ temp_y_posi = y_posi
198
+ page_text2 += tag3.get("c")
199
+
200
+ xml_line_blocks.write(page_text2)
201
+ time.sleep(0.05)
202
+ #for index, page in enumerate(doc.pages):
203
+ #for index, page in enumerate(doc):
204
+ # #page_text = page.extract_text()
205
+ # page_text = page.get_text(sort=True)
206
+ # book.append((index, page_text))
207
+ book.append((index, page_text2))
208
+
209
  done = int(((index + 1) / page_count) * 100)
210
  my_bar1.progress(done,
211
  text=f"Reading Page Number : {index + 1}")
212
  doc.close()
213
+ xml_line_blocks.empty()
214
  st.session_state.book = book
215
  my_bar1.empty()
216
  if os.path.isfile(