Spaces:
Running
Running
kuroiikimono
committed on
Commit
•
e3e8dac
1
Parent(s):
c48b00b
Update app.py
Browse files
app.py
CHANGED
@@ -176,15 +176,41 @@ This translation app is useful for people who want to translate something or wan
|
|
176 |
book = [] # PDF text data pool
|
177 |
progressbar1 = st.empty()
|
178 |
my_bar1 = progressbar1.progress(0)
|
179 |
-
|
|
|
|
|
180 |
for index, page in enumerate(doc):
|
181 |
#page_text = page.extract_text()
|
182 |
-
page_text = page.get_text(sort=True)
|
183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
done = int(((index + 1) / page_count) * 100)
|
185 |
my_bar1.progress(done,
|
186 |
text=f"Reading Page Number : {index + 1}")
|
187 |
doc.close()
|
|
|
188 |
st.session_state.book = book
|
189 |
my_bar1.empty()
|
190 |
if os.path.isfile(
|
|
|
176 |
book = [] # PDF text data pool
|
177 |
progressbar1 = st.empty()
|
178 |
my_bar1 = progressbar1.progress(0)
|
179 |
+
|
180 |
+
from bs4 import BeautifulSoup
|
181 |
+
xml_line_blocks = st.empty()
|
182 |
for index, page in enumerate(doc):
|
183 |
#page_text = page.extract_text()
|
184 |
+
#page_text = page.get_text(sort=True)
|
185 |
+
blocks = page.get_text("xml")
|
186 |
+
soup = BeautifulSoup(blocks,'lxml-xml')
|
187 |
+
page_text2 = ""
|
188 |
+
|
189 |
+
for tag0 in soup.find_all("block"):
|
190 |
+
temp_y_posi = 0.0
|
191 |
+
for tag1 in tag0.find_all("line"):
|
192 |
+
for tag2 in tag1.find_all("font"):
|
193 |
+
for tag3 in tag2.find_all("char"):
|
194 |
+
y_posi = tag3.get("y")
|
195 |
+
if y_posi != temp_y_posi:
|
196 |
+
page_text2 += "\n"
|
197 |
+
temp_y_posi = y_posi
|
198 |
+
page_text2 += tag3.get("c")
|
199 |
+
|
200 |
+
xml_line_blocks.write(page_text2)
|
201 |
+
time.sleep(0.05)
|
202 |
+
#for index, page in enumerate(doc.pages):
|
203 |
+
#for index, page in enumerate(doc):
|
204 |
+
# #page_text = page.extract_text()
|
205 |
+
# page_text = page.get_text(sort=True)
|
206 |
+
# book.append((index, page_text))
|
207 |
+
book.append((index, page_text2))
|
208 |
+
|
209 |
done = int(((index + 1) / page_count) * 100)
|
210 |
my_bar1.progress(done,
|
211 |
text=f"Reading Page Number : {index + 1}")
|
212 |
doc.close()
|
213 |
+
xml_line_blocks.empty()
|
214 |
st.session_state.book = book
|
215 |
my_bar1.empty()
|
216 |
if os.path.isfile(
|