kuroiikimono committed on
Commit
378db40
·
verified ·
1 Parent(s): 837ccfa

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +471 -0
app.py ADDED
@@ -0,0 +1,471 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import zipfile, shutil, time
3
+ import os
4
+ import hashlib
5
+ #from streamlit_pdf_viewer import pdf_viewer
6
+ from streamlit import runtime
7
+ from streamlit.runtime.scriptrunner import get_script_run_ctx
8
+ #from streamlit_js_eval import streamlit_js_eval
9
+ import secrets
10
+ from pypdf import PdfReader
11
+ import glob
12
+
13
def get_remote_ip() -> "str | None":
    """Return the remote IP of the client behind the current script run.

    The lookup goes through ``get_script_run_ctx()`` and
    ``runtime.get_instance().get_client(session_id)``.

    Returns:
        The ``request.remote_ip`` value of the session's client, or ``None``
        when there is no script-run context, no client for the session, or
        any step of the lookup raises.
    """
    # Best effort only: the value is used for display, so any failure in the
    # lookup chain (including the final attribute access, which the previous
    # version left unguarded) degrades to None instead of crashing the page.
    try:
        ctx = get_script_run_ctx()
        if ctx is None:
            return None

        session_info = runtime.get_instance().get_client(ctx.session_id)
        if session_info is None:
            return None

        return session_info.request.remote_ip
    except Exception:
        return None
28
+
29
+
30
+ # colab side make dir
31
def my_makedirs(path):
    """Create directory *path* (and any missing parents) if it does not exist.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    # exist_ok=True replaces the isdir()-then-makedirs() pair, which could
    # raise if another run created the directory between the two calls.
    os.makedirs(path, exist_ok=True)
34
+
35
+
36
# Selectbox label -> language code passed to GoogleTranslator(target=...).
_LANGUAGE_CODES = {
    "Japanese": "ja",
    "English": "en",
    "French": "fr",
    "Chinese traditional": "zh-TW",
    "Chinese simplified": "zh-CN",
    "Russian": "ru",
    "Korean": "ko",
    "Vietnamese": "vi",
    "Thai": "th",
    "Tagalog": "tl",
    "Catalan": "ca",
    "Sinhalese": "si",
}

# First selectbox entry; means "no target language chosen".
_NO_LANGUAGE = "select language"


def _read_pdf_pages(pdf_path):
    """Return ``[(page_index, page_text), ...]`` for the PDF at *pdf_path*.

    A Streamlit progress bar is shown while pages are read and removed
    afterwards.
    """
    doc = PdfReader(pdf_path)
    page_count = len(doc.pages)
    book = []
    my_bar1 = st.empty().progress(0)
    for index, page in enumerate(doc.pages):
        book.append((index, page.extract_text()))
        done = int(((index + 1) / page_count) * 100)
        my_bar1.progress(done, text=f"Reading Page Number : {index + 1}")
    my_bar1.empty()
    return book


def _iter_chunks(page_text, chunk_size=15):
    """Yield ``(segment_index, text)`` chunks of *page_text*.

    The text is split after every "."; each segment has its whitespace runs
    collapsed and its ends trimmed, and segments are concatenated (without a
    separator) in groups of up to *chunk_size*. ``segment_index`` is the index
    of the last segment examined when the chunk is emitted.
    """
    import re

    segments = re.split(r"(?<=\.)", page_text)
    last = len(segments) - 1
    pending = []
    for n, segment in enumerate(segments):
        # Whitespace-only segments still count towards chunk_size (they
        # normalise to "" and add nothing to the text).
        if segment:
            pending.append(" ".join(segment.split()))
        # The end-of-page flush is checked even when the final segment is
        # empty (text ending in "."); skipping it there used to drop the
        # page's last partial chunk.
        if pending and (len(pending) == chunk_size or n == last):
            text = "".join(pending)
            pending.clear()
            if text:
                yield n, text


def _translate_with_retry(text, target, attempts=5, delay=3):
    """Translate *text* to *target*, retrying on any error.

    Returns the translated string (or *text* itself when the translator
    returns ``None``), or ``None`` if every attempt raised.
    """
    from deep_translator import GoogleTranslator

    for _ in range(attempts):
        try:
            translated = GoogleTranslator(source="auto",
                                          target=target).translate(text=text)
        except Exception as exc:
            # Only the translation call is retried, so a failure while
            # writing output can no longer trigger a second translation
            # and duplicate already-written text.
            print(exc)
            time.sleep(delay)
            continue
        return text if translated is None else translated
    return None


def _append_text(path, text):
    """Append *text* to the file at *path*, always encoded as UTF-8."""
    with open(path, "a", encoding="utf-8") as handle:
        handle.write(text)


def main():
    """Render the PDF upload / translate / download page.

    Flow of one script run:
      1. Ensure per-session state and the ``removefolder/<token>`` directory.
      2. Show the uploader; on the first run with a file, extract all page
         texts into ``st.session_state.book``.
      3. Show the target-language selector and the "Reset Language" button.
      4. Without an upload, animate an "upload PDF file" prompt and never
         return from that loop.
      5. With an upload and a newly selected language, translate the pages
         chunk by chunk, write the text files, and zip them.
      6. Offer the zip for download and display the translated text.

    NOTE(review): the nesting of this function was reconstructed from a
    listing that had lost its indentation; confirm the block boundaries
    against the deployed app.py.
    """
    import html
    import re

    if 'uniq' not in st.session_state:
        st.session_state.uniq = secrets.token_urlsafe()

    temp_dir = st.session_state.uniq
    session_root = f"removefolder/{temp_dir}"
    my_makedirs(session_root)

    for key, default in (("count", 0), ("temp", 0), ("lang", ""),
                         ("result", "")):
        if key not in st.session_state:
            st.session_state[key] = default

    obj_0 = st.empty()
    obj_1 = st.empty()

    obj_0.header("`PDF file uploader`")
    st.markdown(f"The remote ip is `{get_remote_ip()}`")

    uploaded_file = obj_1.file_uploader("UPLOAD your .pdf file", type="pdf")

    if uploaded_file is not None:
        st.success("PDF file translator")
        raw_filename = uploaded_file.name
        # The file name comes from the client and is rendered as raw HTML,
        # so it is escaped first.
        intext_0 = ('<span style="color:LavenderBlush;background:Orchid">'
                    f'{html.escape(raw_filename)}</span>')
        st.write(intext_0, unsafe_allow_html=True)
        # The on-disk name is the SHA-1 hex digest of the original file name.
        uploadedfilename = hashlib.sha1(raw_filename.encode()).hexdigest()
        if "uploadedfilename" not in st.session_state:
            st.session_state.uploadedfilename = uploadedfilename

        if "book" not in st.session_state:
            upload_dir = (
                f"{session_root}/upload_folder_{st.session_state.count}")
            my_makedirs(upload_dir)
            pdf_path = f"{upload_dir}/{uploadedfilename}.pdf"
            try:
                with open(pdf_path, 'wb') as file:
                    file.write(uploaded_file.getvalue())
                st.session_state.book = _read_pdf_pages(pdf_path)
            finally:
                # The stored copy is only needed while pages are extracted;
                # remove it even if reading the PDF fails.
                shutil.rmtree(upload_dir, ignore_errors=True)

        reload_bt = st.empty()
        if reload_bt.button("Upload another PDF file"):
            # Drop everything except the run counters; iterate a snapshot
            # because entries are deleted inside the loop.
            for key in list(st.session_state.keys()):
                if key not in ("count", "temp", "lang"):
                    del st.session_state[key]
            shutil.rmtree(session_root)
        st.markdown("----")

    plain_text1 = " 𓃠 select target language 𓃠 "
    select = st.empty()
    select.write(f'##### <span style="color:green">{plain_text1}</span>',
                 unsafe_allow_html=True)

    # Legend of supported targets ("zb-TW" was a typo for "zh-TW").
    st.markdown("`ja`: **Japanese**,\n"
                "`en`: **English**,\n"
                "`fr`: **French**,\n"
                "`zh-TW`: **Chinese (traditional)**,\n"
                "`zh-CN`: **Chinese (simplified)**,\n"
                "`ru`: **Russian**,\n"
                "`ko`: **Korean**,\n"
                "`vi`: **Vietnamese**,\n"
                "`th`: **Thai**,\n"
                "`tl`: **Tagalog**,\n"
                "`ca`: **Catalan**,\n"
                "`si`: **Sinhalese**")
    lang_code = [_NO_LANGUAGE, *_LANGUAGE_CODES]

    # The widget key embeds the run counter, so each completed translation
    # (which increments the counter) yields a fresh, unselected selectbox.
    statename = f"select_lang{st.session_state.count}"
    sel = st.empty()
    language = sel.selectbox('translate to',
                             lang_code,
                             index=0,
                             key=statename)

    if "target_lang" not in st.session_state:
        st.session_state.target_lang = "UNSELECTED"

    def reset_selected_lang():
        st.session_state[statename] = _NO_LANGUAGE

    st.button('Reset Language', on_click=reset_selected_lang)

    area = st.empty()
    if uploaded_file is None:
        # NOTE(review): nothing in this function stores a "select_lang" key
        # (the selectbox key is f"select_lang{count}"), so this reminder
        # never runs as written. Left unchanged; confirm the intended key.
        if ("select_lang" in st.session_state
                and st.session_state.select_lang != _NO_LANGUAGE):
            area2 = st.empty()
            while True:
                area2.write(
                    '<span style="color:#FF69B4">☟Reset Language☟</span>',
                    unsafe_allow_html=True)
                time.sleep(0.9)
                area2.write("☟ ☟")
                time.sleep(0.5)

        # Idle animation. The loop has no exit of its own; presumably it
        # ends when Streamlit interrupts the script for the next rerun.
        while True:
            area.text("𓀀 upload PDF file 𓀀")
            time.sleep(1)
            area.text("𓀥 𓀥")
            time.sleep(0.8)

    # ---- a file is uploaded from here on --------------------------------
    if (statename in st.session_state
            and st.session_state[statename] != _NO_LANGUAGE):
        area.write('<span style="color:gray">▲ `Reset Language`</span>',
                   unsafe_allow_html=True)

    obj_0.empty()
    obj_1.empty()  # uploader hide

    st.markdown("----")
    st.success("translator")

    # "book" can be missing when the reset button cleared it in this run.
    book_data = st.session_state.book if "book" in st.session_state else []
    page_count = len(book_data)
    st.text(f"PDF total pages : {page_count}")

    my_bar = st.empty().progress(0)

    # Make the file name safe to embed in output file names; path separators
    # are replaced too so it cannot point outside the work directory.
    title_name = re.sub(r"""[. %@"'/\\]""", "_", f"{uploaded_file.name}")
    archive_stem = (st.session_state.uploadedfilename
                    if "uploadedfilename" in st.session_state else
                    uploadedfilename)
    download_dir = f"{session_root}/download_section"

    if st.session_state.temp != int(st.session_state.count):
        st.session_state.lang = "init"
        st.session_state.temp = int(st.session_state.count)

    if language not in _LANGUAGE_CODES:
        language = None

    if language is not None and st.session_state.lang != language:
        st.session_state.count += 1
        st.session_state.result = ""
        st.session_state.lang = language

        run_id = st.session_state.count
        work_dir = f"{session_root}/work_{run_id}"
        my_makedirs(work_dir)

        to = _LANGUAGE_CODES[language]
        st.info(f"translate to [ {language} ]")
        st.session_state.target_lang = to

        work_area1 = st.empty()
        work_area2 = st.empty()

        for index, page in enumerate(book_data):
            done = int(((index + 1) / page_count) * 100)
            my_bar.progress(done, text=f"Working Page Number : {index + 1}")

            for n, text_ in _iter_chunks(page[1]):
                all_text_orig = (
                    f":::info\n𓃰{index + 1:05d}-{n};\n:::\n{text_}\n")
                tsd = _translate_with_retry(text_, to)
                if tsd is None:
                    st.warning(
                        f"Translation failed for page {index + 1}, "
                        f"segment {n}; only the original text was saved.")
                else:
                    all_text_done = (
                        f":::info\n𓆏{index + 1:05d}-{n};\n:::\n{tsd}\n")
                    work_area1.write(all_text_orig)
                    work_area2.write(all_text_done)
                    _append_text(
                        f"{work_dir}/reuseMarkdown.txt",
                        all_text_orig + "\n\n" + all_text_done + "\n\n")
                    _append_text(f"{work_dir}/{title_name}_done.txt",
                                 all_text_orig + all_text_done + "\n")
                    _append_text(
                        f"{work_dir}/{title_name}_done_{language}.txt",
                        all_text_done + "\n")
                _append_text(f"{work_dir}/{title_name}_orig.txt",
                             all_text_orig + "\n")

        st.markdown("----")

        my_makedirs(download_dir)
        # reuseMarkdown.txt only exists if at least one chunk was translated.
        markdown_src = f"{work_dir}/reuseMarkdown.txt"
        if os.path.isfile(markdown_src):
            shutil.move(markdown_src,
                        f"{download_dir}/reuseMarkdown_{run_id}.txt")

        shutil.make_archive(f"{download_dir}/{archive_stem}_{run_id}",
                            format='zip',
                            root_dir=work_dir)
        shutil.rmtree(work_dir)

        st.balloons()
        work_area1.empty()
        work_area2.empty()

    # Download / results section: shown whenever the archive for the current
    # run counter exists, so it neither fails before the first translation
    # nor depends on a translation having happened in this very run.
    run_id = st.session_state.count
    zip_name = f"{archive_stem}_{run_id}.zip"
    zip_path = f"{download_dir}/{zip_name}"
    if os.path.isfile(zip_path):
        st.success("Download translated text files")
        st.write(intext_0, unsafe_allow_html=True)
        # After a rerun the selectbox is back to "select language"; fall
        # back to the stored target code for the heading in that case.
        shown_language = (language if language is not None else
                          st.session_state.target_lang)
        plain_text3 = f"[ {shown_language} ] : translated text files"
        translated = st.empty()
        translated.write(
            f'##### <span style="color:#FF69B4">{plain_text3}</span>',
            unsafe_allow_html=True)

        with open(zip_path, "rb") as fpath:
            st.download_button(label="DOWNLOAD .zip file",
                               data=fpath,
                               file_name=zip_name,
                               mime="application/zip")

        st.write('<span style="color:gray">▲ `download zipfile` 𓁉 </span>',
                 unsafe_allow_html=True)

        st.markdown("----")

        plain_text5 = " 𓀑 results 𓁙 "
        st.write(f'##### <span style="color:#20B2AA">{plain_text5}</span>',
                 unsafe_allow_html=True)

        results_path = f"{download_dir}/reuseMarkdown_{run_id}.txt"
        if os.path.isfile(results_path):
            with open(results_path, encoding="utf-8") as tempf:
                all_result = tempf.read()
            st.write(intext_0, unsafe_allow_html=True)
            # NOTE(review): this renders PDF-derived text with
            # unsafe_allow_html=True, as the original did; consider whether
            # raw HTML from uploaded documents should be allowed here.
            st.write(all_result, unsafe_allow_html=True)
470
# Script entry point: build the page only when this file is run directly.
if __name__ == "__main__":
    main()