zmbfeng committed on
Commit
e82c65e
1 Parent(s): aecc042

tables zip file download implemented

Browse files
Files changed (1) hide show
  1. app.py +23 -5
app.py CHANGED
@@ -11,6 +11,7 @@ import shutil
11
  import zipfile
12
  from io import BytesIO
13
  temp_figure_dir="pdf_figures/"
 
14
  import time
15
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
16
  # poppler-utils:
@@ -168,6 +169,7 @@ if 'page_count' in st.session_state:
168
  st.session_state.color_image_list = []
169
  st.session_state.gray_image_np_list = []
170
  st.session_state.pdf_figures_image_list=[]
 
171
  pdf_tables_image_list=[]
172
  st.session_state.pdf_text_list=[]
173
 
@@ -185,20 +187,24 @@ if 'page_count' in st.session_state:
185
  st.session_state.extracted_text = ""
186
 
187
  manage_temp_to_be_zipped_directory(temp_figure_dir)
188
-
189
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
190
  print("index="+str(index))
191
 
192
  figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
193
  st.session_state.pdf_figures_image_list.append(figures_image_list)
 
194
  if st.session_state.pdf_figures_image_list[index]:
195
  for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
196
  raw_image_file_name = f"page_{index}_{pdf_figure_text_image[0]}.png"
197
  cleaned_image_file_name = clean_filename(raw_image_file_name)
198
  Image.fromarray(pdf_figure_text_image[1]).save(temp_figure_dir+cleaned_image_file_name)
 
 
 
 
 
199
 
200
-
201
- pdf_tables_image_list.append(tables_image_list)
202
  st.session_state.pdf_text_list.append(text)
203
  st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
204
  # st.write(text)
@@ -206,6 +212,7 @@ if 'page_count' in st.session_state:
206
  progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
207
  read_pdf_progress_bar.progress(progress_percentage)
208
  st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
 
209
  #add_animation_to_image()
210
  #st.session_state['video_generated'] = True
211
  st.rerun()
@@ -218,7 +225,7 @@ if 'page_count' in st.session_state:
218
  mime="text/plain")
219
 
220
  download_figure_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_figures.zip")
221
-
222
  st.download_button(
223
  label="Download Figures ZIP",
224
  data=st.session_state.figure_zip_bytes,
@@ -226,7 +233,12 @@ if 'page_count' in st.session_state:
226
  mime="application/zip"
227
  )
228
 
229
-
 
 
 
 
 
230
 
231
  # st.image(Image.fromarray(bgr_image))
232
  # for index,pdf_text in enumerate(st.session_state.pdf_text_list):
@@ -239,6 +251,12 @@ if 'page_count' in st.session_state:
239
  for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
240
  st.write(pdf_figure_text_image[0])
241
  st.image(Image.fromarray(pdf_figure_text_image[1]))
 
 
 
 
 
 
242
 
243
 
244
 
 
11
  import zipfile
12
  from io import BytesIO
13
  temp_figure_dir="pdf_figures/"
14
+ temp_table_dir="pdf_tables/"
15
  import time
16
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
17
  # poppler-utils:
 
169
  st.session_state.color_image_list = []
170
  st.session_state.gray_image_np_list = []
171
  st.session_state.pdf_figures_image_list=[]
172
+ st.session_state.pdf_tables_image_list = []
173
  pdf_tables_image_list=[]
174
  st.session_state.pdf_text_list=[]
175
 
 
187
  st.session_state.extracted_text = ""
188
 
189
  manage_temp_to_be_zipped_directory(temp_figure_dir)
190
+ manage_temp_to_be_zipped_directory(temp_table_dir)
191
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
192
  print("index="+str(index))
193
 
194
  figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
195
  st.session_state.pdf_figures_image_list.append(figures_image_list)
196
+ st.session_state.pdf_tables_image_list.append(tables_image_list)
197
  if st.session_state.pdf_figures_image_list[index]:
198
  for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
199
  raw_image_file_name = f"page_{index}_{pdf_figure_text_image[0]}.png"
200
  cleaned_image_file_name = clean_filename(raw_image_file_name)
201
  Image.fromarray(pdf_figure_text_image[1]).save(temp_figure_dir+cleaned_image_file_name)
202
+ if st.session_state.pdf_tables_image_list:
203
+ for pdf_table_text_image in st.session_state.pdf_tables_image_list[index]:
204
+ raw_image_file_name = f"page_{index}_{pdf_table_text_image[0]}.png"
205
+ cleaned_image_file_name = clean_filename(raw_image_file_name)
206
+ Image.fromarray(pdf_table_text_image[1]).save(temp_table_dir + cleaned_image_file_name)
207
 
 
 
208
  st.session_state.pdf_text_list.append(text)
209
  st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
210
  # st.write(text)
 
212
  progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
213
  read_pdf_progress_bar.progress(progress_percentage)
214
  st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
215
+ st.session_state.table_zip_bytes = zip_directory(temp_table_dir)
216
  #add_animation_to_image()
217
  #st.session_state['video_generated'] = True
218
  st.rerun()
 
225
  mime="text/plain")
226
 
227
  download_figure_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_figures.zip")
228
+ download_table_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_tables.zip")
229
  st.download_button(
230
  label="Download Figures ZIP",
231
  data=st.session_state.figure_zip_bytes,
 
233
  mime="application/zip"
234
  )
235
 
236
+ st.download_button(
237
+ label="Download Tables ZIP",
238
+ data=st.session_state.table_zip_bytes,
239
+ file_name=download_table_zip_file_name,
240
+ mime="application/zip"
241
+ )
242
 
243
  # st.image(Image.fromarray(bgr_image))
244
  # for index,pdf_text in enumerate(st.session_state.pdf_text_list):
 
251
  for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
252
  st.write(pdf_figure_text_image[0])
253
  st.image(Image.fromarray(pdf_figure_text_image[1]))
254
+ if not st.session_state.pdf_tables_image_list[index]:
255
+ st.write("no tables")
256
+ else:
257
+ for pdf_table_text_image in st.session_state.pdf_tables_image_list[index]:
258
+ st.write(pdf_table_text_image[0])
259
+ st.image(Image.fromarray(pdf_table_text_image[1]))
260
 
261
 
262