zmbfeng committed on
Commit
aecc042
1 Parent(s): cd77918

figures zip file download implemented

Browse files
Files changed (1) hide show
  1. app.py +82 -2
app.py CHANGED
@@ -6,15 +6,69 @@ import cv2
6
  import os
7
  import io
8
  from PIL import Image
 
9
  import shutil
10
-
 
 
11
  import time
12
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
13
  # poppler-utils:
14
  # Installed: 22.02.0-2ubuntu0.4
15
  # install https://github.com/UB-Mannheim/tesseract/wiki
16
  #page extraction disabled
17
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def is_new_pdf_upload(uploaded_file):
19
  if 'last_pdf_uploaded_file' in st.session_state:
20
  # Check if the newly uploaded file is different from the last one
@@ -130,11 +184,20 @@ if 'page_count' in st.session_state:
130
  read_pdf_progress_bar.progress(progress_percentage)
131
  st.session_state.extracted_text = ""
132
 
 
 
133
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
134
  print("index="+str(index))
135
 
136
  figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
137
  st.session_state.pdf_figures_image_list.append(figures_image_list)
 
 
 
 
 
 
 
138
  pdf_tables_image_list.append(tables_image_list)
139
  st.session_state.pdf_text_list.append(text)
140
  st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
@@ -142,6 +205,7 @@ if 'page_count' in st.session_state:
142
  # print(text)
143
  progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
144
  read_pdf_progress_bar.progress(progress_percentage)
 
145
  #add_animation_to_image()
146
  #st.session_state['video_generated'] = True
147
  st.rerun()
@@ -152,8 +216,21 @@ if 'page_count' in st.session_state:
152
  data=string_buffer.getvalue(),
153
  file_name=txt_file_path,
154
  mime="text/plain")
 
 
 
 
 
 
 
 
 
 
 
 
155
  # st.image(Image.fromarray(bgr_image))
156
  # for index,pdf_text in enumerate(st.session_state.pdf_text_list):
 
157
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
158
  st.write(f"Page {index+1} \n\n {st.session_state.pdf_text_list[index]}\n")
159
  if not st.session_state.pdf_figures_image_list[index]:
@@ -164,6 +241,9 @@ if 'page_count' in st.session_state:
164
  st.image(Image.fromarray(pdf_figure_text_image[1]))
165
 
166
 
 
 
 
167
  # for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
168
  # print("index="+str(index))
169
  #
 
6
  import os
7
  import io
8
  from PIL import Image
9
+ import re
10
  import shutil
11
+ import zipfile
12
+ from io import BytesIO
13
+ temp_figure_dir="pdf_figures/"
14
  import time
15
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
16
  # poppler-utils:
17
  # Installed: 22.02.0-2ubuntu0.4
18
  # install https://github.com/UB-Mannheim/tesseract/wiki
19
  #page extraction disabled
20
def clean_filename(filename, replace_char=' ', os_name=None):
    """Return *filename* with characters that are illegal in file names replaced.

    Args:
        filename: Candidate file name (a single path component, not a path).
        replace_char: Text substituted for every illegal character.
        os_name: Which platform rules to apply, expressed as an ``os.name``
            value: ``'nt'`` selects the Windows rules, anything else the
            POSIX rules. Defaults to the running interpreter's ``os.name``.

    Returns:
        The sanitized name, with runs of spaces collapsed to one space and
        leading/trailing whitespace stripped, or ``None`` when *filename* is
        ``None``, empty, or whitespace-only.
    """
    # Reject missing or blank names up front.
    if not filename or filename.isspace():
        return None

    if os_name is None:
        os_name = os.name

    cleaned_name = filename.strip()  # Trim whitespace from the ends

    if os_name == 'nt':  # Windows
        # This must NOT be a raw string: in a raw literal ``\0`` is a
        # backslash followed by the digit "0", which would strip every "0"
        # from names such as "page_0_fig.png" and leave real NUL bytes alone.
        invalid_chars = '<>:"/\\|?*\0'
        invalid_names = {"CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4",
                         "COM5", "COM6", "COM7", "COM8", "COM9", "LPT1", "LPT2",
                         "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9"}
        # Reserved device names are invalid regardless of extension, so blank
        # out the base name (everything before the first dot).
        base_name, _, ext = cleaned_name.partition('.')
        if base_name.upper() in invalid_names:
            cleaned_name = replace_char * len(base_name) + '.' + ext
    else:  # POSIX (Linux, macOS, etc.)
        invalid_chars = '/\0'

    # Substitute every invalid character.
    for char in invalid_chars:
        cleaned_name = cleaned_name.replace(char, replace_char)

    # Collapse the runs of spaces the substitutions may have produced.
    return re.sub(' +', ' ', cleaned_name).strip()
48
+
49
+
50
def manage_temp_to_be_zipped_directory(directory_path):
    """Leave *directory_path* as a freshly created, empty directory.

    If the path already exists it is removed together with all of its
    contents and then created again; otherwise it is simply created.
    A progress message is printed for each step.
    """
    already_present = os.path.exists(directory_path)

    if already_present:
        # Drop whatever a previous run left behind.
        shutil.rmtree(directory_path)
        print(f"Directory '{directory_path}' was removed.")

    os.makedirs(directory_path)

    if already_present:
        print(f"Directory '{directory_path}' was recreated.")
    else:
        print(f"Directory '{directory_path}' was created.")
63
def zip_directory(directory_path):
    """Compress every file under *directory_path* into an in-memory ZIP.

    Args:
        directory_path: Directory whose files (including those in
            subdirectories) are added to the archive.

    Returns:
        A ``BytesIO`` holding the DEFLATE-compressed archive, positioned at
        offset 0 so it can be read or handed to a download widget directly.
        Member names are paths relative to *directory_path*.
    """
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for root, dirs, files in os.walk(directory_path):
            # Sort in place so traversal (and therefore member order) is
            # deterministic across runs and file systems.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                # Use the path relative to the zipped directory rather than
                # the bare file name: same-named files in different
                # subdirectories would otherwise become duplicate members.
                # For a flat directory this is just the file name.
                arcname = os.path.relpath(file_path, directory_path)
                zip_file.write(file_path, arcname=arcname)
    # Rewind so callers read the archive from the beginning.
    zip_buffer.seek(0)
    return zip_buffer
72
  def is_new_pdf_upload(uploaded_file):
73
  if 'last_pdf_uploaded_file' in st.session_state:
74
  # Check if the newly uploaded file is different from the last one
 
184
  read_pdf_progress_bar.progress(progress_percentage)
185
  st.session_state.extracted_text = ""
186
 
187
+ manage_temp_to_be_zipped_directory(temp_figure_dir)
188
+
189
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
190
  print("index="+str(index))
191
 
192
  figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
193
  st.session_state.pdf_figures_image_list.append(figures_image_list)
194
+ if st.session_state.pdf_figures_image_list[index]:
195
+ for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
196
+ raw_image_file_name = f"page_{index}_{pdf_figure_text_image[0]}.png"
197
+ cleaned_image_file_name = clean_filename(raw_image_file_name)
198
+ Image.fromarray(pdf_figure_text_image[1]).save(temp_figure_dir+cleaned_image_file_name)
199
+
200
+
201
  pdf_tables_image_list.append(tables_image_list)
202
  st.session_state.pdf_text_list.append(text)
203
  st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
 
205
  # print(text)
206
  progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
207
  read_pdf_progress_bar.progress(progress_percentage)
208
+ st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
209
  #add_animation_to_image()
210
  #st.session_state['video_generated'] = True
211
  st.rerun()
 
216
  data=string_buffer.getvalue(),
217
  file_name=txt_file_path,
218
  mime="text/plain")
219
+
220
+ download_figure_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_figures.zip")
221
+
222
+ st.download_button(
223
+ label="Download Figures ZIP",
224
+ data=st.session_state.figure_zip_bytes,
225
+ file_name=download_figure_zip_file_name,
226
+ mime="application/zip"
227
+ )
228
+
229
+
230
+
231
  # st.image(Image.fromarray(bgr_image))
232
  # for index,pdf_text in enumerate(st.session_state.pdf_text_list):
233
+
234
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
235
  st.write(f"Page {index+1} \n\n {st.session_state.pdf_text_list[index]}\n")
236
  if not st.session_state.pdf_figures_image_list[index]:
 
241
  st.image(Image.fromarray(pdf_figure_text_image[1]))
242
 
243
 
244
+
245
+
246
+
247
  # for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
248
  # print("index="+str(index))
249
  #