Spaces:
Sleeping
Sleeping
figures zip file download implemented
Browse files
app.py
CHANGED
@@ -6,15 +6,69 @@ import cv2
|
|
6 |
import os
|
7 |
import io
|
8 |
from PIL import Image
|
|
|
9 |
import shutil
|
10 |
-
|
|
|
|
|
11 |
import time
|
12 |
# get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
|
13 |
# poppler-utils:
|
14 |
# Installed: 22.02.0-2ubuntu0.4
|
15 |
# install https://github.com/UB-Mannheim/tesseract/wiki
|
16 |
#page extraction disabled
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
def is_new_pdf_upload(uploaded_file):
|
19 |
if 'last_pdf_uploaded_file' in st.session_state:
|
20 |
# Check if the newly uploaded file is different from the last one
|
@@ -130,11 +184,20 @@ if 'page_count' in st.session_state:
|
|
130 |
read_pdf_progress_bar.progress(progress_percentage)
|
131 |
st.session_state.extracted_text = ""
|
132 |
|
|
|
|
|
133 |
for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
|
134 |
print("index="+str(index))
|
135 |
|
136 |
figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
137 |
st.session_state.pdf_figures_image_list.append(figures_image_list)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
pdf_tables_image_list.append(tables_image_list)
|
139 |
st.session_state.pdf_text_list.append(text)
|
140 |
st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
|
@@ -142,6 +205,7 @@ if 'page_count' in st.session_state:
|
|
142 |
# print(text)
|
143 |
progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
|
144 |
read_pdf_progress_bar.progress(progress_percentage)
|
|
|
145 |
#add_animation_to_image()
|
146 |
#st.session_state['video_generated'] = True
|
147 |
st.rerun()
|
@@ -152,8 +216,21 @@ if 'page_count' in st.session_state:
|
|
152 |
data=string_buffer.getvalue(),
|
153 |
file_name=txt_file_path,
|
154 |
mime="text/plain")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
# st.image(Image.fromarray(bgr_image))
|
156 |
# for index,pdf_text in enumerate(st.session_state.pdf_text_list):
|
|
|
157 |
for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
|
158 |
st.write(f"Page {index+1} \n\n {st.session_state.pdf_text_list[index]}\n")
|
159 |
if not st.session_state.pdf_figures_image_list[index]:
|
@@ -164,6 +241,9 @@ if 'page_count' in st.session_state:
|
|
164 |
st.image(Image.fromarray(pdf_figure_text_image[1]))
|
165 |
|
166 |
|
|
|
|
|
|
|
167 |
# for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
|
168 |
# print("index="+str(index))
|
169 |
#
|
|
|
6 |
import os
|
7 |
import io
|
8 |
from PIL import Image
|
9 |
+
import re
|
10 |
import shutil
|
11 |
+
import zipfile
|
12 |
+
from io import BytesIO
|
13 |
+
temp_figure_dir="pdf_figures/"
|
14 |
import time
|
15 |
# get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
|
16 |
# poppler-utils:
|
17 |
# Installed: 22.02.0-2ubuntu0.4
|
18 |
# install https://github.com/UB-Mannheim/tesseract/wiki
|
19 |
#page extraction disabled
|
20 |
+
def clean_filename(filename, replace_char=' ', os_name=None):
    """Return *filename* with characters that are illegal in file names replaced.

    Args:
        filename: Proposed file name (not a path). May be ``None``.
        replace_char: String substituted for every illegal character.
        os_name: Platform whose rules apply, in ``os.name`` notation
            (``'nt'`` for Windows, anything else for POSIX). Defaults to the
            platform the code is running on.

    Returns:
        The cleaned name with runs of spaces collapsed and surrounding
        whitespace stripped, or ``None`` when *filename* is ``None``, empty
        or whitespace-only.
    """
    # Check for empty filename or None
    if not filename or filename.isspace():
        return None

    if os_name is None:
        os_name = os.name

    cleaned_name = filename.strip()  # Trim whitespace from the ends

    # Platform-specific checks and clean-up
    if os_name == 'nt':  # Windows
        # Deliberately NOT a raw string: '\\' must be one backslash and '\0'
        # the NUL character. As a raw string, '\0' would be a backslash plus
        # the digit '0', which strips every zero from the name.
        invalid_chars = '<>:"/\\|?*\0'
        invalid_names = {"CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4",
                         "COM5", "COM6", "COM7", "COM8", "COM9", "LPT1", "LPT2",
                         "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9"}
        # Reserved device names are blanked out with the replacement string.
        # Keeping the partition separator avoids adding a trailing '.' when
        # the name has no extension.
        base_name, dot, ext = cleaned_name.partition('.')
        if base_name.upper() in invalid_names:
            cleaned_name = replace_char * len(base_name) + dot + ext
    else:  # POSIX (Linux, macOS, etc.)
        invalid_chars = '/\0'

    # Replace every invalid character in a single pass.
    cleaned_name = cleaned_name.translate(
        {ord(char): replace_char for char in invalid_chars}
    )

    # Collapse repeated spaces and strip leading/trailing whitespace.
    cleaned_name = re.sub(' +', ' ', cleaned_name).strip()

    return cleaned_name
|
48 |
+
|
49 |
+
|
50 |
+
def manage_temp_to_be_zipped_directory(directory_path):
    """Ensure *directory_path* exists as an empty directory.

    Anything already at the path is deleted first: a directory is removed
    with all its contents, and a stray regular file or dangling symlink is
    unlinked so that it cannot block the directory from being created.

    Args:
        directory_path: Path of the scratch directory to (re)create.
    """
    # lexists() also sees dangling symlinks, which exists() reports as absent.
    already_present = os.path.lexists(directory_path)

    if already_present:
        if os.path.isdir(directory_path):
            # Remove the directory and all its contents
            shutil.rmtree(directory_path)
        else:
            # rmtree() refuses non-directories; unlink the entry instead.
            os.remove(directory_path)
        print(f"Directory '{directory_path}' was removed.")

    os.makedirs(directory_path)
    if already_present:
        print(f"Directory '{directory_path}' was recreated.")
    else:
        print(f"Directory '{directory_path}' was created.")
|
63 |
+
def zip_directory(directory_path):
    """Compress every file under *directory_path* into an in-memory ZIP.

    Files are stored under their path relative to *directory_path*, so files
    with the same name in different subdirectories do not collide. For a flat
    directory the archive member names are just the file names.

    Args:
        directory_path: Root directory whose files are added to the archive.

    Returns:
        A ``BytesIO`` holding the finished archive, positioned at offset 0.
    """
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for root, dirs, files in os.walk(directory_path):
            # Sorting in place makes the traversal (and thus the archive
            # member order) deterministic across file systems.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, directory_path)
                zip_file.write(file_path, arcname=arcname)
    zip_buffer.seek(0)
    return zip_buffer
|
72 |
def is_new_pdf_upload(uploaded_file):
|
73 |
if 'last_pdf_uploaded_file' in st.session_state:
|
74 |
# Check if the newly uploaded file is different from the last one
|
|
|
184 |
read_pdf_progress_bar.progress(progress_percentage)
|
185 |
st.session_state.extracted_text = ""
|
186 |
|
187 |
+
manage_temp_to_be_zipped_directory(temp_figure_dir)
|
188 |
+
|
189 |
for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
|
190 |
print("index="+str(index))
|
191 |
|
192 |
figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
193 |
st.session_state.pdf_figures_image_list.append(figures_image_list)
|
194 |
+
if st.session_state.pdf_figures_image_list[index]:
|
195 |
+
for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
|
196 |
+
raw_image_file_name = f"page_{index}_{pdf_figure_text_image[0]}.png"
|
197 |
+
cleaned_image_file_name = clean_filename(raw_image_file_name)
|
198 |
+
Image.fromarray(pdf_figure_text_image[1]).save(temp_figure_dir+cleaned_image_file_name)
|
199 |
+
|
200 |
+
|
201 |
pdf_tables_image_list.append(tables_image_list)
|
202 |
st.session_state.pdf_text_list.append(text)
|
203 |
st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
|
|
|
205 |
# print(text)
|
206 |
progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
|
207 |
read_pdf_progress_bar.progress(progress_percentage)
|
208 |
+
st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
|
209 |
#add_animation_to_image()
|
210 |
#st.session_state['video_generated'] = True
|
211 |
st.rerun()
|
|
|
216 |
data=string_buffer.getvalue(),
|
217 |
file_name=txt_file_path,
|
218 |
mime="text/plain")
|
219 |
+
|
220 |
+
download_figure_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_figures.zip")
|
221 |
+
|
222 |
+
st.download_button(
|
223 |
+
label="Download Figures ZIP",
|
224 |
+
data=st.session_state.figure_zip_bytes,
|
225 |
+
file_name=download_figure_zip_file_name,
|
226 |
+
mime="application/zip"
|
227 |
+
)
|
228 |
+
|
229 |
+
|
230 |
+
|
231 |
# st.image(Image.fromarray(bgr_image))
|
232 |
# for index,pdf_text in enumerate(st.session_state.pdf_text_list):
|
233 |
+
|
234 |
for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
|
235 |
st.write(f"Page {index+1} \n\n {st.session_state.pdf_text_list[index]}\n")
|
236 |
if not st.session_state.pdf_figures_image_list[index]:
|
|
|
241 |
st.image(Image.fromarray(pdf_figure_text_image[1]))
|
242 |
|
243 |
|
244 |
+
|
245 |
+
|
246 |
+
|
247 |
# for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
|
248 |
# print("index="+str(index))
|
249 |
#
|