Spaces:
Sleeping
Sleeping
Tables ZIP file download implemented
Browse files
app.py
CHANGED
@@ -11,6 +11,7 @@ import shutil
|
|
11 |
import zipfile
|
12 |
from io import BytesIO
|
13 |
temp_figure_dir="pdf_figures/"
|
|
|
14 |
import time
|
15 |
# get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
|
16 |
# poppler-utils:
|
@@ -168,6 +169,7 @@ if 'page_count' in st.session_state:
|
|
168 |
st.session_state.color_image_list = []
|
169 |
st.session_state.gray_image_np_list = []
|
170 |
st.session_state.pdf_figures_image_list=[]
|
|
|
171 |
pdf_tables_image_list=[]
|
172 |
st.session_state.pdf_text_list=[]
|
173 |
|
@@ -185,20 +187,24 @@ if 'page_count' in st.session_state:
|
|
185 |
st.session_state.extracted_text = ""
|
186 |
|
187 |
manage_temp_to_be_zipped_directory(temp_figure_dir)
|
188 |
-
|
189 |
for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
|
190 |
print("index="+str(index))
|
191 |
|
192 |
figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
193 |
st.session_state.pdf_figures_image_list.append(figures_image_list)
|
|
|
194 |
if st.session_state.pdf_figures_image_list[index]:
|
195 |
for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
|
196 |
raw_image_file_name = f"page_{index}_{pdf_figure_text_image[0]}.png"
|
197 |
cleaned_image_file_name = clean_filename(raw_image_file_name)
|
198 |
Image.fromarray(pdf_figure_text_image[1]).save(temp_figure_dir+cleaned_image_file_name)
|
|
|
|
|
|
|
|
|
|
|
199 |
|
200 |
-
|
201 |
-
pdf_tables_image_list.append(tables_image_list)
|
202 |
st.session_state.pdf_text_list.append(text)
|
203 |
st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
|
204 |
# st.write(text)
|
@@ -206,6 +212,7 @@ if 'page_count' in st.session_state:
|
|
206 |
progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
|
207 |
read_pdf_progress_bar.progress(progress_percentage)
|
208 |
st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
|
|
|
209 |
#add_animation_to_image()
|
210 |
#st.session_state['video_generated'] = True
|
211 |
st.rerun()
|
@@ -218,7 +225,7 @@ if 'page_count' in st.session_state:
|
|
218 |
mime="text/plain")
|
219 |
|
220 |
download_figure_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_figures.zip")
|
221 |
-
|
222 |
st.download_button(
|
223 |
label="Download Figures ZIP",
|
224 |
data=st.session_state.figure_zip_bytes,
|
@@ -226,7 +233,12 @@ if 'page_count' in st.session_state:
|
|
226 |
mime="application/zip"
|
227 |
)
|
228 |
|
229 |
-
|
|
|
|
|
|
|
|
|
|
|
230 |
|
231 |
# st.image(Image.fromarray(bgr_image))
|
232 |
# for index,pdf_text in enumerate(st.session_state.pdf_text_list):
|
@@ -239,6 +251,12 @@ if 'page_count' in st.session_state:
|
|
239 |
for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
|
240 |
st.write(pdf_figure_text_image[0])
|
241 |
st.image(Image.fromarray(pdf_figure_text_image[1]))
|
|
|
|
|
|
|
|
|
|
|
|
|
242 |
|
243 |
|
244 |
|
|
|
11 |
import zipfile
|
12 |
from io import BytesIO
|
13 |
temp_figure_dir="pdf_figures/"
|
14 |
+
temp_table_dir="pdf_tables/"
|
15 |
import time
|
16 |
# get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
|
17 |
# poppler-utils:
|
|
|
169 |
st.session_state.color_image_list = []
|
170 |
st.session_state.gray_image_np_list = []
|
171 |
st.session_state.pdf_figures_image_list=[]
|
172 |
+
st.session_state.pdf_tables_image_list = []
|
173 |
pdf_tables_image_list=[]
|
174 |
st.session_state.pdf_text_list=[]
|
175 |
|
|
|
187 |
st.session_state.extracted_text = ""
|
188 |
|
189 |
manage_temp_to_be_zipped_directory(temp_figure_dir)
|
190 |
+
manage_temp_to_be_zipped_directory(temp_table_dir)
|
191 |
for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
|
192 |
print("index="+str(index))
|
193 |
|
194 |
figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
195 |
st.session_state.pdf_figures_image_list.append(figures_image_list)
|
196 |
+
st.session_state.pdf_tables_image_list.append(tables_image_list)
|
197 |
if st.session_state.pdf_figures_image_list[index]:
|
198 |
for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
|
199 |
raw_image_file_name = f"page_{index}_{pdf_figure_text_image[0]}.png"
|
200 |
cleaned_image_file_name = clean_filename(raw_image_file_name)
|
201 |
Image.fromarray(pdf_figure_text_image[1]).save(temp_figure_dir+cleaned_image_file_name)
|
202 |
+
if st.session_state.pdf_tables_image_list:
|
203 |
+
for pdf_table_text_image in st.session_state.pdf_tables_image_list[index]:
|
204 |
+
raw_image_file_name = f"page_{index}_{pdf_table_text_image[0]}.png"
|
205 |
+
cleaned_image_file_name = clean_filename(raw_image_file_name)
|
206 |
+
Image.fromarray(pdf_table_text_image[1]).save(temp_table_dir + cleaned_image_file_name)
|
207 |
|
|
|
|
|
208 |
st.session_state.pdf_text_list.append(text)
|
209 |
st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
|
210 |
# st.write(text)
|
|
|
212 |
progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
|
213 |
read_pdf_progress_bar.progress(progress_percentage)
|
214 |
st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
|
215 |
+
st.session_state.table_zip_bytes = zip_directory(temp_table_dir)
|
216 |
#add_animation_to_image()
|
217 |
#st.session_state['video_generated'] = True
|
218 |
st.rerun()
|
|
|
225 |
mime="text/plain")
|
226 |
|
227 |
download_figure_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_figures.zip")
|
228 |
+
download_table_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_tables.zip")
|
229 |
st.download_button(
|
230 |
label="Download Figures ZIP",
|
231 |
data=st.session_state.figure_zip_bytes,
|
|
|
233 |
mime="application/zip"
|
234 |
)
|
235 |
|
236 |
+
st.download_button(
|
237 |
+
label="Download Tables ZIP",
|
238 |
+
data=st.session_state.table_zip_bytes,
|
239 |
+
file_name=download_table_zip_file_name,
|
240 |
+
mime="application/zip"
|
241 |
+
)
|
242 |
|
243 |
# st.image(Image.fromarray(bgr_image))
|
244 |
# for index,pdf_text in enumerate(st.session_state.pdf_text_list):
|
|
|
251 |
for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
|
252 |
st.write(pdf_figure_text_image[0])
|
253 |
st.image(Image.fromarray(pdf_figure_text_image[1]))
|
254 |
+
if not st.session_state.pdf_tables_image_list[index]:
|
255 |
+
st.write("no tables")
|
256 |
+
else:
|
257 |
+
for pdf_table_text_image in st.session_state.pdf_tables_image_list[index]:
|
258 |
+
st.write(pdf_table_text_image[0])
|
259 |
+
st.image(Image.fromarray(pdf_table_text_image[1]))
|
260 |
|
261 |
|
262 |
|