zmbfeng committed on
Commit
7d07000
1 Parent(s): e82c65e

textbox zip file download implemented

Browse files
Files changed (2) hide show
  1. app.py +31 -5
  2. utils.py +36 -34
app.py CHANGED
@@ -12,6 +12,7 @@ import zipfile
12
  from io import BytesIO
13
  temp_figure_dir="pdf_figures/"
14
  temp_table_dir="pdf_tables/"
 
15
  import time
16
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
17
  # poppler-utils:
@@ -170,6 +171,7 @@ if 'page_count' in st.session_state:
170
  st.session_state.gray_image_np_list = []
171
  st.session_state.pdf_figures_image_list=[]
172
  st.session_state.pdf_tables_image_list = []
 
173
  pdf_tables_image_list=[]
174
  st.session_state.pdf_text_list=[]
175
 
@@ -188,22 +190,33 @@ if 'page_count' in st.session_state:
188
 
189
  manage_temp_to_be_zipped_directory(temp_figure_dir)
190
  manage_temp_to_be_zipped_directory(temp_table_dir)
 
191
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
192
- print("index="+str(index))
193
 
194
- figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
 
 
195
  st.session_state.pdf_figures_image_list.append(figures_image_list)
196
  st.session_state.pdf_tables_image_list.append(tables_image_list)
 
197
  if st.session_state.pdf_figures_image_list[index]:
198
  for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
199
- raw_image_file_name = f"page_{index}_{pdf_figure_text_image[0]}.png"
200
  cleaned_image_file_name = clean_filename(raw_image_file_name)
201
  Image.fromarray(pdf_figure_text_image[1]).save(temp_figure_dir+cleaned_image_file_name)
202
  if st.session_state.pdf_tables_image_list:
203
  for pdf_table_text_image in st.session_state.pdf_tables_image_list[index]:
204
- raw_image_file_name = f"page_{index}_{pdf_table_text_image[0]}.png"
205
  cleaned_image_file_name = clean_filename(raw_image_file_name)
206
  Image.fromarray(pdf_table_text_image[1]).save(temp_table_dir + cleaned_image_file_name)
 
 
 
 
 
 
 
 
207
 
208
  st.session_state.pdf_text_list.append(text)
209
  st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
@@ -213,6 +226,7 @@ if 'page_count' in st.session_state:
213
  read_pdf_progress_bar.progress(progress_percentage)
214
  st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
215
  st.session_state.table_zip_bytes = zip_directory(temp_table_dir)
 
216
  #add_animation_to_image()
217
  #st.session_state['video_generated'] = True
218
  st.rerun()
@@ -226,6 +240,7 @@ if 'page_count' in st.session_state:
226
 
227
  download_figure_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_figures.zip")
228
  download_table_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_tables.zip")
 
229
  st.download_button(
230
  label="Download Figures ZIP",
231
  data=st.session_state.figure_zip_bytes,
@@ -239,6 +254,12 @@ if 'page_count' in st.session_state:
239
  file_name=download_table_zip_file_name,
240
  mime="application/zip"
241
  )
 
 
 
 
 
 
242
 
243
  # st.image(Image.fromarray(bgr_image))
244
  # for index,pdf_text in enumerate(st.session_state.pdf_text_list):
@@ -257,7 +278,12 @@ if 'page_count' in st.session_state:
257
  for pdf_table_text_image in st.session_state.pdf_tables_image_list[index]:
258
  st.write(pdf_table_text_image[0])
259
  st.image(Image.fromarray(pdf_table_text_image[1]))
260
-
 
 
 
 
 
261
 
262
 
263
 
 
12
  from io import BytesIO
13
  temp_figure_dir="pdf_figures/"
14
  temp_table_dir="pdf_tables/"
15
+ temp_textbox_dir="pdf_textbox/"
16
  import time
17
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
18
  # poppler-utils:
 
171
  st.session_state.gray_image_np_list = []
172
  st.session_state.pdf_figures_image_list=[]
173
  st.session_state.pdf_tables_image_list = []
174
+ st.session_state.pdf_textbox_image_list=[]
175
  pdf_tables_image_list=[]
176
  st.session_state.pdf_text_list=[]
177
 
 
190
 
191
  manage_temp_to_be_zipped_directory(temp_figure_dir)
192
  manage_temp_to_be_zipped_directory(temp_table_dir)
193
+ manage_temp_to_be_zipped_directory(temp_textbox_dir)
194
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
 
195
 
196
+
197
+ figures_image_list,tables_image_list,textbox_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
198
+ print("index="+str(index)+" txt book " + str(len(textbox_image_list)))
199
  st.session_state.pdf_figures_image_list.append(figures_image_list)
200
  st.session_state.pdf_tables_image_list.append(tables_image_list)
201
+ st.session_state.pdf_textbox_image_list.append(textbox_image_list)
202
  if st.session_state.pdf_figures_image_list[index]:
203
  for pdf_figure_text_image in st.session_state.pdf_figures_image_list[index]:
204
+ raw_image_file_name = f"page_{index+1}_{pdf_figure_text_image[0]}.png"
205
  cleaned_image_file_name = clean_filename(raw_image_file_name)
206
  Image.fromarray(pdf_figure_text_image[1]).save(temp_figure_dir+cleaned_image_file_name)
207
  if st.session_state.pdf_tables_image_list:
208
  for pdf_table_text_image in st.session_state.pdf_tables_image_list[index]:
209
+ raw_image_file_name = f"page_{index+1}_{pdf_table_text_image[0]}.png"
210
  cleaned_image_file_name = clean_filename(raw_image_file_name)
211
  Image.fromarray(pdf_table_text_image[1]).save(temp_table_dir + cleaned_image_file_name)
212
+ if st.session_state.pdf_textbox_image_list:
213
+ textbox_index = 1
214
+ for pdf_textbox_image in st.session_state.pdf_textbox_image_list[index]:
215
+ raw_image_file_name = f"page_{index+1}_textbox_{textbox_index}.png"
216
+ cleaned_image_file_name = clean_filename(raw_image_file_name)
217
+ Image.fromarray(pdf_textbox_image).save(temp_textbox_dir + cleaned_image_file_name)
218
+ textbox_index = textbox_index + 1
219
+
220
 
221
  st.session_state.pdf_text_list.append(text)
222
  st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
 
226
  read_pdf_progress_bar.progress(progress_percentage)
227
  st.session_state.figure_zip_bytes=zip_directory(temp_figure_dir)
228
  st.session_state.table_zip_bytes = zip_directory(temp_table_dir)
229
+ st.session_state.textbox_zip_bytes = zip_directory(temp_textbox_dir)
230
  #add_animation_to_image()
231
  #st.session_state['video_generated'] = True
232
  st.rerun()
 
240
 
241
  download_figure_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_figures.zip")
242
  download_table_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_tables.zip")
243
+ download_textbox_zip_file_name = uploaded_locked_pdf_file.name.replace(".pdf", "_textbox.zip")
244
  st.download_button(
245
  label="Download Figures ZIP",
246
  data=st.session_state.figure_zip_bytes,
 
254
  file_name=download_table_zip_file_name,
255
  mime="application/zip"
256
  )
257
+ st.download_button(
258
+ label="Download Textbox ZIP",
259
+ data=st.session_state.textbox_zip_bytes,
260
+ file_name=download_textbox_zip_file_name,
261
+ mime="application/zip"
262
+ )
263
 
264
  # st.image(Image.fromarray(bgr_image))
265
  # for index,pdf_text in enumerate(st.session_state.pdf_text_list):
 
278
  for pdf_table_text_image in st.session_state.pdf_tables_image_list[index]:
279
  st.write(pdf_table_text_image[0])
280
  st.image(Image.fromarray(pdf_table_text_image[1]))
281
+ if not st.session_state.pdf_textbox_image_list[index]:
282
+ st.write("no textbox")
283
+ else:
284
+ for text_box_index,pdf_textbox_image in enumerate(st.session_state.pdf_textbox_image_list[index]):
285
+ st.write("text box "+str(text_box_index))
286
+ st.image(Image.fromarray(pdf_textbox_image))
287
 
288
 
289
 
utils.py CHANGED
@@ -87,8 +87,8 @@ def draw_colored_boxes_on_image_np(image, boxes_list,color_tuple):
87
  def is_filled_rectangle(image, rect, background_threshold=10, variance_threshold=0.1):
88
 
89
  x, y, w, h = rect
90
- roi = image[y+1:y+h-1, x+1:x+w-1]
91
-
92
  return np.all(roi == 0)
93
  def get_below_box(image_np, x, y,width,step=15):
94
  #print("x,y,width="+str(x)+","+str(y)+","+str(width))
@@ -140,7 +140,7 @@ def is_note_rectangle(image_np, rect):
140
  text = pytesseract.image_to_string(roi_converted)
141
  text = text.strip()
142
  note_str="note"
143
- print("is note text box="+str(text.lower().startswith(note_str.lower())))
144
  return text.lower().startswith(note_str.lower())
145
  def extract_bounding_boxes_from_image_np(image_np, bounding_boxes_list, above_check_offset, above_caption_offset, color_tuple):
146
 
@@ -183,15 +183,17 @@ def extract_bounding_boxes_from_image_np(image_np, bounding_boxes_list, above_ch
183
  figure_str ="Figure"
184
  table_str ="Table"
185
  if text.lower().startswith(figure_str.lower()):
186
- print(text)
187
  figures_image_list.append((text,rect_content))
188
 
189
  elif text.lower().startswith(table_str.lower()):
190
- print(text)
191
  tables_image_list.append((text,rect_content))
192
  else:
 
193
  above_rect_content_list.append((text, rect_content))
194
- rect_content_list.append(rect_content)
 
195
 
196
  cv2.rectangle(image_np_copy, (x, above_box_y), (x+w, y), color_tuple, cv2.FILLED)
197
  # above_rect_content = image_np[y-above_check_offset:y, x:x+w]
@@ -284,23 +286,23 @@ def extract_two_columns_text(image_index,image_np,debug):
284
  right_column_array_bgr_image = cv2.cvtColor(right_column_array, cv2.COLOR_GRAY2BGR)
285
  draw_edges(right_column_array_bgr_image)
286
  # imageio.imwrite("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step8_right_column.png", right_column_img)
287
- if debug:
288
- print("left column image start")
289
- # display(left_column_img)
290
- # st.image(Image.fromarray(left_column_array_bgr_image)) # to_be_displayed
291
- print("left column image end")
292
- print("right column image start")
293
- # display(right_column_img)
294
- # st.image(Image.fromarray(right_column_array_bgr_image)) # to_be_displayed
295
- print("right column image end")
296
  left_text = pytesseract.image_to_string(left_column_img)
297
  # with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_left_column_text.txt", 'w') as file:
298
  # file.write(left_text)
299
- print("Extracted Text:\n", left_text)
300
  right_text = pytesseract.image_to_string(right_column_img)
301
  # with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_right_column_text.txt", 'w') as file:
302
  # file.write(right_text)
303
- print("Extracted Text:\n", right_text)
304
  return left_text + right_text
305
  else:
306
  return "error"
@@ -331,20 +333,20 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
331
  if debug:
332
  debug_text_box_index = 0
333
  for text_box, above_text_box in zip(text_box_list, above_test_box_list):
334
- print("text box start")
335
- if above_text_box is not None:
336
- print(above_text_box[0])#to_be_displayed
337
  # st.write(above_text_box[0])#to_be_displayed
338
  # st.image(Image.fromarray(above_text_box[1]))#to_be_displayed
339
  # st.write(text)
340
  # st.image(Image.fromarray(text_box))#to_be_displayed
341
  debug_text_box_index = debug_text_box_index + 1
342
- for figure in figures_image_list:
343
- print(figure[0])
344
  # st.write(figure[0])#to_be_displayed
345
  # st.image(Image.fromarray(figure[1]))#to_be_displayed
346
- for table in tables_image_list:
347
- print(table[0])
348
  # st.write(table[0])#to_be_displayed
349
  # st.image(Image.fromarray(table[1]))#to_be_displayed
350
  # st.image(Image.fromarray(cropped_image))#to_be_displayed
@@ -352,21 +354,21 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
352
  if found_hor_lines_list is not None:
353
  bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
354
  draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
355
- print("detected Lines start")
356
- # st.image(Image.fromarray(bgr_image)) #to_be_displayed
357
-
358
- print("detected lines end")
359
  page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
360
  if debug:
361
  debug_page_segment_index = 0
362
  for element in page_segment_np_list:
363
- print("element start")
364
  bgr_image = cv2.cvtColor(element, cv2.COLOR_GRAY2BGR)
365
  draw_edges(bgr_image)
366
  # st.image(Image.fromarray(bgr_image))#to_be_displayed
367
 
368
  debug_page_segment_index = debug_page_segment_index + 1
369
- print("element end")
370
  min_height_filtered_page_segment_np_list = filter_segments_by_min_height(page_segment_np_list, 50)
371
  max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
372
  else:
@@ -375,9 +377,9 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
375
  # print("max height image start")
376
  # st.image(Image.fromarray(max_height_image))#to_be_displayed
377
  # print("max height image end")
378
- print("start text extraction")
379
  text=extract_two_columns_text(image_index,max_height_image,debug)
380
- print("gray_pdf_image_np_to_text extracted text",text)
381
  if text == "error":
382
  print("not two columns")
383
  max_height_image_converted = Image.fromarray(cv2.cvtColor(max_height_image, cv2.COLOR_BGR2RGB))
@@ -420,6 +422,6 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
420
  if text == "error":
421
  return("error")
422
  else:
423
- return figures_image_list,tables_image_list,text
424
  else:
425
- return figures_image_list,tables_image_list,text
 
87
  def is_filled_rectangle(image, rect, background_threshold=10, variance_threshold=0.1):
88
 
89
  x, y, w, h = rect
90
+ # roi = image[y+1:y+h-1, x+1:x+w-1]
91
+ roi = image[y + 10:y + h - 10, x + 10:x + w - 10]
92
  return np.all(roi == 0)
93
  def get_below_box(image_np, x, y,width,step=15):
94
  #print("x,y,width="+str(x)+","+str(y)+","+str(width))
 
140
  text = pytesseract.image_to_string(roi_converted)
141
  text = text.strip()
142
  note_str="note"
143
+ # print("is note text box="+str(text.lower().startswith(note_str.lower())))
144
  return text.lower().startswith(note_str.lower())
145
  def extract_bounding_boxes_from_image_np(image_np, bounding_boxes_list, above_check_offset, above_caption_offset, color_tuple):
146
 
 
183
  figure_str ="Figure"
184
  table_str ="Table"
185
  if text.lower().startswith(figure_str.lower()):
186
+ # print(text)
187
  figures_image_list.append((text,rect_content))
188
 
189
  elif text.lower().startswith(table_str.lower()):
190
+ # print(text)
191
  tables_image_list.append((text,rect_content))
192
  else:
193
+ print("*** text box have above content")
194
  above_rect_content_list.append((text, rect_content))
195
+ rect_content_list.append(rect_content)
196
+
197
 
198
  cv2.rectangle(image_np_copy, (x, above_box_y), (x+w, y), color_tuple, cv2.FILLED)
199
  # above_rect_content = image_np[y-above_check_offset:y, x:x+w]
 
286
  right_column_array_bgr_image = cv2.cvtColor(right_column_array, cv2.COLOR_GRAY2BGR)
287
  draw_edges(right_column_array_bgr_image)
288
  # imageio.imwrite("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step8_right_column.png", right_column_img)
289
+ # if debug:
290
+ # print("left column image start")
291
+ # # display(left_column_img)
292
+ # # st.image(Image.fromarray(left_column_array_bgr_image)) # to_be_displayed
293
+ # print("left column image end")
294
+ # print("right column image start")
295
+ # # display(right_column_img)
296
+ # # st.image(Image.fromarray(right_column_array_bgr_image)) # to_be_displayed
297
+ # print("right column image end")
298
  left_text = pytesseract.image_to_string(left_column_img)
299
  # with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_left_column_text.txt", 'w') as file:
300
  # file.write(left_text)
301
+ # print("Extracted Text:\n", left_text)
302
  right_text = pytesseract.image_to_string(right_column_img)
303
  # with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_right_column_text.txt", 'w') as file:
304
  # file.write(right_text)
305
+ # print("Extracted Text:\n", right_text)
306
  return left_text + right_text
307
  else:
308
  return "error"
 
333
  if debug:
334
  debug_text_box_index = 0
335
  for text_box, above_text_box in zip(text_box_list, above_test_box_list):
336
+ # print("text box start")
337
+ # if above_text_box is not None:
338
+ # print(above_text_box[0])#to_be_displayed
339
  # st.write(above_text_box[0])#to_be_displayed
340
  # st.image(Image.fromarray(above_text_box[1]))#to_be_displayed
341
  # st.write(text)
342
  # st.image(Image.fromarray(text_box))#to_be_displayed
343
  debug_text_box_index = debug_text_box_index + 1
344
+ # for figure in figures_image_list:
345
+ # print(figure[0])
346
  # st.write(figure[0])#to_be_displayed
347
  # st.image(Image.fromarray(figure[1]))#to_be_displayed
348
+ # for table in tables_image_list:
349
+ # print(table[0])
350
  # st.write(table[0])#to_be_displayed
351
  # st.image(Image.fromarray(table[1]))#to_be_displayed
352
  # st.image(Image.fromarray(cropped_image))#to_be_displayed
 
354
  if found_hor_lines_list is not None:
355
  bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
356
  draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
357
+ # print("detected Lines start")
358
+ # # st.image(Image.fromarray(bgr_image)) #to_be_displayed
359
+ #
360
+ # print("detected lines end")
361
  page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
362
  if debug:
363
  debug_page_segment_index = 0
364
  for element in page_segment_np_list:
365
+ # print("element start")
366
  bgr_image = cv2.cvtColor(element, cv2.COLOR_GRAY2BGR)
367
  draw_edges(bgr_image)
368
  # st.image(Image.fromarray(bgr_image))#to_be_displayed
369
 
370
  debug_page_segment_index = debug_page_segment_index + 1
371
+ # print("element end")
372
  min_height_filtered_page_segment_np_list = filter_segments_by_min_height(page_segment_np_list, 50)
373
  max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
374
  else:
 
377
  # print("max height image start")
378
  # st.image(Image.fromarray(max_height_image))#to_be_displayed
379
  # print("max height image end")
380
+ # print("start text extraction")
381
  text=extract_two_columns_text(image_index,max_height_image,debug)
382
+ # print("gray_pdf_image_np_to_text extracted text",text)
383
  if text == "error":
384
  print("not two columns")
385
  max_height_image_converted = Image.fromarray(cv2.cvtColor(max_height_image, cv2.COLOR_BGR2RGB))
 
422
  if text == "error":
423
  return("error")
424
  else:
425
+ return figures_image_list,tables_image_list,text_box_list,text
426
  else:
427
+ return figures_image_list,tables_image_list,text_box_list,text