zmbfeng committed on
Commit
6f01943
1 Parent(s): 228cdc2

temp save image to add lines to complete table borders

Browse files
Files changed (2) hide show
  1. app.py +17 -3
  2. utils.py +2 -0
app.py CHANGED
@@ -161,7 +161,7 @@ if 'page_count' in st.session_state:
161
  st.session_state.num_pages_to_extract = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=st.session_state.num_pages_to_extract , key='num_pages_to_extract_slider')
162
 
163
  if 'num_pages_to_extract2'not in st.session_state:
164
- st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=36, key='num_pages_to_extract_slider2')
165
  else:
166
  st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=st.session_state.num_pages_to_extract2 , key='num_pages_to_extract_slider2')
167
 
@@ -191,8 +191,23 @@ if 'page_count' in st.session_state:
191
  read_pdf_progress_bar.progress(progress_percentage)
192
  read_pdf_progress_bar.progress(0)
193
  for index, image in enumerate(st.session_state.color_image_list):
 
194
  image_np = np.array(image)
195
- st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  progress_percentage = (index) / len(st.session_state.color_image_list)
197
  read_pdf_progress_bar.progress(progress_percentage)
198
  st.session_state.extracted_text = ""
@@ -201,7 +216,6 @@ if 'page_count' in st.session_state:
201
  manage_temp_to_be_zipped_directory(temp_table_dir)
202
  manage_temp_to_be_zipped_directory(temp_textbox_dir)
203
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
204
-
205
  try:
206
  figures_image_list,tables_image_list,textbox_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
207
  if textbox_image_list:
 
161
  st.session_state.num_pages_to_extract = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=st.session_state.num_pages_to_extract , key='num_pages_to_extract_slider')
162
 
163
  if 'num_pages_to_extract2'not in st.session_state:
164
+ st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=39, key='num_pages_to_extract_slider2')
165
  else:
166
  st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=st.session_state.num_pages_to_extract2 , key='num_pages_to_extract_slider2')
167
 
 
191
  read_pdf_progress_bar.progress(progress_percentage)
192
  read_pdf_progress_bar.progress(0)
193
  for index, image in enumerate(st.session_state.color_image_list):
194
+ st.write("actual page = " + str(index + st.session_state.num_pages_to_extract))
195
  image_np = np.array(image)
196
+ gray_image_np=cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY)
197
+ cv2.imwrite(f"gray_image_{index}.png", gray_image_np)
198
+ # st.image(Image.fromarray(gray_image_np))
199
+ if index + st.session_state.num_pages_to_extract == 34:
200
+ cv2.line(gray_image_np, (223, 414), (223, 1185), 0, 2)
201
+ cv2.line(gray_image_np, (1527, 414), (1527, 1185), 0, 2)
202
+ if index + st.session_state.num_pages_to_extract == 35:
203
+ cv2.line(gray_image_np, (176, 248), (176, 1760), 0, 2)
204
+ cv2.line(gray_image_np, (1551, 248), (1551, 1760), 0, 2)
205
+ st.image(Image.fromarray(gray_image_np))
206
+ st.session_state.gray_image_np_list.append(gray_image_np)
207
+
208
+
209
+ # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
210
+ # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
211
  progress_percentage = (index) / len(st.session_state.color_image_list)
212
  read_pdf_progress_bar.progress(progress_percentage)
213
  st.session_state.extracted_text = ""
 
216
  manage_temp_to_be_zipped_directory(temp_table_dir)
217
  manage_temp_to_be_zipped_directory(temp_textbox_dir)
218
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
 
219
  try:
220
  figures_image_list,tables_image_list,textbox_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
221
  if textbox_image_list:
utils.py CHANGED
@@ -324,6 +324,8 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
324
  if debug:
325
  bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
326
  color_tuple = (0, 255, 0)
 
 
327
  draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, color_tuple)
328
  # st.image(Image.fromarray(bgr_image)) #to_be_displayed
329
 
 
324
  if debug:
325
  bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
326
  color_tuple = (0, 255, 0)
327
+ # print("bounding_boxes_list")
328
+ # print(bounding_boxes_list)
329
  draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, color_tuple)
330
  # st.image(Image.fromarray(bgr_image)) #to_be_displayed
331