zmbfeng committed on
Commit
229ebda
1 Parent(s): c6269e3

end to end working

Browse files
Files changed (2) hide show
  1. app.py +1 -0
  2. utils.py +113 -9
app.py CHANGED
@@ -52,6 +52,7 @@ for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:
52
  print("index="+str(index))
53
 
54
  text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
 
55
  #if 'img_index' not in st.session_state:
56
 
57
  # if st.button("Stop"):
 
52
  print("index="+str(index))
53
 
54
  text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
55
+ st.write(text)
56
  #if 'img_index' not in st.session_state:
57
 
58
  # if st.button("Stop"):
utils.py CHANGED
@@ -249,14 +249,72 @@ def draw_edges(np_image):
249
  thickness = 5
250
 
251
  # Get the dimensions of the image
252
- height, width = np_image.shape[:2]
 
 
 
253
 
254
  # Coordinates for the rectangle: start from (0,0) to (width, height)
255
  # We draw from 0+thickness//2 and width-thickness//2 to respect the thickness and not go out of bounds
256
  cv2.rectangle(np_image, (thickness // 2, thickness // 2), (width - thickness // 2, height - thickness // 2), color,
257
  thickness)
258
-
259
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
261
  bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
262
  bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
@@ -295,7 +353,7 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
295
  bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
296
  draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
297
  print("detected Lines start")
298
- st.image(Image.fromarray(bgr_image)) #to_be_displayed
299
 
300
  print("detected lines end")
301
  page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
@@ -305,16 +363,62 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
305
  print("element start")
306
  bgr_image = cv2.cvtColor(element, cv2.COLOR_GRAY2BGR)
307
  draw_edges(bgr_image)
308
- st.image(Image.fromarray(bgr_image))#to_be_displayed
309
 
310
  debug_page_segment_index = debug_page_segment_index + 1
311
  print("element end")
312
  min_height_filtered_page_segment_np_list = filter_segments_by_min_height(page_segment_np_list, 50)
313
  max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
314
- if debug:
315
- print("max height image start")
316
- st.image(Image.fromarray(max_height_image))#to_be_displayed
317
- print("max height image end")
318
  else:
319
  max_height_image = cropped_image.copy()
320
  st.write("selected segment")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  thickness = 5
250
 
251
  # Get the dimensions of the image
252
+ try:
253
+ height, width = np_image.shape[:2]
254
+ except Exception as e:
255
+ print("An error occurred:", e)
256
 
257
  # Coordinates for the rectangle: start from (0,0) to (width, height)
258
  # We draw from 0+thickness//2 and width-thickness//2 to respect the thickness and not go out of bounds
259
  cv2.rectangle(np_image, (thickness // 2, thickness // 2), (width - thickness // 2, height - thickness // 2), color,
260
  thickness)
261
def is_image_np_two_columns(image_np, horizontal_margin, vertical_margin):
    """Report whether the vertical strip at the page centre is pure white.

    The inspected strip spans ``horizontal_margin`` pixels on either side of
    the horizontal centre and leaves out ``vertical_margin`` rows at the top
    and at the bottom. A strip in which every value equals 255 is treated as
    the gutter between two text columns.

    Args:
        image_np: Page image as a NumPy array indexed (row, column, ...).
            NOTE(review): callers appear to pass 8-bit grayscale pages where
            255 is white -- confirm.
        horizontal_margin: Half-width of the inspected strip, in pixels.
        vertical_margin: Number of rows ignored at the top and at the bottom.

    Returns:
        bool: True when every inspected value equals 255. A strip with no
        pixels at all (image smaller than the margins) also yields True,
        because ``np.all`` of an empty array is True.
    """
    page_height, page_width = image_np.shape[:2]
    page_x_center = page_width // 2

    # Clamp the left edge at 0: a negative slice start would be interpreted
    # by NumPy as "count from the right edge" and silently select the wrong
    # columns on pages narrower than twice the margin.
    strip_left = max(page_x_center - horizontal_margin, 0)
    strip_right = page_x_center + horizontal_margin

    image_middle_np = image_np[
        vertical_margin:page_height - vertical_margin,
        strip_left:strip_right,
    ]
    return bool(np.all(image_middle_np == 255))
267
def extract_two_columns_text(image_index, image_np, debug):
    """OCR a two-column page image, left column first.

    The page is split at its horizontal centre and each half is passed to
    Tesseract separately so the columns are not read across the gutter.

    Args:
        image_index: Index of the page being processed. Currently unused;
            kept so existing callers keep working.
        image_np: Page image as a NumPy array, either 2-D grayscale or
            3-channel BGR.
        debug: When truthy, each column is shown with ``st.image`` (framed
            by ``draw_edges``) and start/end markers are printed.

    Returns:
        str: OCR text of the left column followed by the right column, or
        the sentinel string ``"error"`` when ``is_image_np_two_columns``
        does not recognise the page as two-column.
    """
    if not is_image_np_two_columns(image_np, 20, 10):
        return "error"

    page_x_center = image_np.shape[1] // 2
    # Copy the halves so nothing below can alias or modify the caller's image.
    columns = (
        ("left", image_np[:, :page_x_center].copy()),
        ("right", image_np[:, page_x_center:].copy()),
    )

    if debug:
        for side, column_np in columns:
            # draw_edges needs a colour image; GRAY2BGR only accepts
            # single-channel input, so convert only when the page is 2-D.
            if column_np.ndim == 2:
                preview = cv2.cvtColor(column_np, cv2.COLOR_GRAY2BGR)
            else:
                preview = column_np.copy()
            draw_edges(preview)
            print(f"{side} column image start")
            st.image(Image.fromarray(preview))  # to_be_displayed
            print(f"{side} column image end")

    column_texts = []
    for _, column_np in columns:
        # BGR2RGB rejects single-channel input, so a grayscale column is
        # handed to PIL as-is and only colour input is channel-swapped.
        if column_np.ndim == 2:
            column_img = Image.fromarray(column_np)
        else:
            column_img = Image.fromarray(
                cv2.cvtColor(column_np, cv2.COLOR_BGR2RGB)
            )
        column_text = pytesseract.image_to_string(column_img)
        print("Extracted Text:\n", column_text)
        column_texts.append(column_text)

    return "".join(column_texts)
307
def get_where_image_np_two_columns_stops(image_np, horizontal_margin, vertical_margin):
    """Locate every non-white value inside the centre strip of a page.

    The strip is the same region inspected by ``is_image_np_two_columns``:
    ``horizontal_margin`` pixels on either side of the horizontal centre,
    without the first and last ``vertical_margin`` rows.

    Args:
        image_np: Page image as a NumPy array indexed (row, column, ...).
        horizontal_margin: Half-width of the inspected strip, in pixels.
        vertical_margin: Number of rows ignored at the top and at the bottom.

    Returns:
        tuple of ndarray: The result of ``np.where(strip != 255)``, one index
        array per dimension. Indices are relative to the strip, not to
        ``image_np``: row 0 is image row ``vertical_margin`` and column 0 is
        the strip's left edge. The arrays are empty when the strip is
        entirely 255, so check the size before indexing into them.
    """
    page_height, page_width = image_np.shape[:2]
    page_x_center = page_width // 2

    # Clamp the left edge at 0: a negative slice start would wrap around to
    # the right side of the image and return indices for the wrong columns.
    strip_left = max(page_x_center - horizontal_margin, 0)
    strip_right = page_x_center + horizontal_margin

    image_middle_np = image_np[
        vertical_margin:page_height - vertical_margin,
        strip_left:strip_right,
    ]
    return np.where(image_middle_np != 255)
313
+
314
+ # indices = np.where(image_middle_np != 255)
315
+ # print(len(indices[0]))
316
+ # for i in range(len(indices[0])):
317
+ # print(f"Index: {indices[0][i], indices[1][i]}, Value: {image_middle_np[indices[0][i], indices[1][i]]}")
318
  def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
319
  bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
320
  bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
 
353
  bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
354
  draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
355
  print("detected Lines start")
356
+ # st.image(Image.fromarray(bgr_image)) #to_be_displayed
357
 
358
  print("detected lines end")
359
  page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
 
363
  print("element start")
364
  bgr_image = cv2.cvtColor(element, cv2.COLOR_GRAY2BGR)
365
  draw_edges(bgr_image)
366
+ # st.image(Image.fromarray(bgr_image))#to_be_displayed
367
 
368
  debug_page_segment_index = debug_page_segment_index + 1
369
  print("element end")
370
  min_height_filtered_page_segment_np_list = filter_segments_by_min_height(page_segment_np_list, 50)
371
  max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
 
 
 
 
372
  else:
373
  max_height_image = cropped_image.copy()
374
  st.write("selected segment")
375
+ # print("max height image start")
376
+ # st.image(Image.fromarray(max_height_image))#to_be_displayed
377
+ # print("max height image end")
378
+ text=extract_two_columns_text(image_index,max_height_image,debug)
379
+ print(text)
380
+ if text == "error":
381
+ print("not two columns")
382
+ max_height_image_converted = Image.fromarray(cv2.cvtColor(max_height_image, cv2.COLOR_BGR2RGB))
383
+ text = pytesseract.image_to_string(max_height_image_converted)
384
+ text = text.strip()
385
+ toc_str="table of contents"
386
+ # print("Extracted Text:\n", text)
387
+ if text.lower().startswith(toc_str.lower()):
388
+
389
+ #if "Table of Contents" in text:
390
+ print("Table of Contents")
391
+ # display_image_np(max_height_image)
392
+ #print(text)
393
+ return("Table of Contents")
394
+ else:
395
+ print("not Table of Contents")
396
+ indeces_stop=get_where_image_np_two_columns_stops(max_height_image,20,10)
397
+ print(indeces_stop[0][0])
398
+ print(max_height_image.shape[0])
399
+ y_start=get_above_box(max_height_image, 0, indeces_stop[0][0],max_height_image.shape[1])
400
+ if debug:
401
+ bgr_image = cv2.cvtColor(max_height_image, cv2.COLOR_GRAY2BGR)
402
+ color_tuple=(0, 255, 0)
403
+ cv2.rectangle(bgr_image, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), color_tuple, thickness=5)
404
+ print("still in the middle start")
405
+ st.image(Image.fromarray(bgr_image))
406
+ print("still in the middle end")
407
+ left_over_content =max_height_image[y_start:max_height_image.shape[0], 0:max_height_image.shape[1]]
408
+ if debug:
409
+ print("left over start")
410
+ st.image(Image.fromarray(left_over_content))
411
+ print("left over end")
412
+ max_height_image_copy=max_height_image.copy()
413
+ cv2.rectangle(max_height_image_copy, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), (255, 255, 255), cv2.FILLED)
414
+ if debug:
415
+ print("no left over start")
416
+ st.image(Image.fromarray(max_height_image_copy))
417
+ print("no left over end")
418
+ text=extract_two_columns_text(max_height_image_copy,debug)
419
+ if text == "error":
420
+ return("error")
421
+ else:
422
+ return text
423
+ else:
424
+ return text