Spaces:

zmbfeng
/

locked_pdf_ingestion

Sleeping

App Files Files Community

zmbfeng committed on May 25, 2024

Commit

15cd602

1 Parent(s): d6482c4

Extract figures, tables, and other text boxes: the text above each box is recognized with OCR, and the text below each box is included in the cropped block image

Browse files

Files changed (3) hide show

app.py +1 -0
packages.txt +2 -1
utils.py +147 -1

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import time
 # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
 # poppler-utils:
 #   Installed: 22.02.0-2ubuntu0.4
 #page extraction disabled
 big_text = """
     <div style='text-align: center;'>

 # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
 # poppler-utils:
 #   Installed: 22.02.0-2ubuntu0.4
+# install Tesseract OCR; Windows installers: https://github.com/UB-Mannheim/tesseract/wiki
 #page extraction disabled
 big_text = """
     <div style='text-align: center;'>

packages.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- poppler-utils


1	+ poppler-utils
2	+ tesseract-ocr

utils.py CHANGED Viewed

@@ -3,6 +3,7 @@ import streamlit as st
 import cv2
 import numpy as np
 from PIL import Image
 def get_pdf_page_count(pdf_path):
     try:
         # Running pdfinfo command to get information about the PDF
@@ -82,6 +83,131 @@ def draw_colored_boxes_on_image_np(image, boxes_list,color_tuple):
   for x, y, w, h in boxes_list:
       #x, y, w, h = box[0]
       cv2.rectangle(image, (x, y), (x + w, y + h), color_tuple, thickness=5)
 def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
     bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
     bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
@@ -90,4 +216,24 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
         bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
         color_tuple = (0, 255, 0)
         draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, color_tuple)
-        st.image(Image.fromarray(bgr_image))

 import cv2
 import numpy as np
 from PIL import Image
+import pytesseract
 def get_pdf_page_count(pdf_path):
     try:
         # Running pdfinfo command to get information about the PDF
   for x, y, w, h in boxes_list:
       #x, y, w, h = box[0]
       cv2.rectangle(image, (x, y), (x + w, y + h), color_tuple, thickness=5)
+def is_filled_rectangle(image, rect, background_threshold=10, variance_threshold=0.1):  # Report whether the interior of `rect` in `image` is entirely 0-valued; the two threshold parameters are not read in this body.
+    x, y, w, h = rect  # rect is unpacked as (x, y, width, height)
+    roi = image[y+1:y+h-1, x+1:x+w-1]  # interior only: one pixel is trimmed from every side of the rectangle
+    return np.all(roi == 0)  # NOTE(review): np.all over an empty slice is True, so a rect with w <= 2 or h <= 2 is reported as filled - confirm intended
+def get_below_box(image_np, x, y,width,step=15):  # Scan downward from row y, in strips `step` rows tall over columns x..x+width, and return the top row of the first strip whose pixels are all 255.
+  #print("x,y,width="+str(x)+","+str(y)+","+str(width))
+  index_y = -1  # -1 is returned when there is no room for even one full strip below y
+  #print("get_below_box"+str(image_np.shape))
+  if y+step < image_np.shape[0]:  # shape[0] is the number of rows in the image
+    index_y = y
+    while index_y+step < image_np.shape[0]:  # keep going while a full strip still fits inside the image
+      #print(str( np.all(image_np[index_y:index_y+step,x:x+width] == 255)))
+      # image_np_copy = image_np.copy()
+      # bgr_image = cv2.cvtColor(image_np_copy, cv2.COLOR_GRAY2BGR)
+      # cv2.rectangle(bgr_image, (x, index_y), (x + width, index_y +step), color_tuple, thickness=5)
+      # display_image_np(bgr_image)
+      if np.all(image_np[index_y:index_y+step,x:x+width] == 255):  # every pixel of this strip equals 255: stop at its top row
+        # index_y += step
+        break
+      index_y += step  # strip holds at least one non-255 pixel: move one strip down
+  return index_y  # NOTE(review): if no all-255 strip is found, this is the row where the scan ran out of image, not -1 - confirm callers expect that
+def get_above_box(image_np, x, y,width,step=15):  # Scan upward from row y, in strips `step` rows tall over columns x..x+width, and return the bottom edge (index_y) of the first strip whose pixels are all 255.
+  #print("x,y,width="+str(x)+","+str(y)+","+str(width))
+  index_y = -1  # -1 is returned when there is no room for even one full strip above y
+  #print("get_below_box"+str(image_np.shape))
+  if y-step > 0:  # a strip ending at y must start strictly after row 0 to be examined
+    index_y = y
+    while index_y-step > 0:  # keep going while a full strip still fits above index_y
+      #print(str( np.all(image_np[index_y:index_y+step,x:x+width] == 255)))
+      # image_np_copy = image_np.copy()
+      # bgr_image = cv2.cvtColor(image_np_copy, cv2.COLOR_GRAY2BGR)
+      # color_tuple=(0, 255, 0)
+      # cv2.rectangle(bgr_image, (x, index_y-step), (x + width, index_y), color_tuple, thickness=5)
+      # display_image_np(bgr_image)
+      if np.all(image_np[index_y-step:index_y,x:x+width] == 255):  # every pixel of the strip just above index_y equals 255: stop here
+        # index_y += step
+        break
+      index_y -= step  # strip holds at least one non-255 pixel: move one strip up
+  return index_y  # equals y when the strip directly above is all 255; NOTE(review): if no all-255 strip is found this is the row where the scan stopped near the top, not -1
+def is_note_rectangle(image_np, rect):  # OCR the interior of `rect` and report whether the recognized text starts with "note" (case-insensitive).
+  x, y, w, h = rect  # rect is unpacked as (x, y, width, height)
+  roi = image_np[y+1:y+h-1, x+1:x+w-1]  # interior only: one pixel is trimmed from every side of the rectangle
+  roi_converted = Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))  # NOTE(review): BGR2RGB presumably needs a 3-channel input, yet the visible caller also converts the same page image with COLOR_GRAY2BGR - confirm the channel count
+  text = pytesseract.image_to_string(roi_converted)  # run Tesseract OCR on the cropped region
+  text = text.strip()  # drop surrounding whitespace before the prefix test
+  note_str="note"
+  print("is note text box="+str(text.lower().startswith(note_str.lower())))  # debug trace written to stdout on every call
+  return text.lower().startswith(note_str.lower())
+def extract_bounding_boxes_from_image_np(image_np, bounding_boxes_list, above_check_offset, above_caption_offset, color_tuple):  # Crop every non-filled box (plus the text found below it), OCR the text above it to sort it into figures / tables / other, and paint the consumed regions with color_tuple on a copy of the page; the two *_offset parameters are referenced only in commented-out code.
+  image_np_copy=image_np.copy()  # regions are painted on this copy; crops are always taken from the untouched image_np
+  rect_content_list=[]  # one crop per processed box
+  above_rect_content_list=[]  # None or a (text, crop) tuple, appended only in the branches noted below
+  figures_image_list=[]  # (caption text, crop) for captions starting with "Figure"
+  tables_image_list=[]  # (caption text, crop) for captions starting with "Table"
+  index = 0  # incremented per processed box but never read in this function
+  for box in bounding_boxes_list:
+    x, y, w, h = box  # each box is unpacked as (x, y, width, height)
+    if not is_filled_rectangle(image_np_copy, box):  # checked on the painted copy; NOTE(review): is_filled_rectangle tests for 0 while the visible caller passes (255, 255, 255) as color_tuple - confirm regions painted here can ever register as filled
+      # print("box="+str(box)+"not filled")
+      y_index= get_below_box(image_np, x, y+h,w)  # search below the box's bottom edge for the first all-255 strip
+      if y_index == -1 or is_note_rectangle(image_np_copy, box):  # no room below the box, or the box text starts with "note": keep just the box itself
+        # print("below text not found")
+        rect_content =image_np[y:y+h, x:x+w]
+        # rect_content_list.append(rect_content)
+        cv2.rectangle(image_np_copy, (x, y), (x+w, y+h), color_tuple, cv2.FILLED)
+      else:
+        # print("below text found")
+        rect_content =image_np[y:y_index, x:x+w]  # crop runs from the box top down to the row returned by get_below_box
+        # rect_content_list.append(rect_content)
+        cv2.rectangle(image_np_copy, (x, y), (x+w, y_index), color_tuple, cv2.FILLED)
+      cv2.rectangle(image_np_copy, (x, y), (x+w, y+h), color_tuple, cv2.FILLED)  # NOTE(review): looks redundant - both branches above have already painted a region covering the box
+      above_box_y= get_above_box(image_np, x, y,w)  # search above the box's top edge for the first all-255 strip
+      if above_box_y == -1 or above_box_y == y:  # no room above, or the strip directly above is all 255: treat as no text above
+        # print("box="+str(box)+"no above box")
+        above_rect_content_list.append(None)
+        rect_content_list.append(rect_content)
+      else:
+        # print("box="+str(box)+"above box exist")
+        above_rect_content = image_np[above_box_y:y, x:x+w]  # region between the all-255 strip and the box top
+        # above_rect_content_list.append(above_rect_content)
+        above_converted = Image.fromarray(cv2.cvtColor(above_rect_content, cv2.COLOR_BGR2RGB))  # NOTE(review): same channel-count question as in is_note_rectangle - confirm image_np is 3-channel here
+        text = pytesseract.image_to_string(above_converted)  # OCR the region above the box
+        text = text.strip()
+        figure_str ="Figure"
+        table_str ="Table"
+        if text.lower().startswith(figure_str.lower()):  # case-insensitive prefix match
+          print(text)
+          figures_image_list.append((text,rect_content))
+        elif  text.lower().startswith(table_str.lower()):
+          print(text)
+          tables_image_list.append((text,rect_content))
+        else:
+          above_rect_content_list.append((text, rect_content))  # the tuple holds the box crop (rect_content), not the crop of the region above
+        rect_content_list.append(rect_content)  # NOTE(review): runs for Figure/Table captions too, where above_rect_content_list gets no entry, so the two lists can fall out of step - the visible caller zips them
+        cv2.rectangle(image_np_copy, (x, above_box_y), (x+w, y), color_tuple, cv2.FILLED)  # paint over the region above the box on the copy
+      # above_rect_content = image_np[y-above_check_offset:y, x:x+w]
+      # if np.all(above_rect_content == 255):
+      #   # print("box="+str(box)+"above all white")
+      #   above_rect_content_list.append(None)
+      # else:
+      #   # print("box="+str(box)+"above not all white")
+      #   above_rect_content = image_np[y-above_caption_offset:y, x:x+w]
+      #   above_rect_content_list.append(above_rect_content)
+      #   cv2.rectangle(image_np_copy, (x, y), (x+w, y-above_caption_offset), color_tuple, cv2.FILLED)
+      index += 1
+    # else:
+    #   print("box="+str(box)+"filled")
+  return rect_content_list,above_rect_content_list,  figures_image_list, tables_image_list, image_np_copy  # five values: box crops, above-text entries, figures, tables, and the painted copy of the page
 def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
     bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
     bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
         bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
         color_tuple = (0, 255, 0)
         draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, color_tuple)
+        # st.image(Image.fromarray(bgr_image)) #to_be_displayed
+    text_box_list, above_test_box_list,figures_image_list,tables_image_list, cropped_image = extract_bounding_boxes_from_image_np(gray_pdf_image_np,
+                                                                                             bounding_boxes_list, 30,
+                                                                                             50, (255, 255, 255))
+    if debug:
+        debug_text_box_index = 0
+        for text_box, above_text_box in zip(text_box_list, above_test_box_list):
+            print("text box start")
+            if above_text_box is not None:
+                st.write(above_text_box[0])
+                st.image(Image.fromarray(above_text_box[1]))
+                # st.write(text)
+            st.image(Image.fromarray(text_box))
+            debug_text_box_index = debug_text_box_index + 1
+        for figure in figures_image_list:
+            st.write(figure[0])
+            st.image(Image.fromarray(figure[1]))
+        for table in tables_image_list:
+            st.write(table[0])
+            st.image(Image.fromarray(table[1]))