Spaces:

zmbfeng
/

locked_pdf_ingestion

Sleeping

App Files Files Community

zmbfeng committed on May 25, 2024

Commit

d6482c4

1 Parent(s): 0196fd4

locate boxes and highlight them in green

Browse files

Files changed (2) hide show

app.py +19 -10
utils.py +82 -2

app.py CHANGED Viewed

@@ -22,7 +22,7 @@ if 'is_initialized' not in st.session_state:
     st.session_state['is_initialized'] = True
     page_count = utils.get_pdf_page_count(pdf_path)
     print("page_count=",page_count)
-    page_count=50
     print("new page_count=",page_count)
     read_pdf_progress_bar = st.progress(0)
     st.session_state.color_image_list = []
@@ -39,20 +39,29 @@ if 'is_initialized' not in st.session_state:
         st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
         progress_percentage = (index) / (page_count - 1)
         read_pdf_progress_bar.progress(progress_percentage)
     st.session_state.img_index = 0
     st.session_state.stop_button_clicked=False
 #if 'img_index' not in st.session_state:
-if st.button("Stop"):
-    st.session_state.stop_button_clicked = True
-st.write(str(st.session_state.img_index+1) +"/" + str(len(st.session_state.color_image_list)))
-st.image(st.session_state.gray_image_np_list[st.session_state.img_index], use_column_width=True)
-if not st.session_state.stop_button_clicked:
-    if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
-        st.session_state.img_index += 1
-        time.sleep(3)
-        st.rerun()
 # col1, col2 = st.columns(2)
 # with col1:
 #     if st.button("Previous"):

     st.session_state['is_initialized'] = True
     page_count = utils.get_pdf_page_count(pdf_path)
     print("page_count=",page_count)
+    page_count=5
     print("new page_count=",page_count)
     read_pdf_progress_bar = st.progress(0)
     st.session_state.color_image_list = []
         st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
         progress_percentage = (index) / (page_count - 1)
         read_pdf_progress_bar.progress(progress_percentage)
+    # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
+    # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
+    # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
+    # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
     st.session_state.img_index = 0
     st.session_state.stop_button_clicked=False
+# st.image(st.session_state.gray_image_np_list[38])
+for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
+  print("index="+str(index))
+  text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
 #if 'img_index' not in st.session_state:
+# if st.button("Stop"):
+#     st.session_state.stop_button_clicked = True
+# st.write(str(st.session_state.img_index+1) +"/" + str(len(st.session_state.color_image_list)))
+# st.image(st.session_state.gray_image_np_list[st.session_state.img_index], use_column_width=True)
+# if not st.session_state.stop_button_clicked:
+#     if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
+#         st.session_state.img_index += 1
+#         time.sleep(3)
+#         st.rerun()
 # col1, col2 = st.columns(2)
 # with col1:
 #     if st.button("Previous"):

utils.py CHANGED Viewed

@@ -1,5 +1,8 @@
 import subprocess
 def get_pdf_page_count(pdf_path):
     try:
         # Running pdfinfo command to get information about the PDF
@@ -10,4 +13,81 @@ def get_pdf_page_count(pdf_path):
                 return int(line.split(':')[1].strip())
     except Exception as e:
         print(f"An error occurred: {e}")
-        return None

 import subprocess
+import streamlit as st
+import cv2
+import numpy as np
+from PIL import Image
 def get_pdf_page_count(pdf_path):
     try:
         # Running pdfinfo command to get information about the PDF
                 return int(line.split(':')[1].strip())
     except Exception as e:
         print(f"An error occurred: {e}")
+        return None
+# Extract rectangles with a configurable minimum width and height
+def extract_rectangle_from_image(gray, min_width, min_height):
+    bounding_boxes = []
+    #gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
+    #edges = cv2.Canny(gray, 10, 200, apertureSize=3)
+    kernel = np.ones((3,3), np.uint8)
+    dilated_edges = cv2.dilate(edges, kernel, iterations=1)
+    contours, _ = cv2.findContours(dilated_edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+    #contours, _ = cv2.findContours(edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+    index = 0
+    for cnt in contours:
+        approx = cv2.approxPolyDP(cnt, 0.01*cv2.arcLength(cnt, True), True)
+        #approx = cv2.approxPolyDP(cnt, 0.1*cv2.arcLength(cnt, True), True)
+        if len(approx) == 4:  # Rectangle check
+            x, y, w, h = cv2.boundingRect(approx)
+            # print(f"x: {x}, y: {y}, w: {w}, h: {h}")
+            if w >= min_width and h >= min_height:
+                bounding_boxes.append((x, y, w, h))
+                #print(x, y, w, h)
+    return bounding_boxes
+def is_close(box1, box2, threshold=10):
+    # Calculate the distance between the top-left corners of the two boxes
+    distance = ((box1[0] - box2[0]) ** 2 + (box1[1] - box2[1]) ** 2) ** 0.5
+    return distance < threshold
+def remove_close_boxes(boxes, threshold=10):
+    kept_boxes = []
+    for box in boxes:
+        # Assume the box is not close to others by default
+        is_close_to_others = False
+        for kept_box in kept_boxes:
+            if is_close(box, kept_box, threshold):
+                is_close_to_others = True
+                break
+        # If the box is not close to any box we've kept, add it to the list of kept boxes
+        if not is_close_to_others:
+            kept_boxes.append(box)
+    return kept_boxes
+def is_contained(box1, box2):
+    """
+    Check if box1 is contained within box2.
+    Each box is defined as (x, y, w, h).
+    """
+    x1, y1, w1, h1 = box1
+    x2, y2, w2, h2 = box2
+    # Check if all corners of box1 are inside box2
+    return x2 <= x1 and y2 <= y1 and x2 + w2 >= x1 + w1 and y2 + h2 >= y1 + h1
+def remove_contained_boxes(boxes):
+    """
+    Remove boxes that are contained within other boxes.
+    """
+    non_contained_boxes = []
+    for i, box1 in enumerate(boxes):
+        # Check if there's another box that contains box1
+        if not any(is_contained(box1, box2) for j, box2 in enumerate(boxes) if i != j):
+            non_contained_boxes.append(box1)
+    return non_contained_boxes
+def draw_colored_boxes_on_image_np(image, boxes_list,color_tuple):
+  for x, y, w, h in boxes_list:
+      #x, y, w, h = box[0]
+      cv2.rectangle(image, (x, y), (x + w, y + h), color_tuple, thickness=5)
+def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
+    bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
+    bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
+    bounding_boxes_list = remove_contained_boxes(bounding_boxes_list)
+    if debug:
+        bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
+        color_tuple = (0, 255, 0)
+        draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, color_tuple)
+        st.image(Image.fromarray(bgr_image))