Spaces:

zmbfeng
/

locked_pdf_ingestion

Sleeping

App Files Files Community

zmbfeng committed on May 25, 2024

Commit

229ebda

1 Parent(s): c6269e3

end to end working

Browse files

Files changed (2) hide show

app.py +1 -0
utils.py +113 -9

app.py CHANGED Viewed

@@ -52,6 +52,7 @@ for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:
   print("index="+str(index))
   text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
 #if 'img_index' not in st.session_state:
 # if st.button("Stop"):

   print("index="+str(index))
   text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
+  st.write(text)
 #if 'img_index' not in st.session_state:
 # if st.button("Stop"):

utils.py CHANGED Viewed

@@ -249,14 +249,72 @@ def draw_edges(np_image):
     thickness = 5
     # Get the dimensions of the image
-    height, width = np_image.shape[:2]
     # Coordinates for the rectangle: start from (0,0) to (width, height)
     # We draw from 0+thickness//2 and width-thickness//2 to respect the thickness and not go out of bounds
     cv2.rectangle(np_image, (thickness // 2, thickness // 2), (width - thickness // 2, height - thickness // 2), color,
                   thickness)
 def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
     bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
     bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
@@ -295,7 +353,7 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
         bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
         draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
         print("detected Lines start")
-        st.image(Image.fromarray(bgr_image)) #to_be_displayed
         print("detected lines end")
         page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
@@ -305,16 +363,62 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
                 print("element start")
                 bgr_image = cv2.cvtColor(element, cv2.COLOR_GRAY2BGR)
                 draw_edges(bgr_image)
-                st.image(Image.fromarray(bgr_image))#to_be_displayed
                 debug_page_segment_index = debug_page_segment_index + 1
                 print("element end")
         min_height_filtered_page_segment_np_list = filter_segments_by_min_height(page_segment_np_list, 50)
         max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
-        if debug:
-            print("max height image start")
-            st.image(Image.fromarray(max_height_image))#to_be_displayed
-            print("max height image end")
     else:
         max_height_image = cropped_image.copy()
     st.write("selected segment")

     thickness = 5
     # Get the dimensions of the image
+    try:
+        height, width = np_image.shape[:2]
+    except Exception as e:
+        print("An error occurred:", e)
     # Coordinates for the rectangle: start from (0,0) to (width, height)
     # We draw from 0+thickness//2 and width-thickness//2 to respect the thickness and not go out of bounds
     cv2.rectangle(np_image, (thickness // 2, thickness // 2), (width - thickness // 2, height - thickness // 2), color,
                   thickness)
+def is_image_np_two_columns(image_np,horizontal_margin,vertical_margin):
   # Report whether a vertical strip down the middle of the image is entirely 255.
   #
   # image_np:          array indexed as [row, column]; shape[0] is read as the
   #                    height and shape[1] as the width.
   # horizontal_margin: half-width of the strip; columns
   #                    [center - margin, center + margin) are inspected.
   # vertical_margin:   number of rows excluded at both the top and the bottom.
   #
   # Returns the result of np.all(...) over the strip (a NumPy boolean).
   # NOTE(review): 255 is presumably "white" in an 8-bit grayscale page, so a
   # fully white gutter is taken to mean a two-column layout - confirm.
   # NOTE(review): if the margins leave an empty strip, np.all over an empty
   # array is True; and a horizontal_margin larger than the center column makes
   # the slice start negative. Neither case is guarded here.
+  page_x_center = image_np.shape[1]//2
+  page_height=image_np.shape[0]
   # Crop the central strip: drop vertical_margin rows top and bottom, keep
   # horizontal_margin columns on each side of the horizontal center.
+  image_middle_np =image_np[vertical_margin:(page_height-vertical_margin), page_x_center-horizontal_margin:page_x_center+horizontal_margin]
+  #display_image_np(image_middle_np)
+  return np.all(image_middle_np == 255)
+def extract_two_columns_text(image_index,image_np,debug):
   # Split a page image at its horizontal center and OCR each half separately.
   #
   # image_index: not used by any active statement in this function (only the
   #              commented-out file-naming code refers to an index).
   # image_np:    page image array; split on axis 1 at shape[1] // 2.
   # debug:       when truthy, both bordered halves are shown with st.image.
   #
   # Returns left_text + right_text (the two pytesseract results concatenated
   # with no separator) when is_image_np_two_columns(image_np, 20, 10) is true;
   # otherwise returns the sentinel string "error".
   # NOTE(review): a call further down this diff passes only two arguments
   # (image, debug), which does not match this three-parameter signature.
+  # formatted_index_string = f"{index:03d}"
+  if is_image_np_two_columns(image_np,20,10):
+      page_x_center = image_np.shape[1] // 2
+      # print(page_x_center)
       # Each half is sliced from its own fresh copy of the input, so the
       # halves are views into copies rather than into image_np itself.
+      temp_array = image_np.copy()
+      left_column_array = temp_array[:, :page_x_center]
+      temp_array = image_np.copy()
+      right_column_array = temp_array[:, page_x_center:]
       # NOTE(review): the same array is converted with COLOR_BGR2RGB (for the
       # PIL image handed to OCR) and with COLOR_GRAY2BGR (for the debug
       # preview). These codes expect different input channel counts - confirm
       # which layout image_np actually has.
+      left_column_img = Image.fromarray(cv2.cvtColor(left_column_array, cv2.COLOR_BGR2RGB))
+      left_column_array_bgr_image = cv2.cvtColor(left_column_array, cv2.COLOR_GRAY2BGR)
       # draw_edges is defined elsewhere in utils.py; presumably it draws a
       # border on the preview copy in place - verify.
+      draw_edges(left_column_array_bgr_image)
+      # imageio.imwrite("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step8_left_column.png", left_column_img)
+      right_column_img = Image.fromarray(cv2.cvtColor(right_column_array, cv2.COLOR_BGR2RGB))
+      right_column_array_bgr_image = cv2.cvtColor(right_column_array, cv2.COLOR_GRAY2BGR)
+      draw_edges(right_column_array_bgr_image)
+      # imageio.imwrite("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step8_right_column.png", right_column_img)
+      if debug:
+          print("left column image start")
+          # display(left_column_img)
+          st.image(Image.fromarray(left_column_array_bgr_image))  # to_be_displayed
+          print("left column image end")
+          print("right column image start")
+          # display(right_column_img)
+          st.image(Image.fromarray(right_column_array_bgr_image))  # to_be_displayed
+          print("right column image end")
       # OCR the left half first, then the right half; both results are also
       # printed to stdout regardless of the debug flag.
+      left_text = pytesseract.image_to_string(left_column_img)
+      # with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_left_column_text.txt", 'w') as file:
+      #   file.write(left_text)
+      print("Extracted Text:\n", left_text)
+      right_text = pytesseract.image_to_string(right_column_img)
+      # with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_right_column_text.txt", 'w') as file:
+      #   file.write(right_text)
+      print("Extracted Text:\n", right_text)
+      return left_text + right_text
+  else:
       # Sentinel value: callers compare the result against the string "error".
+      return "error"
+def get_where_image_np_two_columns_stops(image_np,horizontal_margin,vertical_margin):
   # Locate the pixels in the central vertical strip that are not equal to 255.
   #
   # The strip is cropped exactly as in is_image_np_two_columns: rows
   # [vertical_margin, height - vertical_margin) and columns
   # [center - horizontal_margin, center + horizontal_margin).
   #
   # Returns the tuple produced by np.where(strip != 255); for a 2-D image that
   # is (row_indices, column_indices).
   # NOTE(review): the indices are relative to the cropped strip, not to
   # image_np - row 0 of the result corresponds to row vertical_margin of the
   # full image. Confirm that callers account for this offset.
   # NOTE(review): when the whole strip is 255 the index arrays are empty, so
   # indexing result[0][0] would raise - verify against the caller.
+  page_x_center = image_np.shape[1]//2
+  page_height=image_np.shape[0]
+  image_middle_np =image_np[vertical_margin:(page_height-vertical_margin), page_x_center-horizontal_margin:page_x_center+horizontal_margin]
+  #display_image_np(image_middle_np)
+  return np.where(image_middle_np != 255)
+  # indices = np.where(image_middle_np != 255)
+  # print(len(indices[0]))
+  # for i in range(len(indices[0])):
+  #     print(f"Index: {indices[0][i], indices[1][i]}, Value: {image_middle_np[indices[0][i], indices[1][i]]}")
 def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
     bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
     bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
         bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
         draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
         print("detected Lines start")
+        # st.image(Image.fromarray(bgr_image)) #to_be_displayed
         print("detected lines end")
         page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
                 print("element start")
                 bgr_image = cv2.cvtColor(element, cv2.COLOR_GRAY2BGR)
                 draw_edges(bgr_image)
+                # st.image(Image.fromarray(bgr_image))#to_be_displayed
                 debug_page_segment_index = debug_page_segment_index + 1
                 print("element end")
         min_height_filtered_page_segment_np_list = filter_segments_by_min_height(page_segment_np_list, 50)
         max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
     else:
         max_height_image = cropped_image.copy()
     st.write("selected segment")
+    # print("max height image start")
+    # st.image(Image.fromarray(max_height_image))#to_be_displayed
+    # print("max height image end")
+    text=extract_two_columns_text(image_index,max_height_image,debug)
+    print(text)
+    if text == "error":
+        print("not two columns")
+        max_height_image_converted = Image.fromarray(cv2.cvtColor(max_height_image, cv2.COLOR_BGR2RGB))
+        text = pytesseract.image_to_string(max_height_image_converted)
+        text = text.strip()
+        toc_str="table of contents"
+        # print("Extracted Text:\n", text)
+        if text.lower().startswith(toc_str.lower()):
+        #if "Table of Contents" in text:
+          print("Table of Contents")
+          # display_image_np(max_height_image)
+          #print(text)
+          return("Table of Contents")
+        else:
+          print("not Table of Contents")
+          indeces_stop=get_where_image_np_two_columns_stops(max_height_image,20,10)
+          print(indeces_stop[0][0])
+          print(max_height_image.shape[0])
+          y_start=get_above_box(max_height_image, 0, indeces_stop[0][0],max_height_image.shape[1])
+          if debug:
+            bgr_image = cv2.cvtColor(max_height_image, cv2.COLOR_GRAY2BGR)
+            color_tuple=(0, 255, 0)
+            cv2.rectangle(bgr_image, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), color_tuple, thickness=5)
+            print("still in the middle start")
+            st.image(Image.fromarray(bgr_image))
+            print("still in the middle end")
+          left_over_content =max_height_image[y_start:max_height_image.shape[0], 0:max_height_image.shape[1]]
+          if debug:
+            print("left over start")
+            st.image(Image.fromarray(left_over_content))
+            print("left over end")
+          max_height_image_copy=max_height_image.copy()
+          cv2.rectangle(max_height_image_copy, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), (255, 255, 255), cv2.FILLED)
+          if debug:
+            print("no left over start")
+            st.image(Image.fromarray(max_height_image_copy))
+            print("no left over end")
+          text=extract_two_columns_text(max_height_image_copy,debug)
+          if text == "error":
+            return("error")
+          else:
+            return text
+    else:
+        return text