Spaces:

zmbfeng
/

locked_pdf_ingestion

Sleeping

File size: 19,283 Bytes

import subprocess
import streamlit as st
import cv2
import numpy as np
from PIL import Image
import pytesseract
def get_pdf_page_count(pdf_path):
    """Return the page count of a PDF as reported by the ``pdfinfo`` tool.

    Runs ``pdfinfo <pdf_path>`` and parses the ``Pages:`` line of its
    standard output.

    Args:
        pdf_path: Path of the PDF file handed to ``pdfinfo``.

    Returns:
        The page count as an ``int``, or ``None`` when ``pdfinfo`` cannot be
        run, prints no ``Pages:`` line, or the value is not an integer.
    """
    try:
        result = subprocess.run(['pdfinfo', pdf_path], stdout=subprocess.PIPE, text=True)
        for line in result.stdout.splitlines():
            # Match the field name at the start of the line only, so that a
            # metadata value which merely contains "Pages:" (for example in
            # the title) is not mistaken for the page-count line.
            if line.startswith('Pages:'):
                return int(line.split(':', 1)[1].strip())
    except Exception as e:
        # Best effort: any failure (missing binary, unparsable value) is
        # reported on stdout and mapped to None.
        print(f"An error occurred: {e}")
    return None
# Configurable minimum rectangle size for extraction

def extract_rectangle_from_image(gray, min_width, min_height):
    """Collect bounding boxes of four-cornered contours of a minimum size.

    The image goes through Canny edge detection (thresholds 50/150), the
    edges are dilated once with a 3x3 kernel, and contours are extracted from
    the dilated edge map. Every contour whose polygon approximation has
    exactly four vertices contributes the bounding rectangle of that
    approximation, provided it is large enough.

    Args:
        gray: Image array handed to ``cv2.Canny`` (presumably a grayscale
            page image, judging by the name -- confirm with callers).
        min_width: Smallest accepted bounding-box width.
        min_height: Smallest accepted bounding-box height.

    Returns:
        List of ``(x, y, w, h)`` tuples, in contour order.
    """
    edge_map = cv2.Canny(gray, 50, 150, apertureSize=3)
    thick_edges = cv2.dilate(edge_map, np.ones((3, 3), np.uint8), iterations=1)
    contours, _ = cv2.findContours(thick_edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    bounding_boxes = []
    for contour in contours:
        # Approximation tolerance is 1% of the closed contour's perimeter.
        tolerance = 0.01 * cv2.arcLength(contour, True)
        polygon = cv2.approxPolyDP(contour, tolerance, True)
        if len(polygon) != 4:
            continue
        x, y, w, h = cv2.boundingRect(polygon)
        if w >= min_width and h >= min_height:
            bounding_boxes.append((x, y, w, h))
    return bounding_boxes
def is_close(box1, box2, threshold=10):
    """Tell whether the top-left corners of two boxes nearly coincide.

    Only the first two entries (x, y) of each box are used. The boxes are
    close when the Euclidean distance between those corners is strictly
    smaller than ``threshold``.
    """
    dx = box1[0] - box2[0]
    dy = box1[1] - box2[1]
    return (dx ** 2 + dy ** 2) ** 0.5 < threshold
def remove_close_boxes(boxes, threshold=10):
    """Drop boxes whose top-left corner is near an already accepted box.

    Boxes are visited in input order; a box is accepted only if ``is_close``
    is false against every box accepted before it, so the first box of each
    cluster wins.

    Args:
        boxes: Iterable of boxes whose first two entries are x and y.
        threshold: Distance limit forwarded to ``is_close``.

    Returns:
        List of accepted boxes in their original order.
    """
    survivors = []
    for candidate in boxes:
        if any(is_close(candidate, accepted, threshold) for accepted in survivors):
            continue
        survivors.append(candidate)
    return survivors
def is_contained(box1, box2):
    """
    Report whether box1 lies entirely inside box2 (edges may touch).
    Both boxes are given as (x, y, w, h).
    """
    inner_left, inner_top, inner_w, inner_h = box1
    outer_left, outer_top, outer_w, outer_h = box2

    # The inner box may not start to the left of, or above, the outer box.
    if inner_left < outer_left or inner_top < outer_top:
        return False

    # Nor may it extend past the outer box's right or bottom edge.
    fits_horizontally = inner_left + inner_w <= outer_left + outer_w
    fits_vertically = inner_top + inner_h <= outer_top + outer_h
    return fits_horizontally and fits_vertically

def remove_contained_boxes(boxes):
    """
    Remove boxes that are contained within other boxes.

    A box is dropped when ``is_contained`` reports that it lies inside a
    different box of the list. Boxes that are exactly equal contain each
    other; in that case only the first occurrence is kept instead of
    discarding every copy.

    Args:
        boxes: Sequence of ``(x, y, w, h)`` boxes.

    Returns:
        List of the surviving boxes in their original order.
    """
    boxes = list(boxes)
    non_contained_boxes = []

    for i, candidate in enumerate(boxes):
        redundant = False
        for j, other in enumerate(boxes):
            if i == j:
                continue
            if tuple(candidate) == tuple(other):
                # Exact duplicates: only a later copy is redundant, so the
                # first one survives.
                if j < i:
                    redundant = True
                    break
                continue
            if is_contained(candidate, other):
                redundant = True
                break
        if not redundant:
            non_contained_boxes.append(candidate)

    return non_contained_boxes
def draw_colored_boxes_on_image_np(image, boxes_list, color_tuple):
    """Outline every ``(x, y, w, h)`` box on ``image`` in place.

    Each outline is drawn with ``cv2.rectangle`` using ``color_tuple`` and a
    line thickness of 5. Nothing is returned.
    """
    for left, top, box_w, box_h in boxes_list:
        top_left = (left, top)
        bottom_right = (left + box_w, top + box_h)
        cv2.rectangle(image, top_left, bottom_right, color_tuple, thickness=5)

def is_filled_rectangle(image, rect, background_threshold=10, variance_threshold=0.1):
    """Check whether the interior of ``rect`` consists only of zero pixels.

    The interior is the rectangle with one pixel trimmed from every side.
    An empty interior counts as filled.

    Args:
        image: NumPy image array indexed as ``[row, column]``.
        rect: ``(x, y, w, h)`` rectangle.
        background_threshold: Accepted but not used.
        variance_threshold: Accepted but not used.

    Returns:
        NumPy boolean, true when every interior value equals 0.
    """
    left, top, box_w, box_h = rect
    interior = image[top + 1:top + box_h - 1, left + 1:left + box_w - 1]
    return (interior == 0).all()
def get_below_box(image_np, x, y, width, step=15):
    """Scan downward from row ``y`` for the first all-white band.

    Bands are ``step`` rows high and cover columns ``x`` to ``x + width``.
    The scan advances ``step`` rows at a time while the band still ends
    inside the image.

    Returns:
        ``-1`` when ``y + step`` is not inside the image. Otherwise the top
        row of the first band whose pixels all equal 255, or -- when no such
        band is found -- the last row position reached by the scan.
    """
    page_height = image_np.shape[0]
    if y + step >= page_height:
        return -1

    cursor = y
    while cursor + step < page_height:
        band = image_np[cursor:cursor + step, x:x + width]
        if np.all(band == 255):
            break
        cursor += step
    return cursor
def get_above_box(image_np, x, y, width, step=15):
    """Scan upward from row ``y`` for the first all-white band.

    Bands are the ``step`` rows directly above the current position and
    cover columns ``x`` to ``x + width``. The scan moves up ``step`` rows at
    a time while the band still starts below row 0.

    Returns:
        ``-1`` when ``y - step`` is not greater than 0. Otherwise the row
        just below the first band whose pixels all equal 255, or -- when no
        such band is found -- the last row position reached by the scan.
    """
    if y - step <= 0:
        return -1

    cursor = y
    while cursor - step > 0:
        band = image_np[cursor - step:cursor, x:x + width]
        if np.all(band == 255):
            break
        cursor -= step
    return cursor
def is_note_rectangle(image_np, rect):
    """Report whether the text inside a rectangle starts with "note".

    The interior of ``rect`` (one pixel trimmed from every side) is OCR'd
    with pytesseract; the stripped text is compared case-insensitively. The
    result is also printed.

    Args:
        image_np: Page image array, either 2-D (single-channel) or colour.
        rect: ``(x, y, w, h)`` rectangle.

    Returns:
        ``True`` when the recognised text starts with "note", else ``False``.
        A rectangle with an empty interior yields ``False``.
    """
    x, y, w, h = rect
    roi = image_np[y + 1:y + h - 1, x + 1:x + w - 1]

    if roi.size == 0:
        # Nothing to OCR; neither cv2 nor PIL accepts an empty crop.
        starts_with_note = False
    else:
        # cv2.COLOR_BGR2RGB only accepts colour input, so a single-channel
        # crop is handed to PIL unchanged.
        if roi.ndim == 2:
            roi_converted = Image.fromarray(roi)
        else:
            roi_converted = Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))
        text = pytesseract.image_to_string(roi_converted).strip()
        starts_with_note = text.lower().startswith("note")

    print("is note text box=" + str(starts_with_note))
    return starts_with_note
def extract_bounding_boxes_from_image_np(image_np, bounding_boxes_list, above_check_offset, above_caption_offset, color_tuple):
    """Cut boxed regions and their captions out of a page image.

    Every box that ``is_filled_rectangle`` does not report as filled is
    processed as follows:

    1. The region is extended downward to the row returned by
       ``get_below_box`` unless that search fails (-1) or the box is a note
       box (``is_note_rectangle``); in those cases the box itself is used.
    2. The region is cropped from ``image_np`` and painted over with
       ``color_tuple`` on a working copy of the page.
    3. ``get_above_box`` looks for content directly above the box. If some
       is found it is OCR'd: text starting with "figure" or "table"
       (case-insensitive) files the crop under figures or tables, any other
       text is stored with the crop in the "above" list. The caption area is
       painted over as well.

    Args:
        image_np: Page image array (2-D single-channel or colour). It is not
            modified.
        bounding_boxes_list: Iterable of ``(x, y, w, h)`` boxes.
        above_check_offset: Accepted for call compatibility; not used.
        above_caption_offset: Accepted for call compatibility; not used.
        color_tuple: Fill colour passed to ``cv2.rectangle``.

    Returns:
        Tuple ``(rect_content_list, above_rect_content_list,
        figures_image_list, tables_image_list, image_np_copy)``.
        ``rect_content_list`` holds one crop per processed box.
        ``above_rect_content_list`` holds ``None`` for boxes with nothing
        above them and ``(text, crop)`` for uncategorised captions; boxes
        filed as figure or table add no entry, so the two lists are not
        index-aligned once a figure or table has been found.
        ``image_np_copy`` is the page with all extracted areas painted over.
    """
    image_np_copy = image_np.copy()
    rect_content_list = []
    above_rect_content_list = []
    figures_image_list = []
    tables_image_list = []

    for box in bounding_boxes_list:
        x, y, w, h = box
        if is_filled_rectangle(image_np_copy, box):
            continue

        # Decide how far down the extracted region reaches.
        y_index = get_below_box(image_np, x, y + h, w)
        if y_index == -1 or is_note_rectangle(image_np_copy, box):
            bottom = y + h
        else:
            bottom = y_index
        rect_content = image_np[y:bottom, x:x + w]
        # bottom is never above y + h, so this single fill also covers the
        # box itself.
        cv2.rectangle(image_np_copy, (x, y), (x + w, bottom), color_tuple, cv2.FILLED)
        rect_content_list.append(rect_content)

        above_box_y = get_above_box(image_np, x, y, w)
        if above_box_y == -1 or above_box_y == y:
            above_rect_content_list.append(None)
            continue

        above_rect_content = image_np[above_box_y:y, x:x + w]
        # cv2.COLOR_BGR2RGB only accepts colour input, so a single-channel
        # crop is handed to PIL unchanged.
        if above_rect_content.ndim == 2:
            above_converted = Image.fromarray(above_rect_content)
        else:
            above_converted = Image.fromarray(cv2.cvtColor(above_rect_content, cv2.COLOR_BGR2RGB))
        text = pytesseract.image_to_string(above_converted).strip()
        lowered = text.lower()
        if lowered.startswith("figure"):
            print(text)
            figures_image_list.append((text, rect_content))
        elif lowered.startswith("table"):
            print(text)
            tables_image_list.append((text, rect_content))
        else:
            above_rect_content_list.append((text, rect_content))

        cv2.rectangle(image_np_copy, (x, above_box_y), (x + w, y), color_tuple, cv2.FILLED)

    return rect_content_list, above_rect_content_list, figures_image_list, tables_image_list, image_np_copy
def find_hor_lines_in_image_np(min_width, min_height, image_np):
    """Detect long horizontal lines in a page image.

    The image is morphologically closed with a ``min_width`` x ``min_height``
    rectangular structuring element, the result is run through Canny edge
    detection, and line segments are extracted with the probabilistic Hough
    transform.

    Args:
        min_width: Width of the structuring element (the caller in this file
            passes 1050).
        min_height: Height of the structuring element (the caller in this
            file passes 5).
        image_np: Page image array to search.

    Returns:
        Whatever ``cv2.HoughLinesP`` returns: an array of segments, each
        unpacked by callers as ``line[0] == (x1, y1, x2, y2)``, or ``None``
        when no segment is found (callers check for ``None``).
    """
    # The kernel size comes from the arguments instead of a fixed (1050, 5),
    # so the function honours what the caller asks for.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (min_width, min_height))
    morphed = cv2.morphologyEx(image_np, cv2.MORPH_CLOSE, kernel)

    edges = cv2.Canny(morphed, 50, 150, apertureSize=3)

    return cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=100, minLineLength=100, maxLineGap=10)
def draw_colored_lines_on_image_np(image, lines, color_tuple):
    """Draw line segments on ``image`` in place, 3 pixels thick.

    Args:
        image: Image array to draw on.
        lines: Segments in the layout this file gets from ``cv2.HoughLinesP``
            (``line[0]`` is ``(x1, y1, x2, y2)``). ``None`` is accepted and
            draws nothing.
        color_tuple: Line colour passed to ``cv2.line``.
    """
    if lines is None:
        # No detected segments: nothing to draw.
        return
    for line in lines:
        # Plain ints keep cv2 from rejecting NumPy scalar coordinates.
        x1, y1, x2, y2 = (int(value) for value in line[0])
        cv2.line(image, (x1, y1), (x2, y2), color_tuple, 3)
def segment_image_np(image_np, hor_lines_list):
    """Split a page image into horizontal strips at the given lines.

    Lines are processed in ascending order of their first y-coordinate. Each
    strip runs from the second y-coordinate of the previous line (row 0 for
    the first strip) up to the first y-coordinate of the current line; a
    final strip covers the rest of the image.

    Args:
        image_np: Image array indexed as ``[row, column]``.
        hor_lines_list: Iterable of lines where ``line[0]`` is
            ``(x1, y1, x2, y2)``.

    Returns:
        List of image slices, one more than the number of lines. Slices may
        be empty.
    """
    strips = []
    top = 0
    ordered_lines = sorted(hor_lines_list, key=lambda entry: entry[0][1])
    for entry in ordered_lines:
        _, y_first, _, y_second = entry[0]
        strips.append(image_np[top:y_first, :])
        # The next strip starts where this line ends.
        top = y_second

    strips.append(image_np[top:, :])
    return strips
def filter_segments_by_min_height(segments, min_height):
    """Keep the segments whose row count is strictly greater than ``min_height``."""
    tall_enough = []
    for piece in segments:
        if piece.shape[0] > min_height:
            tall_enough.append(piece)
    return tall_enough

def draw_edges(np_image):
    """Draw a green border, 5 pixels thick, along the edges of ``np_image``.

    The image is modified in place. If the image dimensions cannot be read
    (for example ``None`` or a 1-D array is passed), the problem is printed
    and nothing is drawn.

    Args:
        np_image: Image array with at least two dimensions.
    """
    color = (0, 255, 0)  # Green
    thickness = 5

    try:
        height, width = np_image.shape[:2]
    except (AttributeError, TypeError, ValueError) as e:
        # Without dimensions there is nothing sensible to draw; stop here
        # instead of failing later on undefined width/height.
        print("An error occurred:", e)
        return

    # Inset the rectangle by half the thickness so the stroke stays inside
    # the image bounds.
    inset = thickness // 2
    cv2.rectangle(np_image, (inset, inset), (width - inset, height - inset), color,
                  thickness)
def is_image_np_two_columns(image_np, horizontal_margin, vertical_margin):
    """Check whether the vertical strip at the page centre is entirely white.

    The strip spans ``horizontal_margin`` columns on each side of the
    horizontal midpoint and excludes ``vertical_margin`` rows at the top and
    at the bottom.

    Returns:
        NumPy boolean, true when every value in the strip equals 255.
    """
    page_height, page_width = image_np.shape[0], image_np.shape[1]
    middle = page_width // 2
    gutter = image_np[
        vertical_margin:page_height - vertical_margin,
        middle - horizontal_margin:middle + horizontal_margin,
    ]
    return (gutter == 255).all()
def extract_two_columns_text(image_index, image_np, debug):
    """OCR a two-column page image one column at a time.

    The page counts as two-column when ``is_image_np_two_columns`` (margins
    20 and 10) reports a blank centre strip. It is then split at the
    horizontal midpoint and each half is OCR'd with pytesseract, left half
    first. Both recognised texts are printed.

    Args:
        image_index: Page index; not used by the current implementation.
        image_np: Page image array, either 2-D (single-channel) or colour.
        debug: When true, the column start/end debug markers are printed.

    Returns:
        Left-column text followed by right-column text, or the sentinel
        string ``"error"`` when the page is not detected as two-column.
    """
    if not is_image_np_two_columns(image_np, 20, 10):
        return "error"

    def to_pil(column_array):
        # cv2.COLOR_BGR2RGB only accepts colour input, so a single-channel
        # column is handed to PIL unchanged.
        if column_array.ndim == 2:
            return Image.fromarray(column_array)
        return Image.fromarray(cv2.cvtColor(column_array, cv2.COLOR_BGR2RGB))

    page_x_center = image_np.shape[1] // 2
    left_column_img = to_pil(image_np[:, :page_x_center])
    right_column_img = to_pil(image_np[:, page_x_center:])

    if debug:
        # Only the markers remain; the image display calls that used to sit
        # between them are disabled.
        print("left column image start")
        print("left column image end")
        print("right column image start")
        print("right column image end")

    left_text = pytesseract.image_to_string(left_column_img)
    print("Extracted Text:\n", left_text)
    right_text = pytesseract.image_to_string(right_column_img)
    print("Extracted Text:\n", right_text)
    return left_text + right_text
def get_where_image_np_two_columns_stops(image_np, horizontal_margin, vertical_margin):
    """Locate the non-white pixels inside the centre strip of the page.

    The strip spans ``horizontal_margin`` columns on each side of the
    horizontal midpoint and excludes ``vertical_margin`` rows at the top and
    at the bottom.

    Returns:
        Tuple of index arrays (one per image dimension) of the values that
        differ from 255. The indices are relative to the strip, not to the
        full image.
    """
    page_height, page_width = image_np.shape[0], image_np.shape[1]
    middle = page_width // 2
    gutter = image_np[
        vertical_margin:page_height - vertical_margin,
        middle - horizontal_margin:middle + horizontal_margin,
    ]
    return np.nonzero(gutter != 255)
def gray_pdf_image_np_to_text(image_index, gray_pdf_image_np, debug=False):
    """Extract figures, tables and two-column body text from one page image.

    Steps:

    1. Detect large rectangles (at least 500 x 20), drop near-duplicates and
       nested ones, and cut them (plus captions) out of the page.
    2. Look for long horizontal lines on the remaining page, split the page
       at those lines and keep the tallest strip above 50 rows. Without
       lines, or without a strip that tall, the whole remaining page is used.
    3. OCR that area as two columns. If it is not two-column, OCR it as a
       whole to recognise a table of contents; otherwise blank everything
       from the first centre-strip obstruction downward and retry the
       two-column OCR.

    Args:
        image_index: Page index, forwarded to ``extract_two_columns_text``.
        gray_pdf_image_np: Single-channel page image array (it is converted
            with ``cv2.COLOR_GRAY2BGR`` for the debug drawings).
        debug: When true, extra debug output is printed and debug drawings
            are prepared.

    Returns:
        ``(figures_image_list, tables_image_list, text)`` on success; the
        string ``"Table of Contents"`` when the page text starts with that
        phrase; the string ``"error"`` when two-column extraction still fails
        after blanking the obstructed part.
    """
    horizontal_margin = 20
    vertical_margin = 10

    bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
    bounding_boxes_list = remove_close_boxes(bounding_boxes_list, 10)
    bounding_boxes_list = remove_contained_boxes(bounding_boxes_list)
    if debug:
        bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
        draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, (0, 255, 0))

    (text_box_list, above_text_box_list, figures_image_list, tables_image_list,
     cropped_image) = extract_bounding_boxes_from_image_np(
        gray_pdf_image_np, bounding_boxes_list, 30, 50, (255, 255, 255))
    if debug:
        for _text_box, above_text_box in zip(text_box_list, above_text_box_list):
            print("text box start")
            if above_text_box is not None:
                print(above_text_box[0])
        for figure in figures_image_list:
            print(figure[0])
        for table in tables_image_list:
            print(table[0])

    found_hor_lines_list = find_hor_lines_in_image_np(1050, 5, cropped_image)
    if found_hor_lines_list is not None:
        bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
        draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
        print("detected Lines start")
        print("detected lines end")
        page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
        if debug:
            for element in page_segment_np_list:
                print("element start")
                # Empty strips cannot be colour-converted; skip their preview.
                if element.size:
                    bgr_image = cv2.cvtColor(element, cv2.COLOR_GRAY2BGR)
                    draw_edges(bgr_image)
                print("element end")
        min_height_filtered_page_segment_np_list = filter_segments_by_min_height(page_segment_np_list, 50)
        if min_height_filtered_page_segment_np_list:
            max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
        else:
            # No strip is tall enough: fall back to the whole page instead of
            # calling max() on an empty list.
            max_height_image = cropped_image.copy()
    else:
        max_height_image = cropped_image.copy()

    print("start text extraction")
    text = extract_two_columns_text(image_index, max_height_image, debug)
    print("gray_pdf_image_np_to_text extracted text", text)
    if text != "error":
        return figures_image_list, tables_image_list, text

    print("not two columns")
    # cv2.COLOR_BGR2RGB only accepts colour input, so the single-channel page
    # is handed to PIL unchanged.
    if max_height_image.ndim == 2:
        max_height_image_converted = Image.fromarray(max_height_image)
    else:
        max_height_image_converted = Image.fromarray(cv2.cvtColor(max_height_image, cv2.COLOR_BGR2RGB))
    text = pytesseract.image_to_string(max_height_image_converted).strip()
    if text.lower().startswith("table of contents"):
        print("Table of Contents")
        return "Table of Contents"

    print("not Table of Contents")
    page_height, page_width = max_height_image.shape[0], max_height_image.shape[1]
    indeces_stop = get_where_image_np_two_columns_stops(
        max_height_image, horizontal_margin, vertical_margin)
    # The returned rows are relative to the centre strip, which starts
    # vertical_margin rows below the top of the image.
    first_blocked_row = int(indeces_stop[0][0]) + vertical_margin
    print(first_blocked_row)
    print(page_height)
    y_start = get_above_box(max_height_image, 0, first_blocked_row, page_width)
    if debug:
        bgr_image = cv2.cvtColor(max_height_image, cv2.COLOR_GRAY2BGR)
        cv2.rectangle(bgr_image, (0, y_start), (page_width, page_height), (0, 255, 0), thickness=5)
        print("still in the middle start")
        print("still in the middle end")
        print("left over start")
        print("left over end")

    # Blank everything from y_start downward, then retry as two columns.
    max_height_image_copy = max_height_image.copy()
    cv2.rectangle(max_height_image_copy, (0, y_start), (page_width, page_height), (255, 255, 255), cv2.FILLED)
    if debug:
        print("no left over start")
        print("no left over end")

    # image_index has to be passed along; the function takes three arguments.
    text = extract_two_columns_text(image_index, max_height_image_copy, debug)
    if text == "error":
        return "error"
    return figures_image_list, tables_image_list, text