"""Extract text from rendered PDF page images using OpenCV and Tesseract.

The entry point is ``gray_pdf_image_np_to_text``; the other functions are the
individual image-processing steps it chains together.
"""

import subprocess

import cv2
import numpy as np
import pytesseract
import streamlit as st
from PIL import Image


def _to_pil_image(image_np):
    """Wrap an OpenCV-style array in a PIL image.

    Single-channel arrays are wrapped unchanged; 3/4-channel arrays are
    reordered from BGR to RGB. ``cv2.COLOR_BGR2RGB`` rejects single-channel
    input, so the conversion is only applied when colour channels exist.
    """
    if image_np.ndim == 2:
        return Image.fromarray(image_np)
    if image_np.shape[2] == 1:
        return Image.fromarray(image_np[:, :, 0])
    return Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))


def _as_bgr(image_np):
    """Return a new 3-channel copy of ``image_np`` to draw coloured overlays on."""
    if image_np.ndim == 2 or image_np.shape[2] == 1:
        return cv2.cvtColor(image_np, cv2.COLOR_GRAY2BGR)
    return image_np.copy()


def get_pdf_page_count(pdf_path):
    """Return the page count reported by the ``pdfinfo`` command, or None.

    None is returned when no ``Pages:`` line is found or when running the
    command / parsing its output fails (the error is printed).
    """
    try:
        result = subprocess.run(['pdfinfo', pdf_path], stdout=subprocess.PIPE, text=True)
        for line in result.stdout.split('\n'):
            # Match on the field name at the start of the line so that other
            # metadata values containing "Pages:" are not parsed by mistake.
            if line.startswith('Pages:'):
                return int(line.split(':', 1)[1].strip())
    except Exception as e:  # deliberate best-effort: report and return None
        print(f"An error occurred: {e}")
    return None


def extract_rectangle_from_image(gray, min_width, min_height):
    """Return bounding boxes ``(x, y, w, h)`` of four-cornered contours.

    Edges are detected with Canny, dilated with a 3x3 kernel, and every
    contour whose polygon approximation has exactly four vertices is kept
    when its bounding box is at least ``min_width`` x ``min_height``.
    """
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    kernel = np.ones((3, 3), np.uint8)
    dilated_edges = cv2.dilate(edges, kernel, iterations=1)
    contours, _ = cv2.findContours(dilated_edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    bounding_boxes = []
    for cnt in contours:
        approx = cv2.approxPolyDP(cnt, 0.01 * cv2.arcLength(cnt, True), True)
        if len(approx) != 4:  # only quadrilaterals are rectangle candidates
            continue
        x, y, w, h = cv2.boundingRect(approx)
        if w >= min_width and h >= min_height:
            bounding_boxes.append((x, y, w, h))
    return bounding_boxes


def is_close(box1, box2, threshold=10):
    """Return True when the top-left corners of the boxes are closer than ``threshold``."""
    distance = ((box1[0] - box2[0]) ** 2 + (box1[1] - box2[1]) ** 2) ** 0.5
    return distance < threshold


def remove_close_boxes(boxes, threshold=10):
    """Drop every box whose top-left corner is close to an already kept box.

    Boxes are visited in order, so the first box of each cluster survives.
    """
    kept_boxes = []
    for box in boxes:
        if not any(is_close(box, kept_box, threshold) for kept_box in kept_boxes):
            kept_boxes.append(box)
    return kept_boxes


def is_contained(box1, box2):
    """Check if box1 is contained within box2 (borders may touch).

    Each box is defined as (x, y, w, h).
    """
    x1, y1, w1, h1 = box1
    x2, y2, w2, h2 = box2
    return x2 <= x1 and y2 <= y1 and x2 + w2 >= x1 + w1 and y2 + h2 >= y1 + h1


def remove_contained_boxes(boxes):
    """Remove boxes that are contained within other boxes.

    Identical boxes contain each other; only the later duplicates are
    removed so that one copy always survives.
    """
    non_contained_boxes = []
    for i, box1 in enumerate(boxes):
        swallowed = any(
            is_contained(box1, box2) and (tuple(box2) != tuple(box1) or j < i)
            for j, box2 in enumerate(boxes)
            if i != j
        )
        if not swallowed:
            non_contained_boxes.append(box1)
    return non_contained_boxes


def draw_colored_boxes_on_image_np(image, boxes_list, color_tuple):
    """Draw each ``(x, y, w, h)`` box on ``image`` in place, 5 px thick."""
    for x, y, w, h in boxes_list:
        cv2.rectangle(image, (x, y), (x + w, y + h), color_tuple, thickness=5)


def is_filled_rectangle(image, rect, background_threshold=10, variance_threshold=0.1):
    """Return True when every pixel strictly inside ``rect`` equals 0.

    ``background_threshold`` and ``variance_threshold`` are accepted but not
    used by the current implementation.
    """
    # NOTE(review): the pipeline fills processed regions with white (255) but
    # this tests for 0 - confirm which value is meant to mark "filled".
    x, y, w, h = rect
    roi = image[y + 1:y + h - 1, x + 1:x + w - 1]
    return np.all(roi == 0)


def get_below_box(image_np, x, y, width, step=15):
    """Scan downwards from ``y`` for a band of ``step`` rows that is all 255.

    Returns the top row of the first such band within columns
    ``x:x+width``, the last scanned position when none is found before the
    bottom of the image, or -1 when ``y + step`` is already outside the image.
    """
    image_height = image_np.shape[0]
    if y + step >= image_height:
        return -1
    index_y = y
    while index_y + step < image_height:
        if np.all(image_np[index_y:index_y + step, x:x + width] == 255):
            break
        index_y += step
    return index_y


def get_above_box(image_np, x, y, width, step=15):
    """Scan upwards from ``y`` for a band of ``step`` rows that is all 255.

    Returns the bottom row of the first such band within columns
    ``x:x+width`` (``y`` itself when the band directly above is blank), the
    last scanned position when none is found, or -1 when ``y - step`` is not
    positive.
    """
    if y - step <= 0:
        return -1
    index_y = y
    while index_y - step > 0:
        if np.all(image_np[index_y - step:index_y, x:x + width] == 255):
            break
        index_y -= step
    return index_y


def is_note_rectangle(image_np, rect):
    """Return True when the OCR text inside ``rect`` starts with "note" (any case)."""
    x, y, w, h = rect
    roi = image_np[y + 1:y + h - 1, x + 1:x + w - 1]
    text = pytesseract.image_to_string(_to_pil_image(roi)).strip()
    note_str = "note"
    starts_with_note = text.lower().startswith(note_str.lower())
    print("is note text box=" + str(starts_with_note))
    return starts_with_note


def extract_bounding_boxes_from_image_np(image_np, bounding_boxes_list, above_check_offset,
                                         above_caption_offset, color_tuple):
    """Cut boxed regions out of the page and classify them by their caption.

    For every box that ``is_filled_rectangle`` does not report as filled, the
    region (extended downwards to the next blank band unless the box is a
    "note" box) is sliced from ``image_np`` and painted over with
    ``color_tuple`` on a copy of the image. Text found directly above the box
    is OCR'd: regions captioned "Figure..." or "Table..." go to the figure /
    table lists, everything else to the generic lists.

    ``above_check_offset`` and ``above_caption_offset`` are accepted but not
    used by the current implementation.

    Returns:
        (rect_content_list, above_rect_content_list, figures_image_list,
        tables_image_list, image_np_copy). The first two lists are appended
        to together, so they can be zipped; ``above_rect_content_list`` holds
        None or ``(caption_text, region)`` tuples.
    """
    image_np_copy = image_np.copy()
    rect_content_list = []
    above_rect_content_list = []
    figures_image_list = []
    tables_image_list = []

    for box in bounding_boxes_list:
        x, y, w, h = box
        if is_filled_rectangle(image_np_copy, box):
            continue

        # Extend the region to the blank band below the box, except for note
        # boxes or when there is no room left below.
        y_index = get_below_box(image_np, x, y + h, w)
        if y_index == -1 or is_note_rectangle(image_np_copy, box):
            bottom = y + h
        else:
            bottom = y_index  # never above y + h: the scan starts there
        rect_content = image_np[y:bottom, x:x + w]
        cv2.rectangle(image_np_copy, (x, y), (x + w, bottom), color_tuple, cv2.FILLED)

        above_box_y = get_above_box(image_np, x, y, w)
        if above_box_y == -1 or above_box_y == y:
            # Nothing (or only blank space) directly above the box.
            above_rect_content_list.append(None)
            rect_content_list.append(rect_content)
            continue

        above_rect_content = image_np[above_box_y:y, x:x + w]
        text = pytesseract.image_to_string(_to_pil_image(above_rect_content)).strip()
        figure_str = "Figure"
        table_str = "Table"
        if text.lower().startswith(figure_str.lower()):
            print(text)
            figures_image_list.append((text, rect_content))
        elif text.lower().startswith(table_str.lower()):
            print(text)
            tables_image_list.append((text, rect_content))
        else:
            above_rect_content_list.append((text, rect_content))
            rect_content_list.append(rect_content)
        # Paint over the caption area as well so it is not OCR'd again later.
        cv2.rectangle(image_np_copy, (x, above_box_y), (x + w, y), color_tuple, cv2.FILLED)

    return (rect_content_list, above_rect_content_list, figures_image_list,
            tables_image_list, image_np_copy)


def find_hor_lines_in_image_np(min_width, min_height, image_np):
    """Detect line segments with ``cv2.HoughLinesP`` after a wide morphological close.

    Returns the raw ``HoughLinesP`` result, which is None when no line is found.
    """
    # NOTE(review): min_width / min_height are currently unused; the kernel
    # size is fixed at (1050, 5) - adjust according to the document.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1050, 5))
    morphed = cv2.morphologyEx(image_np, cv2.MORPH_CLOSE, kernel)
    edges = cv2.Canny(morphed, 50, 150, apertureSize=3)
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=100,
                            minLineLength=100, maxLineGap=10)
    return lines


def draw_colored_lines_on_image_np(image, lines, color_tuple):
    """Draw each ``HoughLinesP``-style line on ``image`` in place, 3 px thick."""
    for line in lines:
        x1, y1, x2, y2 = line[0]
        cv2.line(image, (x1, y1), (x2, y2), color_tuple, 3)


def segment_image_np(image_np, hor_lines_list):
    """Split the image into horizontal strips at the given lines.

    Lines are processed in order of their first y-coordinate; each strip ends
    at a line's ``y1`` and the next one starts at its ``y2``. The remainder
    below the last line is appended as the final strip.
    """
    segments = []
    previous_y = 0
    for line in sorted(hor_lines_list, key=lambda x: x[0][1]):
        x1, y1, x2, y2 = line[0]
        segments.append(image_np[previous_y:y1, :])
        previous_y = y2  # next segment starts from the end of the current line
    segments.append(image_np[previous_y:, :])
    return segments


def filter_segments_by_min_height(segments, min_height):
    """Keep only the segments strictly taller than ``min_height`` rows."""
    return [segment for segment in segments if segment.shape[0] > min_height]


def draw_edges(np_image):
    """Draw a green 5 px frame along the border of ``np_image`` in place.

    If the image dimensions cannot be read, the error is printed and nothing
    is drawn.
    """
    color = (0, 255, 0)  # Green
    thickness = 5
    try:
        height, width = np_image.shape[:2]
    except (AttributeError, ValueError) as e:
        print("An error occurred:", e)
        return
    # Inset by half the thickness so the frame stays inside the image bounds.
    cv2.rectangle(np_image, (thickness // 2, thickness // 2),
                  (width - thickness // 2, height - thickness // 2), color, thickness)


def is_image_np_two_columns(image_np, horizontal_margin, vertical_margin):
    """Return True when the central vertical strip of the image is entirely 255.

    The strip is ``2 * horizontal_margin`` wide around the horizontal centre
    and excludes ``vertical_margin`` rows at the top and bottom.
    """
    page_x_center = image_np.shape[1] // 2
    page_height = image_np.shape[0]
    image_middle_np = image_np[vertical_margin:(page_height - vertical_margin),
                               page_x_center - horizontal_margin:page_x_center + horizontal_margin]
    return np.all(image_middle_np == 255)


def extract_two_columns_text(image_index, image_np, debug):
    """OCR the left and right halves of a two-column image.

    Returns the left text followed by the right text, or the string
    ``"error"`` when ``is_image_np_two_columns`` rejects the image.
    ``image_index`` is accepted but not used by the current implementation.
    """
    if not is_image_np_two_columns(image_np, 20, 10):
        return "error"

    page_x_center = image_np.shape[1] // 2
    left_column_array = image_np[:, :page_x_center].copy()
    right_column_array = image_np[:, page_x_center:].copy()
    left_column_img = _to_pil_image(left_column_array)
    right_column_img = _to_pil_image(right_column_array)

    if debug:
        left_column_array_bgr_image = _as_bgr(left_column_array)
        draw_edges(left_column_array_bgr_image)
        right_column_array_bgr_image = _as_bgr(right_column_array)
        draw_edges(right_column_array_bgr_image)
        print("left column image start")
        # st.image(Image.fromarray(left_column_array_bgr_image))  # to_be_displayed
        print("left column image end")
        print("right column image start")
        # st.image(Image.fromarray(right_column_array_bgr_image))  # to_be_displayed
        print("right column image end")

    left_text = pytesseract.image_to_string(left_column_img)
    print("Extracted Text:\n", left_text)
    right_text = pytesseract.image_to_string(right_column_img)
    print("Extracted Text:\n", right_text)
    return left_text + right_text


def get_where_image_np_two_columns_stops(image_np, horizontal_margin, vertical_margin):
    """Return ``np.where`` indices of non-255 pixels in the central strip.

    The strip is the same one ``is_image_np_two_columns`` inspects, so the
    indices are relative to that strip, not to the full image.
    """
    page_x_center = image_np.shape[1] // 2
    page_height = image_np.shape[0]
    image_middle_np = image_np[vertical_margin:(page_height - vertical_margin),
                               page_x_center - horizontal_margin:page_x_center + horizontal_margin]
    return np.where(image_middle_np != 255)


def gray_pdf_image_np_to_text(image_index, gray_pdf_image_np, debug=False):
    """Extract the body text of one rendered PDF page.

    Steps: detect and blank out boxed regions (figures, tables, notes), split
    the page at detected horizontal lines, pick the tallest segment, and OCR
    it as two columns. When the segment is not two-column, it is OCR'd as a
    whole to detect a table of contents; otherwise everything from the
    obstruction in the column gap downwards is blanked and the two-column OCR
    is retried.

    Returns:
        The extracted text, the string "Table of Contents", or the string
        "error" when the retry is still not recognised as two columns.
    """
    bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
    bounding_boxes_list = remove_close_boxes(bounding_boxes_list, 10)
    bounding_boxes_list = remove_contained_boxes(bounding_boxes_list)
    if debug:
        bgr_image = _as_bgr(gray_pdf_image_np)
        draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, (0, 255, 0))
        # st.image(Image.fromarray(bgr_image))  # to_be_displayed

    (text_box_list, above_test_box_list, figures_image_list, tables_image_list,
     cropped_image) = extract_bounding_boxes_from_image_np(
        gray_pdf_image_np, bounding_boxes_list, 30, 50, (255, 255, 255))
    if debug:
        for text_box, above_text_box in zip(text_box_list, above_test_box_list):
            print("text box start")
            if above_text_box is not None:
                print(above_text_box[0])  # to_be_displayed
                # st.image(Image.fromarray(above_text_box[1]))  # to_be_displayed
            # st.image(Image.fromarray(text_box))  # to_be_displayed
        for figure in figures_image_list:
            print(figure[0])
            # st.image(Image.fromarray(figure[1]))  # to_be_displayed
        for table in tables_image_list:
            print(table[0])
            # st.image(Image.fromarray(table[1]))  # to_be_displayed
        # st.image(Image.fromarray(cropped_image))  # to_be_displayed

    found_hor_lines_list = find_hor_lines_in_image_np(1050, 5, cropped_image)
    max_height_image = None
    if found_hor_lines_list is not None:
        bgr_image = _as_bgr(gray_pdf_image_np)
        draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
        print("detected Lines start")
        # st.image(Image.fromarray(bgr_image))  # to_be_displayed
        print("detected lines end")
        page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
        if debug:
            for element in page_segment_np_list:
                print("element start")
                if element.size:  # cvtColor rejects empty segments
                    bgr_image = _as_bgr(element)
                    draw_edges(bgr_image)
                    # st.image(Image.fromarray(bgr_image))  # to_be_displayed
                print("element end")
        min_height_filtered_page_segment_np_list = filter_segments_by_min_height(
            page_segment_np_list, 50)
        if min_height_filtered_page_segment_np_list:
            max_height_image = max(min_height_filtered_page_segment_np_list,
                                   key=lambda image: image.shape[0])
    if max_height_image is None:
        # No lines found, or no segment tall enough: use the whole page.
        max_height_image = cropped_image.copy()

    print("start text extraction")
    text = extract_two_columns_text(image_index, max_height_image, debug)
    print("gray_pdf_image_np_to_text extracted text", text)
    if text != "error":
        return text

    print("not two columns")
    text = pytesseract.image_to_string(_to_pil_image(max_height_image)).strip()
    toc_str = "table of contents"
    if text.lower().startswith(toc_str.lower()):
        print("Table of Contents")
        return "Table of Contents"

    print("not Table of Contents")
    indeces_stop = get_where_image_np_two_columns_stops(max_height_image, 20, 10)
    print(indeces_stop[0][0])
    print(max_height_image.shape[0])
    # NOTE(review): indeces_stop rows are relative to the central strip, which
    # starts vertical_margin (10) rows below the top of the image - confirm
    # whether that offset should be added back here.
    y_start = get_above_box(max_height_image, 0, indeces_stop[0][0],
                            max_height_image.shape[1])
    image_height, image_width = max_height_image.shape[:2]
    if debug:
        bgr_image = _as_bgr(max_height_image)
        cv2.rectangle(bgr_image, (0, y_start), (image_width, image_height),
                      (0, 255, 0), thickness=5)
        print("still in the middle start")
        # st.image(Image.fromarray(bgr_image))
        print("still in the middle end")
        left_over_content = max_height_image[y_start:image_height, 0:image_width]
        print("left over start")
        # st.image(Image.fromarray(left_over_content))
        print("left over end")

    # Blank everything from the obstruction downwards and retry as two columns.
    max_height_image_copy = max_height_image.copy()
    cv2.rectangle(max_height_image_copy, (0, y_start), (image_width, image_height),
                  (255, 255, 255), cv2.FILLED)
    if debug:
        print("no left over start")
        # st.image(Image.fromarray(max_height_image_copy))
        print("no left over end")
    return extract_two_columns_text(image_index, max_height_image_copy, debug)