Spaces:

zmbfeng
/

locked_pdf_ingestion

Sleeping

App Files Files Community

locked_pdf_ingestion / utils.py

zmbfeng

adding more debug

3411406 9 months ago

raw

history blame

19.2 kB

	import subprocess
	import streamlit as st
	import cv2
	import numpy as np
	from PIL import Image
	import pytesseract
	def get_pdf_page_count(pdf_path):
	    """Return the page count that ``pdfinfo`` reports for *pdf_path*.

	    Runs the external ``pdfinfo`` command and reads the ``Pages`` field from
	    its standard output.

	    Args:
	        pdf_path: Path of the PDF file passed to ``pdfinfo``.

	    Returns:
	        The page count as an ``int``, or ``None`` when ``pdfinfo`` cannot be
	        run, its output has no ``Pages`` field, or the field is not an
	        integer. Failures are reported with ``print``.
	    """
	    try:
	        result = subprocess.run(
	            ['pdfinfo', pdf_path],
	            stdout=subprocess.PIPE,
	            text=True,
	        )
	    except (OSError, subprocess.SubprocessError, TypeError, ValueError) as e:
	        print(f"An error occurred: {e}")
	        return None

	    for line in result.stdout.splitlines():
	        # Match the field name exactly: a substring test would also hit lines
	        # such as "Title: Pages: draft" and then parse the wrong piece.
	        field, separator, value = line.partition(':')
	        if separator and field.strip() == 'Pages':
	            try:
	                return int(value.strip())
	            except ValueError as e:
	                print(f"An error occurred: {e}")
	                return None
	    return None
	# Extract rectangles with a configurable minimum size.

	def extract_rectangle_from_image(gray, min_width, min_height):
	    """Collect bounding boxes of large four-cornered contours in *gray*.

	    The image is run through Canny edge detection (thresholds 50/150), the
	    edges are dilated once with a 3x3 kernel, and every contour whose polygon
	    approximation has exactly four vertices is considered.

	    Args:
	        gray: Image handed to ``cv2.Canny``.
	        min_width: Smallest bounding-box width that is kept.
	        min_height: Smallest bounding-box height that is kept.

	    Returns:
	        A list of ``(x, y, w, h)`` tuples, in contour order.
	    """
	    edge_map = cv2.Canny(gray, 50, 150, apertureSize=3)
	    thickened = cv2.dilate(edge_map, np.ones((3, 3), np.uint8), iterations=1)
	    contours, _ = cv2.findContours(thickened, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

	    def _large_quad_bounds(contour):
	        # Approximation tolerance is 1% of the closed contour's perimeter.
	        perimeter = cv2.arcLength(contour, True)
	        polygon = cv2.approxPolyDP(contour, 0.01 * perimeter, True)
	        if len(polygon) != 4:
	            return None
	        x, y, w, h = cv2.boundingRect(polygon)
	        if w < min_width or h < min_height:
	            return None
	        return (x, y, w, h)

	    candidates = (_large_quad_bounds(contour) for contour in contours)
	    return [bounds for bounds in candidates if bounds is not None]
	def is_close(box1, box2, threshold=10):
	    """Tell whether the top-left corners of two boxes are near each other.

	    Args:
	        box1: Box whose first two items are the x and y of its top-left corner.
	        box2: Second box, same layout.
	        threshold: Distance limit; the boxes are close when the Euclidean
	            distance between the corners is strictly below it.

	    Returns:
	        ``True`` when the corner distance is smaller than *threshold*.
	    """
	    dx = box1[0] - box2[0]
	    dy = box1[1] - box2[1]
	    # Euclidean distance between the two top-left corners.
	    distance = (dx ** 2 + dy ** 2) ** 0.5
	    return distance < threshold
	def remove_close_boxes(boxes, threshold=10):
	    """Drop boxes that lie close to a box that was already kept.

	    Boxes are visited in order; a box is kept only if ``is_close`` is false
	    against every box kept so far.

	    Args:
	        boxes: Iterable of boxes accepted by ``is_close``.
	        threshold: Distance limit forwarded to ``is_close``.

	    Returns:
	        A new list with the surviving boxes in their original order.
	    """
	    survivors = []
	    for candidate in boxes:
	        if not any(is_close(candidate, kept, threshold) for kept in survivors):
	            survivors.append(candidate)
	    return survivors
	def is_contained(box1, box2):
	    """Return whether *box1* lies entirely inside *box2* (edges may touch).

	    Both boxes are ``(x, y, w, h)`` sequences.
	    """
	    inner_left, inner_top, inner_w, inner_h = box1
	    outer_left, outer_top, outer_w, outer_h = box2

	    # The inner box must not start before the outer one on either axis ...
	    if inner_left < outer_left or inner_top < outer_top:
	        return False
	    # ... and must not extend past the outer box's far edges.
	    fits_horizontally = inner_left + inner_w <= outer_left + outer_w
	    fits_vertically = inner_top + inner_h <= outer_top + outer_h
	    return fits_horizontally and fits_vertically

	def remove_contained_boxes(boxes):
	    """Remove boxes that are contained within other boxes.

	    A box is dropped when some other box in *boxes* contains it according to
	    ``is_contained``. Boxes with identical geometry contain each other; in
	    that case only the first occurrence is kept, so duplicates collapse to
	    one box instead of eliminating each other.

	    Args:
	        boxes: Sequence of ``(x, y, w, h)`` boxes.

	    Returns:
	        A new list with the boxes that survive, in their original order.
	    """
	    non_contained_boxes = []

	    for i, box1 in enumerate(boxes):
	        swallowed = False
	        for j, box2 in enumerate(boxes):
	            if i == j or not is_contained(box1, box2):
	                continue
	            # Mutual containment means both boxes cover the same area; let
	            # the earlier one survive rather than discarding both.
	            if i < j and is_contained(box2, box1):
	                continue
	            swallowed = True
	            break
	        if not swallowed:
	            non_contained_boxes.append(box1)

	    return non_contained_boxes
	def draw_colored_boxes_on_image_np(image, boxes_list,color_tuple):
	    """Outline every ``(x, y, w, h)`` box on *image*, in place.

	    Args:
	        image: Image array passed to ``cv2.rectangle``; it is modified.
	        boxes_list: Iterable of ``(x, y, w, h)`` boxes.
	        color_tuple: Color passed to ``cv2.rectangle``.
	    """
	    outline_thickness = 5
	    for left, top, box_w, box_h in boxes_list:
	        top_left = (left, top)
	        bottom_right = (left + box_w, top + box_h)
	        cv2.rectangle(image, top_left, bottom_right, color_tuple, thickness=outline_thickness)

	def is_filled_rectangle(image, rect, background_threshold=10, variance_threshold=0.1):
	    """Check whether the interior of *rect* consists only of zero pixels.

	    The outermost one-pixel border of the rectangle is excluded from the
	    check.

	    Args:
	        image: Array indexed as ``image[rows, cols]``.
	        rect: ``(x, y, w, h)`` rectangle.
	        background_threshold: Accepted but not used by this implementation.
	        variance_threshold: Accepted but not used by this implementation.

	    Returns:
	        Truthy when every interior pixel equals 0.
	    """
	    left, top, box_w, box_h = rect
	    interior = image[top + 1:top + box_h - 1, left + 1:left + box_w - 1]
	    return (interior == 0).all()
	def get_below_box(image_np, x, y,width,step=15):
	    """Scan downward from *y* for the first all-white horizontal band.

	    Bands are *step* rows tall and span columns ``x`` to ``x + width``. A band
	    counts as white when every pixel in it equals 255.

	    Args:
	        image_np: Image array indexed as ``image_np[rows, cols]``.
	        x: Left column of the scanned strip.
	        y: Row at which scanning starts.
	        width: Width of the scanned strip.
	        step: Height of each band and distance between probes.

	    Returns:
	        ``-1`` when a full band does not fit below *y*; otherwise the top row
	        of the first white band, or the row where scanning ran out of image
	        if no white band was found.
	    """
	    image_height = image_np.shape[0]
	    if y + step >= image_height:
	        return -1

	    cursor = y
	    while cursor + step < image_height:
	        band = image_np[cursor:cursor + step, x:x + width]
	        if (band == 255).all():
	            return cursor
	        cursor += step
	    return cursor
	def get_above_box(image_np, x, y,width,step=15):
	    """Scan upward from *y* for the first all-white horizontal band.

	    Bands are *step* rows tall, end at the current row, and span columns
	    ``x`` to ``x + width``. A band counts as white when every pixel in it
	    equals 255.

	    Args:
	        image_np: Image array indexed as ``image_np[rows, cols]``.
	        x: Left column of the scanned strip.
	        y: Row at which scanning starts.
	        width: Width of the scanned strip.
	        step: Height of each band and distance between probes.

	    Returns:
	        ``-1`` when ``y - step`` is not above row 0; otherwise the bottom row
	        of the first white band (which is *y* itself when the band directly
	        above is white), or the row where scanning stopped near the top of
	        the image if no white band was found.
	    """
	    if y - step <= 0:
	        return -1

	    cursor = y
	    while cursor - step > 0:
	        band = image_np[cursor - step:cursor, x:x + width]
	        if (band == 255).all():
	            return cursor
	        cursor -= step
	    return cursor
	def is_note_rectangle(image_np, rect):
	    """Report whether the OCR text inside *rect* starts with "note".

	    The one-pixel border of the rectangle is excluded before the crop is
	    handed to Tesseract. The comparison is case-insensitive and ignores
	    surrounding whitespace.

	    Args:
	        image_np: Page image; either a 2-D grayscale array (what the pipeline
	            in this file passes) or a multi-channel BGR array.
	        rect: ``(x, y, w, h)`` rectangle to inspect.

	    Returns:
	        ``True`` when the recognized text begins with "note".
	    """
	    x, y, w, h = rect
	    roi = image_np[y+1:y+h-1, x+1:x+w-1]
	    # COLOR_BGR2RGB only accepts multi-channel input; a grayscale crop can be
	    # wrapped by PIL directly.
	    if roi.ndim == 2:
	        roi_converted = Image.fromarray(roi)
	    else:
	        roi_converted = Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))
	    text = pytesseract.image_to_string(roi_converted).strip()
	    starts_with_note = text.lower().startswith("note")
	    print("is note text box="+str(starts_with_note))
	    return starts_with_note
	def extract_bounding_boxes_from_image_np(image_np, bounding_boxes_list, above_check_offset, above_caption_offset, color_tuple):
	    """Crop boxed regions out of a page and classify them by their caption.

	    For every box whose interior is not reported as filled by
	    ``is_filled_rectangle``:

	    * the crop is extended downward to the first white band found by
	      ``get_below_box``, unless none is found or ``is_note_rectangle`` says
	      the box is a note;
	    * the area directly above the box (up to the white band found by
	      ``get_above_box``) is OCR'd and treated as the caption;
	    * the crop and the caption area are painted over with *color_tuple* in a
	      copy of the page.

	    NOTE(review): the nesting of this function was reconstructed from a
	    source whose indentation had been lost - confirm against the original.

	    Args:
	        image_np: Page image; a 2-D grayscale array (what the pipeline in this
	            file passes) or a multi-channel BGR array. It is not modified.
	        bounding_boxes_list: Iterable of ``(x, y, w, h)`` boxes.
	        above_check_offset: Accepted but not used by this implementation.
	        above_caption_offset: Accepted but not used by this implementation.
	        color_tuple: Fill color passed to ``cv2.rectangle``.

	    Returns:
	        A 5-tuple ``(rect_content_list, above_rect_content_list,
	        figures_image_list, tables_image_list, image_np_copy)``:

	        * ``rect_content_list`` - crops of boxes whose caption starts with
	          neither "figure" nor "table" (including boxes without a caption);
	        * ``above_rect_content_list`` - parallel to ``rect_content_list``;
	          ``None`` for boxes without a caption, else ``(text, crop)``;
	        * ``figures_image_list`` / ``tables_image_list`` - ``(text, crop)``
	          pairs for captions starting with "figure" / "table";
	        * ``image_np_copy`` - copy of the page with processed areas filled.
	    """
	    image_np_copy = image_np.copy()
	    rect_content_list = []
	    above_rect_content_list = []
	    figures_image_list = []
	    tables_image_list = []

	    for box in bounding_boxes_list:
	        x, y, w, h = box
	        if is_filled_rectangle(image_np_copy, box):
	            continue

	        # Decide how far down the region reaches: just the box, or the box
	        # plus whatever sits between it and the next white band.
	        y_index = get_below_box(image_np, x, y+h, w)
	        if y_index == -1 or is_note_rectangle(image_np_copy, box):
	            region_bottom = y + h
	        else:
	            region_bottom = y_index
	        rect_content = image_np[y:region_bottom, x:x+w]
	        # region_bottom is never above y + h, so this single fill also covers
	        # the box itself.
	        cv2.rectangle(image_np_copy, (x, y), (x+w, region_bottom), color_tuple, cv2.FILLED)

	        above_box_y = get_above_box(image_np, x, y, w)
	        if above_box_y == -1 or above_box_y == y:
	            # Nothing usable directly above the box: no caption.
	            above_rect_content_list.append(None)
	            rect_content_list.append(rect_content)
	            continue

	        above_rect_content = image_np[above_box_y:y, x:x+w]
	        # COLOR_BGR2RGB only accepts multi-channel input; a grayscale crop can
	        # be wrapped by PIL directly.
	        if above_rect_content.ndim == 2:
	            above_converted = Image.fromarray(above_rect_content)
	        else:
	            above_converted = Image.fromarray(cv2.cvtColor(above_rect_content, cv2.COLOR_BGR2RGB))
	        text = pytesseract.image_to_string(above_converted).strip()
	        caption = text.lower()
	        if caption.startswith("figure"):
	            print(text)
	            figures_image_list.append((text, rect_content))
	        elif caption.startswith("table"):
	            print(text)
	            tables_image_list.append((text, rect_content))
	        else:
	            above_rect_content_list.append((text, rect_content))
	            rect_content_list.append(rect_content)

	        # Blank out the caption area as well.
	        cv2.rectangle(image_np_copy, (x, above_box_y), (x+w, y), color_tuple, cv2.FILLED)

	    return rect_content_list,above_rect_content_list, figures_image_list, tables_image_list, image_np_copy
	def find_hor_lines_in_image_np(min_width, min_height,image_np):
	    """Detect line segments in *image_np* after a wide morphological close.

	    The image is closed with a 1050x5 rectangular kernel, run through Canny
	    (thresholds 50/150), and the edges are fed to ``cv2.HoughLinesP``.

	    Args:
	        min_width: Accepted but not used; the kernel size is fixed.
	        min_height: Accepted but not used; the kernel size is fixed.
	        image_np: Image to analyse.

	    Returns:
	        Whatever ``cv2.HoughLinesP`` returns; the caller in this file treats
	        ``None`` as "no lines found".
	    """
	    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1050, 5))
	    closed = cv2.morphologyEx(image_np, cv2.MORPH_CLOSE, closing_kernel)
	    edge_map = cv2.Canny(closed, 50, 150, apertureSize=3)
	    return cv2.HoughLinesP(
	        edge_map,
	        1,
	        np.pi / 180,
	        threshold=100,
	        minLineLength=100,
	        maxLineGap=10,
	    )
	def draw_colored_lines_on_image_np(image, lines,color_tuple):
	    """Draw every line segment in *lines* onto *image*, in place.

	    Args:
	        image: Image array passed to ``cv2.line``; it is modified.
	        lines: Iterable whose items hold ``(x1, y1, x2, y2)`` at index 0.
	        color_tuple: Color passed to ``cv2.line``.
	    """
	    stroke_width = 3
	    for segment in lines:
	        start_x, start_y, end_x, end_y = segment[0]
	        cv2.line(image, (start_x, start_y), (end_x, end_y), color_tuple, stroke_width)
	def segment_image_np(image_np,hor_lines_list):
	    """Cut *image_np* into horizontal strips at the given lines.

	    Lines are processed in ascending order of their first y coordinate. Each
	    strip runs from the end (``y2``) of the previous line to the start
	    (``y1``) of the current one; a final strip covers the rest of the image.

	    Args:
	        image_np: Image array indexed as ``image_np[rows, cols]``.
	        hor_lines_list: Iterable whose items hold ``(x1, y1, x2, y2)`` at
	            index 0.

	    Returns:
	        A list of row slices of *image_np*, top to bottom.
	    """
	    ordered_lines = sorted(hor_lines_list, key=lambda entry: entry[0][1])

	    strips = []
	    strip_top = 0
	    for entry in ordered_lines:
	        _, line_start_y, _, line_end_y = entry[0]
	        strips.append(image_np[strip_top:line_start_y, :])
	        # The next strip begins where this line ends.
	        strip_top = line_end_y

	    # Whatever remains below the last line.
	    strips.append(image_np[strip_top:, :])
	    return strips
	def filter_segments_by_min_height(segments, min_height):
	    """Keep the segments whose row count is strictly greater than *min_height*.

	    Args:
	        segments: Iterable of arrays exposing ``shape``.
	        min_height: Exclusive lower bound on ``shape[0]``.

	    Returns:
	        A new list with the qualifying segments in their original order.
	    """
	    def is_tall_enough(segment):
	        return segment.shape[0] > min_height

	    return list(filter(is_tall_enough, segments))

	def draw_edges(np_image):
	    """Draw a green frame along the border of *np_image*, in place.

	    The frame is 5 pixels thick and inset by half its thickness so that it
	    stays inside the image.

	    Args:
	        np_image: Image array with at least two dimensions; it is modified.

	    Returns:
	        ``None``. When the image dimensions cannot be read the problem is
	        printed and nothing is drawn.
	    """
	    color = (0, 255, 0)  # Green
	    thickness = 5

	    try:
	        height, width = np_image.shape[:2]
	    except (AttributeError, TypeError, ValueError) as e:
	        # Without dimensions there is nothing to draw; stop here instead of
	        # continuing with unbound height/width.
	        print("An error occurred:", e)
	        return

	    inset = thickness // 2
	    cv2.rectangle(np_image, (inset, inset), (width - inset, height - inset), color,
	                  thickness)
	def is_image_np_two_columns(image_np,horizontal_margin,vertical_margin):
	    """Check whether the vertical strip around the page center is all white.

	    The strip spans *horizontal_margin* columns on each side of the
	    horizontal center and skips *vertical_margin* rows at the top and bottom.

	    Args:
	        image_np: Image array indexed as ``image_np[rows, cols]``.
	        horizontal_margin: Half-width of the inspected strip.
	        vertical_margin: Rows ignored at the top and at the bottom.

	    Returns:
	        Truthy when every pixel in the strip equals 255.
	    """
	    page_height, page_width = image_np.shape[0], image_np.shape[1]
	    center_x = page_width // 2
	    gutter = image_np[
	        vertical_margin:page_height - vertical_margin,
	        center_x - horizontal_margin:center_x + horizontal_margin,
	    ]
	    return (gutter == 255).all()
	def extract_two_columns_text(image_index,image_np,debug):
	    """OCR a two-column page, left column first.

	    The page is split at its horizontal center when
	    ``is_image_np_two_columns(image_np, 20, 10)`` holds; each half is passed
	    to Tesseract and the two texts are concatenated.

	    Args:
	        image_index: Accepted but not used by this implementation.
	        image_np: Page image; a 2-D grayscale array or a multi-channel BGR
	            array.
	        debug: When truthy, both columns are shown via ``st.image`` with a
	            frame drawn by ``draw_edges``.

	    Returns:
	        The left text followed by the right text, or the string ``"error"``
	        when the page does not look like two columns.
	    """
	    if not is_image_np_two_columns(image_np, 20, 10):
	        return "error"

	    page_x_center = image_np.shape[1] // 2
	    columns = (
	        ("left", image_np[:, :page_x_center].copy()),
	        ("right", image_np[:, page_x_center:].copy()),
	    )

	    if debug:
	        for side, column_array in columns:
	            # COLOR_GRAY2BGR only accepts single-channel input; a color
	            # column is already drawable as-is.
	            if column_array.ndim == 2:
	                framed = cv2.cvtColor(column_array, cv2.COLOR_GRAY2BGR)
	            else:
	                framed = column_array.copy()
	            draw_edges(framed)
	            print(f"{side} column image start")
	            st.image(Image.fromarray(framed))  # to_be_displayed
	            print(f"{side} column image end")

	    column_texts = []
	    for _, column_array in columns:
	        # COLOR_BGR2RGB only accepts multi-channel input; a grayscale column
	        # can be wrapped by PIL directly.
	        if column_array.ndim == 2:
	            column_img = Image.fromarray(column_array)
	        else:
	            column_img = Image.fromarray(cv2.cvtColor(column_array, cv2.COLOR_BGR2RGB))
	        column_text = pytesseract.image_to_string(column_img)
	        print("Extracted Text:\n", column_text)
	        column_texts.append(column_text)

	    left_text, right_text = column_texts
	    return left_text + right_text
	def get_where_image_np_two_columns_stops(image_np,horizontal_margin,vertical_margin):
	    """Locate the non-white pixels in the vertical strip around the page center.

	    The strip spans *horizontal_margin* columns on each side of the
	    horizontal center and skips *vertical_margin* rows at the top and bottom.

	    Args:
	        image_np: Image array indexed as ``image_np[rows, cols]``.
	        horizontal_margin: Half-width of the inspected strip.
	        vertical_margin: Rows ignored at the top and at the bottom.

	    Returns:
	        A tuple of index arrays, one per dimension of the strip, for the
	        pixels that differ from 255. Indices are relative to the strip, not
	        to the full image.
	    """
	    page_height, page_width = image_np.shape[0], image_np.shape[1]
	    center_x = page_width // 2
	    gutter = image_np[
	        vertical_margin:page_height - vertical_margin,
	        center_x - horizontal_margin:center_x + horizontal_margin,
	    ]
	    return (gutter != 255).nonzero()

	# indices = np.where(image_middle_np != 255)
	# print(len(indices[0]))
	# for i in range(len(indices[0])):
	# print(f"Index: {indices[0][i], indices[1][i]}, Value: {image_middle_np[indices[0][i], indices[1][i]]}")
	def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
	    """Extract the body text of one grayscale PDF page image.

	    Steps, in order:

	    1. Find large rectangles, drop near-duplicates and nested ones, and let
	       ``extract_bounding_boxes_from_image_np`` blank them (with their
	       captions) out of a copy of the page.
	    2. Split the blanked page at detected horizontal lines and keep the
	       tallest strip that is more than 50 rows high (the whole blanked page
	       when no lines are found or no strip qualifies).
	    3. OCR that strip as two columns. If it is not two-column, OCR it as a
	       whole to recognise a table of contents; otherwise blank everything
	       from just above the first gutter interruption downward and retry the
	       two-column OCR.

	    NOTE(review): the nesting of this function was reconstructed from a
	    source whose indentation had been lost. In particular the
	    ``st.image(cropped_image)`` call is assumed to be debug-only and
	    ``st.write("selected segment")`` unconditional - confirm against the
	    original.

	    Args:
	        image_index: Page index, forwarded to ``extract_two_columns_text``.
	        gray_pdf_image_np: 2-D grayscale page image.
	        debug: When truthy, intermediate results are printed and shown with
	            Streamlit.

	    Returns:
	        The extracted two-column text, ``"Table of Contents"`` when the
	        single-column OCR text starts with "table of contents", or
	        ``"error"`` when the page is still not two-column after blanking.
	    """
	    bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
	    bounding_boxes_list = remove_close_boxes(bounding_boxes_list, 10)
	    bounding_boxes_list = remove_contained_boxes(bounding_boxes_list)
	    if debug:
	        bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
	        draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, (0, 255, 0))
	        # st.image(Image.fromarray(bgr_image)) #to_be_displayed

	    text_box_list, above_test_box_list, figures_image_list, tables_image_list, cropped_image = (
	        extract_bounding_boxes_from_image_np(
	            gray_pdf_image_np, bounding_boxes_list, 30, 50, (255, 255, 255)
	        )
	    )
	    if debug:
	        for _, above_text_box in zip(text_box_list, above_test_box_list):
	            print("text box start")
	            if above_text_box is not None:
	                print(above_text_box[0])  # to_be_displayed
	        for figure in figures_image_list:
	            print(figure[0])
	        for table in tables_image_list:
	            print(table[0])
	        st.image(Image.fromarray(cropped_image))  # to_be_displayed

	    found_hor_lines_list = find_hor_lines_in_image_np(1050, 5, cropped_image)
	    max_height_image = None
	    if found_hor_lines_list is not None:
	        bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
	        draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
	        print("detected Lines start")
	        # st.image(Image.fromarray(bgr_image)) #to_be_displayed
	        print("detected lines end")
	        page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
	        if debug:
	            for element in page_segment_np_list:
	                print("element start")
	                # Strips between overlapping lines can be empty; there is
	                # nothing to render for those.
	                if element.size:
	                    bgr_image = cv2.cvtColor(element, cv2.COLOR_GRAY2BGR)
	                    draw_edges(bgr_image)
	                    # st.image(Image.fromarray(bgr_image))#to_be_displayed
	                print("element end")
	        min_height_filtered_page_segment_np_list = filter_segments_by_min_height(page_segment_np_list, 50)
	        # max() raises on an empty list; fall back to the whole page below.
	        if min_height_filtered_page_segment_np_list:
	            max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
	    if max_height_image is None:
	        max_height_image = cropped_image.copy()

	    st.write("selected segment")
	    print("start text extraction")
	    text = extract_two_columns_text(image_index, max_height_image, debug)
	    print("gray_pdf_image_np_to_text extracted text", text)
	    if text != "error":
	        return text

	    print("not two columns")
	    # COLOR_BGR2RGB only accepts multi-channel input; the grayscale strip can
	    # be wrapped by PIL directly.
	    if max_height_image.ndim == 2:
	        max_height_image_converted = Image.fromarray(max_height_image)
	    else:
	        max_height_image_converted = Image.fromarray(cv2.cvtColor(max_height_image, cv2.COLOR_BGR2RGB))
	    text = pytesseract.image_to_string(max_height_image_converted).strip()
	    if text.lower().startswith("table of contents"):
	        print("Table of Contents")
	        return "Table of Contents"

	    print("not Table of Contents")
	    # The two-column check just failed with the same margins, so the gutter
	    # strip holds at least one non-white pixel and index [0][0] exists.
	    indeces_stop = get_where_image_np_two_columns_stops(max_height_image, 20, 10)
	    print(indeces_stop[0][0])
	    print(max_height_image.shape[0])
	    strip_height, strip_width = max_height_image.shape[0], max_height_image.shape[1]
	    y_start = get_above_box(max_height_image, 0, indeces_stop[0][0], strip_width)
	    # get_above_box reports -1 when the interruption is too close to the top;
	    # treat that as "from the first row" so slicing does not wrap around.
	    y_start = max(y_start, 0)
	    if debug:
	        bgr_image = cv2.cvtColor(max_height_image, cv2.COLOR_GRAY2BGR)
	        cv2.rectangle(bgr_image, (0, y_start), (strip_width, strip_height), (0, 255, 0), thickness=5)
	        print("still in the middle start")
	        st.image(Image.fromarray(bgr_image))
	        print("still in the middle end")
	        left_over_content = max_height_image[y_start:strip_height, 0:strip_width]
	        print("left over start")
	        st.image(Image.fromarray(left_over_content))
	        print("left over end")

	    # Blank everything from y_start downward and retry as two columns.
	    max_height_image_copy = max_height_image.copy()
	    cv2.rectangle(max_height_image_copy, (0, y_start), (strip_width, strip_height), (255, 255, 255), cv2.FILLED)
	    if debug:
	        print("no left over start")
	        st.image(Image.fromarray(max_height_image_copy))
	        print("no left over end")
	    # Yields "error" again when the blanked strip is still not two-column.
	    return extract_two_columns_text(image_index, max_height_image_copy, debug)