Spaces:
Sleeping
Sleeping
import subprocess | |
import streamlit as st | |
import cv2 | |
import numpy as np | |
from PIL import Image | |
import pytesseract | |
def get_pdf_page_count(pdf_path):
    """Return the number of pages in a PDF as reported by ``pdfinfo``.

    Runs the external ``pdfinfo`` command on *pdf_path* and parses the
    ``Pages:`` line of its standard output.

    Args:
        pdf_path: Path of the PDF file handed to ``pdfinfo``.

    Returns:
        The page count as an ``int``, or ``None`` when the command cannot be
        run, its output has no ``Pages:`` line, or the count is not numeric.
    """
    try:
        result = subprocess.run(
            ['pdfinfo', pdf_path], stdout=subprocess.PIPE, text=True
        )
        for line in result.stdout.split('\n'):
            # Anchor on the start of the line: a substring test would also
            # match metadata values (e.g. a title) that merely contain
            # "Pages:" and appear earlier in the output.
            if line.startswith('Pages:'):
                return int(line.split(':', 1)[1].strip())
    except (OSError, ValueError, TypeError, subprocess.SubprocessError) as e:
        # Best effort: a missing pdfinfo binary, an unusable path or a
        # malformed count all degrade to "unknown page count".
        print(f"An error occurred: {e}")
    return None
# Extract rectangles whose size meets a configurable minimum.
def extract_rectangle_from_image(gray, min_width, min_height):
    """Find bounding boxes of four-cornered contours in an image.

    The image is run through Canny edge detection, the edges are dilated
    with a 3x3 kernel, and every contour whose polygon approximation has
    exactly four vertices is kept if its bounding box is large enough.

    Args:
        gray: Image passed to ``cv2.Canny`` (named for a grayscale page).
        min_width: Minimum bounding-box width to keep.
        min_height: Minimum bounding-box height to keep.

    Returns:
        A list of ``(x, y, w, h)`` tuples, in the order the contours were
        reported by OpenCV.
    """
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    kernel = np.ones((3, 3), np.uint8)
    dilated_edges = cv2.dilate(edges, kernel, iterations=1)

    # findContours returns (contours, hierarchy) on OpenCV 4.x but
    # (image, contours, hierarchy) on 3.x; index -2 is the contour list
    # in both layouts.
    contours = cv2.findContours(
        dilated_edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
    )[-2]

    bounding_boxes = []
    for cnt in contours:
        # Tolerance of 1% of the perimeter for the polygon approximation.
        approx = cv2.approxPolyDP(cnt, 0.01 * cv2.arcLength(cnt, True), True)
        if len(approx) != 4:  # Rectangle check
            continue
        x, y, w, h = cv2.boundingRect(approx)
        if w >= min_width and h >= min_height:
            bounding_boxes.append((x, y, w, h))
    return bounding_boxes
def is_close(box1, box2, threshold=10):
    """Tell whether two boxes start at nearly the same point.

    Only the first two items of each box (the top-left corner) are
    compared; widths and heights are ignored.

    Returns:
        True when the Euclidean distance between the two corners is
        strictly smaller than *threshold*.
    """
    dx = box1[0] - box2[0]
    dy = box1[1] - box2[1]
    corner_gap = (dx ** 2 + dy ** 2) ** 0.5
    return corner_gap < threshold
def remove_close_boxes(boxes, threshold=10):
    """Drop boxes that lie close to a box that was already kept.

    Boxes are visited in order; each one is compared (via ``is_close``)
    against the boxes kept so far, so the earliest box of a cluster is
    the one that is retained.

    Returns:
        A new list with the retained boxes in their original order.
    """
    survivors = []
    for candidate in boxes:
        if any(is_close(candidate, kept, threshold) for kept in survivors):
            continue
        survivors.append(candidate)
    return survivors
def is_contained(box1, box2):
    """
    Check if box1 is contained within box2.

    Each box is defined as (x, y, w, h). Touching edges count as
    contained, so a box is always contained within an identical box.
    """
    x1, y1, w1, h1 = box1
    x2, y2, w2, h2 = box2
    # Check if all corners of box1 are inside box2
    return x2 <= x1 and y2 <= y1 and x2 + w2 >= x1 + w1 and y2 + h2 >= y1 + h1


def remove_contained_boxes(boxes):
    """
    Remove boxes that are contained within other boxes.

    Identical boxes contain each other; in that case only the first
    occurrence is kept rather than discarding every copy.

    Returns a new list with the surviving boxes in their original order.
    """
    non_contained_boxes = []
    for i, inner in enumerate(boxes):
        swallowed = False
        for j, outer in enumerate(boxes):
            if i == j or not is_contained(inner, outer):
                continue
            # Mutual containment means the two boxes are identical: let the
            # earlier copy survive instead of dropping both.
            if i < j and is_contained(outer, inner):
                continue
            swallowed = True
            break
        if not swallowed:
            non_contained_boxes.append(inner)
    return non_contained_boxes
def draw_colored_boxes_on_image_np(image, boxes_list,color_tuple):
    """Outline every box on *image* in place.

    Each entry of *boxes_list* is unpacked as ``(x, y, w, h)`` and drawn
    with ``cv2.rectangle`` using *color_tuple* and a line thickness of 5.
    Nothing is returned; the image passed in is modified.
    """
    for left, top, box_w, box_h in boxes_list:
        top_left = (left, top)
        bottom_right = (left + box_w, top + box_h)
        cv2.rectangle(image, top_left, bottom_right, color_tuple, thickness=5)
def is_filled_rectangle(image, rect, background_threshold=10, variance_threshold=0.1):
    """Report whether the inside of *rect* consists only of zero pixels.

    The outermost one-pixel ring of the rectangle is excluded from the
    check. ``background_threshold`` and ``variance_threshold`` are accepted
    but not used by this implementation.

    Returns:
        The result of ``np.all`` over the interior; an empty interior
        therefore counts as filled.
    """
    left, top, box_w, box_h = rect
    row_span = slice(top + 1, top + box_h - 1)
    col_span = slice(left + 1, left + box_w - 1)
    interior = image[row_span, col_span]
    return np.all(interior == 0)
def get_below_box(image_np, x, y,width,step=15):
    """Scan downward from row *y* for the first all-white strip.

    Strips are *step* rows tall and span columns ``x`` to ``x + width``.
    A strip counts as white when every value in it equals 255.

    Returns:
        -1 when ``y + step`` already reaches the image height; otherwise the
        top row of the first white strip, or the row where the scan stopped
        because the next strip would reach past the image.
    """
    image_height = image_np.shape[0]
    if y + step >= image_height:
        return -1

    strip_top = y
    while strip_top + step < image_height:
        strip = image_np[strip_top:strip_top + step, x:x + width]
        if np.all(strip == 255):
            return strip_top
        strip_top += step
    return strip_top
def get_above_box(image_np, x, y,width,step=15):
    """Scan upward from row *y* for the first all-white strip.

    Strips are *step* rows tall, end at the current row, and span columns
    ``x`` to ``x + width``. A strip counts as white when every value in it
    equals 255.

    Returns:
        -1 when ``y - step`` is not above row 0; otherwise the bottom row of
        the first white strip (which equals *y* when the strip directly
        above is white), or the row where the scan stopped because the next
        strip would reach the top of the image.
    """
    if y - step <= 0:
        return -1

    strip_bottom = y
    while strip_bottom - step > 0:
        strip = image_np[strip_bottom - step:strip_bottom, x:x + width]
        if np.all(strip == 255):
            return strip_bottom
        strip_bottom -= step
    return strip_bottom
def is_note_rectangle(image_np, rect):
    """Report whether the text inside *rect* starts with "note".

    The interior of the rectangle (without its outermost one-pixel ring) is
    OCR'd with pytesseract and the stripped text is compared
    case-insensitively against the prefix "note".

    Args:
        image_np: Page image as a NumPy array, either single-channel (2-D)
            or multi-channel in BGR order.
        rect: ``(x, y, w, h)`` of the rectangle to inspect.

    Returns:
        True when the recognised text starts with "note"; False otherwise,
        including when the rectangle has no interior to OCR.
    """
    x, y, w, h = rect
    roi = image_np[y + 1:y + h - 1, x + 1:x + w - 1]
    if roi.size == 0:
        # Nothing to OCR; cv2/PIL would fail on an empty array.
        return False

    if roi.ndim == 2:
        # COLOR_BGR2RGB rejects single-channel input, and the page is
        # handled as a grayscale array elsewhere in this file, so hand a
        # 2-D array to PIL directly.
        roi_converted = Image.fromarray(roi)
    else:
        roi_converted = Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))

    text = pytesseract.image_to_string(roi_converted).strip()
    is_note = text.lower().startswith("note")
    print("is note text box=" + str(is_note))
    return is_note
def extract_bounding_boxes_from_image_np(image_np, bounding_boxes_list, above_check_offset, above_caption_offset, color_tuple):
    """Crop each box out of the page and classify it by the caption above it.

    For every box that ``is_filled_rectangle`` does not report as filled:

    1. The crop is extended downward to the row returned by
       ``get_below_box`` unless that row is -1 or the box is a "note" box
       (``is_note_rectangle``), in which case only the box itself is cropped.
    2. The cropped area is painted with *color_tuple* on a copy of the page.
    3. The strip between ``get_above_box`` and the box top, if any, is OCR'd.
       Text starting with "figure" / "table" (case-insensitive) routes the
       crop to the figure / table list; any other text routes it to the
       plain lists. The strip is then painted over on the copy as well.

    Args:
        image_np: Page image (single-channel or BGR); never modified.
        bounding_boxes_list: Iterable of ``(x, y, w, h)`` boxes.
        above_check_offset: Unused; kept for interface compatibility.
        above_caption_offset: Unused; kept for interface compatibility.
        color_tuple: Colour used to paint over extracted regions.

    Returns:
        ``(rect_content_list, above_rect_content_list, figures_image_list,
        tables_image_list, image_np_copy)``. The first two lists are
        parallel: entry *i* of the second is ``None`` or
        ``(caption_text, crop)`` for crop *i* of the first. The figure and
        table lists hold ``(caption_text, crop)`` tuples. Crops are slices of
        *image_np*; ``image_np_copy`` is the painted-over page.
    """
    image_np_copy = image_np.copy()
    rect_content_list = []
    above_rect_content_list = []
    figures_image_list = []
    tables_image_list = []

    for box in bounding_boxes_list:
        x, y, w, h = box
        if is_filled_rectangle(image_np_copy, box):
            continue

        # Decide how far down the crop reaches. The note check only runs
        # when a row below the box was found.
        y_index = get_below_box(image_np, x, y + h, w)
        if y_index == -1 or is_note_rectangle(image_np_copy, box):
            bottom = y + h
        else:
            bottom = y_index
        rect_content = image_np[y:bottom, x:x + w]
        # ``bottom`` is never above ``y + h``, so this single fill also
        # covers the box itself.
        cv2.rectangle(image_np_copy, (x, y), (x + w, bottom), color_tuple, cv2.FILLED)

        above_box_y = get_above_box(image_np, x, y, w)
        if above_box_y == -1 or above_box_y == y:
            # No caption strip above the box.
            above_rect_content_list.append(None)
            rect_content_list.append(rect_content)
            continue

        above_rect_content = image_np[above_box_y:y, x:x + w]
        if above_rect_content.ndim == 2:
            # COLOR_BGR2RGB rejects single-channel input; a 2-D array can
            # be handed to PIL as-is.
            above_converted = Image.fromarray(above_rect_content)
        else:
            above_converted = Image.fromarray(
                cv2.cvtColor(above_rect_content, cv2.COLOR_BGR2RGB)
            )
        text = pytesseract.image_to_string(above_converted).strip()
        caption = text.lower()

        if caption.startswith("figure"):
            print(text)
            figures_image_list.append((text, rect_content))
        elif caption.startswith("table"):
            print(text)
            tables_image_list.append((text, rect_content))
        else:
            # NOTE(review): the tuple carries the box crop, not the caption
            # strip image -- kept as in the original; confirm this is intended.
            above_rect_content_list.append((text, rect_content))
            rect_content_list.append(rect_content)

        # NOTE(review): applied to every captioned box; the original nesting
        # of this call was not recoverable with certainty -- confirm.
        cv2.rectangle(image_np_copy, (x, above_box_y), (x + w, y), color_tuple, cv2.FILLED)

    return rect_content_list, above_rect_content_list, figures_image_list, tables_image_list, image_np_copy
def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False): | |
bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20) | |
bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10) | |
bounding_boxes_list = remove_contained_boxes(bounding_boxes_list) | |
if debug: | |
bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR) | |
color_tuple = (0, 255, 0) | |
draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, color_tuple) | |
# st.image(Image.fromarray(bgr_image)) #to_be_displayed | |
text_box_list, above_test_box_list,figures_image_list,tables_image_list, cropped_image = extract_bounding_boxes_from_image_np(gray_pdf_image_np, | |
bounding_boxes_list, 30, | |
50, (255, 255, 255)) | |
if debug: | |
debug_text_box_index = 0 | |
for text_box, above_text_box in zip(text_box_list, above_test_box_list): | |
print("text box start") | |
if above_text_box is not None: | |
st.write(above_text_box[0]) | |
st.image(Image.fromarray(above_text_box[1])) | |
# st.write(text) | |
st.image(Image.fromarray(text_box)) | |
debug_text_box_index = debug_text_box_index + 1 | |
for figure in figures_image_list: | |
st.write(figure[0]) | |
st.image(Image.fromarray(figure[1])) | |
for table in tables_image_list: | |
st.write(table[0]) | |
st.image(Image.fromarray(table[1])) |