Spaces:
Sleeping
Sleeping
end to end working
Browse files
app.py
CHANGED
@@ -52,6 +52,7 @@ for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:
|
|
52 |
print("index="+str(index))
|
53 |
|
54 |
text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
|
|
55 |
#if 'img_index' not in st.session_state:
|
56 |
|
57 |
# if st.button("Stop"):
|
|
|
52 |
print("index="+str(index))
|
53 |
|
54 |
text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
55 |
+
st.write(text)
|
56 |
#if 'img_index' not in st.session_state:
|
57 |
|
58 |
# if st.button("Stop"):
|
utils.py
CHANGED
@@ -249,14 +249,72 @@ def draw_edges(np_image):
|
|
249 |
thickness = 5
|
250 |
|
251 |
# Get the dimensions of the image
|
252 |
-
|
|
|
|
|
|
|
253 |
|
254 |
# Coordinates for the rectangle: start from (0,0) to (width, height)
|
255 |
# We draw from 0+thickness//2 and width-thickness//2 to respect the thickness and not go out of bounds
|
256 |
cv2.rectangle(np_image, (thickness // 2, thickness // 2), (width - thickness // 2, height - thickness // 2), color,
|
257 |
thickness)
|
258 |
-
|
259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
260 |
def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
|
261 |
bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
|
262 |
bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
|
@@ -295,7 +353,7 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
|
|
295 |
bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
|
296 |
draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
|
297 |
print("detected Lines start")
|
298 |
-
st.image(Image.fromarray(bgr_image)) #to_be_displayed
|
299 |
|
300 |
print("detected lines end")
|
301 |
page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
|
@@ -305,16 +363,62 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
|
|
305 |
print("element start")
|
306 |
bgr_image = cv2.cvtColor(element, cv2.COLOR_GRAY2BGR)
|
307 |
draw_edges(bgr_image)
|
308 |
-
st.image(Image.fromarray(bgr_image))#to_be_displayed
|
309 |
|
310 |
debug_page_segment_index = debug_page_segment_index + 1
|
311 |
print("element end")
|
312 |
min_height_filtered_page_segment_np_list = filter_segments_by_min_height(page_segment_np_list, 50)
|
313 |
max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
|
314 |
-
if debug:
|
315 |
-
print("max height image start")
|
316 |
-
st.image(Image.fromarray(max_height_image))#to_be_displayed
|
317 |
-
print("max height image end")
|
318 |
else:
|
319 |
max_height_image = cropped_image.copy()
|
320 |
st.write("selected segment")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
thickness = 5
|
250 |
|
251 |
# Get the dimensions of the image
|
252 |
+
try:
|
253 |
+
height, width = np_image.shape[:2]
|
254 |
+
except Exception as e:
|
255 |
+
print("An error occurred:", e)
|
256 |
|
257 |
# Coordinates for the rectangle: start from (0,0) to (width, height)
|
258 |
# We draw from 0+thickness//2 and width-thickness//2 to respect the thickness and not go out of bounds
|
259 |
cv2.rectangle(np_image, (thickness // 2, thickness // 2), (width - thickness // 2, height - thickness // 2), color,
|
260 |
thickness)
|
261 |
+
def is_image_np_two_columns(image_np, horizontal_margin, vertical_margin):
    """Report whether the strip around the horizontal page centre is all 255.

    The inspected strip spans ``horizontal_margin`` columns on each side of
    the centre column and skips ``vertical_margin`` rows at the top and at
    the bottom of the image.

    Args:
        image_np: Array whose first two axes are (rows, columns).
            NOTE(review): callers appear to pass 8-bit grayscale pages
            where 255 means white - confirm.
        horizontal_margin: Half-width, in columns, of the inspected strip.
        vertical_margin: Number of rows ignored at the top and at the bottom.

    Returns:
        True when the strip contains at least one pixel and every value in
        it equals 255, otherwise False.
    """
    page_height, page_width = image_np.shape[:2]
    page_x_center = page_width // 2

    # Clamp the left edge: on an image narrower than 2 * horizontal_margin a
    # negative start index would wrap around and select the wrong columns.
    strip_left = max(page_x_center - horizontal_margin, 0)
    strip_right = page_x_center + horizontal_margin

    image_middle_np = image_np[
        vertical_margin:page_height - vertical_margin,
        strip_left:strip_right,
    ]

    # np.all() of an empty array is True; an image too small to contain any
    # strip pixels gives no evidence of a gutter, so report False instead.
    if image_middle_np.size == 0:
        return False
    return bool(np.all(image_middle_np == 255))
|
267 |
+
def _column_to_ocr_image(column_np):
    """Wrap one column array in a PIL image that can be handed to OCR."""
    if column_np.ndim == 2:
        # Single-channel data: COLOR_BGR2RGB rejects it, and PIL can wrap a
        # 2-D array directly.
        return Image.fromarray(column_np)
    # NOTE(review): multi-channel input is assumed to be in BGR order, as the
    # original conversion code implied - confirm against callers.
    return Image.fromarray(cv2.cvtColor(column_np, cv2.COLOR_BGR2RGB))


def _column_to_debug_preview(column_np):
    """Return a separate 3-channel copy of one column, passed through draw_edges."""
    if column_np.ndim == 2:
        preview = cv2.cvtColor(column_np, cv2.COLOR_GRAY2BGR)
    else:
        # COLOR_GRAY2BGR rejects multi-channel input; copy so that drawing
        # never touches the caller's image.
        preview = column_np.copy()
    draw_edges(preview)
    return preview


def extract_two_columns_text(image_index, image_np, debug):
    """OCR a two-column page image, left column first, then right column.

    The image is split at its horizontal centre when
    ``is_image_np_two_columns(image_np, 20, 10)`` accepts it; each half is
    passed to ``pytesseract.image_to_string`` and the two results are
    concatenated.

    Args:
        image_index: Index of the page being processed. Currently unused;
            kept so existing callers keep working.
        image_np: Page image array. Both single-channel (2-D) and
            multi-channel inputs are handled.
        debug: When truthy, show both column previews through ``st.image``
            and print start/end markers around each.

    Returns:
        The left-column text followed by the right-column text, or the
        sentinel string ``"error"`` when the image is not recognised as two
        columns.
    """
    if not is_image_np_two_columns(image_np, 20, 10):
        return "error"

    page_x_center = image_np.shape[1] // 2
    # Plain slices are enough: nothing below writes into them, so the two
    # full-image copies the earlier version made are not needed.
    left_column_array = image_np[:, :page_x_center]
    right_column_array = image_np[:, page_x_center:]

    left_column_img = _column_to_ocr_image(left_column_array)
    right_column_img = _column_to_ocr_image(right_column_array)

    if debug:
        # Previews are only ever displayed here, so build them only here.
        left_preview = _column_to_debug_preview(left_column_array)
        right_preview = _column_to_debug_preview(right_column_array)
        print("left column image start")
        st.image(Image.fromarray(left_preview))  # to_be_displayed
        print("left column image end")
        print("right column image start")
        st.image(Image.fromarray(right_preview))  # to_be_displayed
        print("right column image end")

    left_text = pytesseract.image_to_string(left_column_img)
    print("Extracted Text:\n", left_text)
    right_text = pytesseract.image_to_string(right_column_img)
    print("Extracted Text:\n", right_text)
    return left_text + right_text
|
307 |
+
def get_where_image_np_two_columns_stops(image_np, horizontal_margin, vertical_margin):
    """Locate the non-255 pixels in the strip around the horizontal page centre.

    The strip is the same region ``is_image_np_two_columns`` inspects:
    ``horizontal_margin`` columns on each side of the centre column, without
    the top and bottom ``vertical_margin`` rows.

    Args:
        image_np: Array whose first two axes are (rows, columns).
        horizontal_margin: Half-width, in columns, of the inspected strip.
        vertical_margin: Number of rows ignored at the top and at the bottom.

    Returns:
        The tuple of index arrays produced by ``np.where`` for every strip
        element that differs from 255. Indices are relative to the strip,
        not to ``image_np``: strip row ``r`` is image row
        ``r + vertical_margin``, and strip column ``c`` is image column
        ``c + max(centre - horizontal_margin, 0)``. The arrays are empty
        when the strip is entirely 255.
    """
    page_height, page_width = image_np.shape[:2]
    page_x_center = page_width // 2

    # Clamp the left edge: on an image narrower than 2 * horizontal_margin a
    # negative start index would wrap around and select the wrong columns.
    strip_left = max(page_x_center - horizontal_margin, 0)
    strip_right = page_x_center + horizontal_margin

    image_middle_np = image_np[
        vertical_margin:page_height - vertical_margin,
        strip_left:strip_right,
    ]
    return np.where(image_middle_np != 255)
|
313 |
+
|
314 |
+
# indices = np.where(image_middle_np != 255)
|
315 |
+
# print(len(indices[0]))
|
316 |
+
# for i in range(len(indices[0])):
|
317 |
+
# print(f"Index: {indices[0][i], indices[1][i]}, Value: {image_middle_np[indices[0][i], indices[1][i]]}")
|
318 |
def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
|
319 |
bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
|
320 |
bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
|
|
|
353 |
bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
|
354 |
draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
|
355 |
print("detected Lines start")
|
356 |
+
# st.image(Image.fromarray(bgr_image)) #to_be_displayed
|
357 |
|
358 |
print("detected lines end")
|
359 |
page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
|
|
|
363 |
print("element start")
|
364 |
bgr_image = cv2.cvtColor(element, cv2.COLOR_GRAY2BGR)
|
365 |
draw_edges(bgr_image)
|
366 |
+
# st.image(Image.fromarray(bgr_image))#to_be_displayed
|
367 |
|
368 |
debug_page_segment_index = debug_page_segment_index + 1
|
369 |
print("element end")
|
370 |
min_height_filtered_page_segment_np_list = filter_segments_by_min_height(page_segment_np_list, 50)
|
371 |
max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
|
|
|
|
|
|
|
|
|
372 |
else:
|
373 |
max_height_image = cropped_image.copy()
|
374 |
st.write("selected segment")
|
375 |
+
# print("max height image start")
|
376 |
+
# st.image(Image.fromarray(max_height_image))#to_be_displayed
|
377 |
+
# print("max height image end")
|
378 |
+
text=extract_two_columns_text(image_index,max_height_image,debug)
|
379 |
+
print(text)
|
380 |
+
if text == "error":
|
381 |
+
print("not two columns")
|
382 |
+
max_height_image_converted = Image.fromarray(cv2.cvtColor(max_height_image, cv2.COLOR_BGR2RGB))
|
383 |
+
text = pytesseract.image_to_string(max_height_image_converted)
|
384 |
+
text = text.strip()
|
385 |
+
toc_str="table of contents"
|
386 |
+
# print("Extracted Text:\n", text)
|
387 |
+
if text.lower().startswith(toc_str.lower()):
|
388 |
+
|
389 |
+
#if "Table of Contents" in text:
|
390 |
+
print("Table of Contents")
|
391 |
+
# display_image_np(max_height_image)
|
392 |
+
#print(text)
|
393 |
+
return("Table of Contents")
|
394 |
+
else:
|
395 |
+
print("not Table of Contents")
|
396 |
+
indeces_stop=get_where_image_np_two_columns_stops(max_height_image,20,10)
|
397 |
+
print(indeces_stop[0][0])
|
398 |
+
print(max_height_image.shape[0])
|
399 |
+
y_start=get_above_box(max_height_image, 0, indeces_stop[0][0],max_height_image.shape[1])
|
400 |
+
if debug:
|
401 |
+
bgr_image = cv2.cvtColor(max_height_image, cv2.COLOR_GRAY2BGR)
|
402 |
+
color_tuple=(0, 255, 0)
|
403 |
+
cv2.rectangle(bgr_image, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), color_tuple, thickness=5)
|
404 |
+
print("still in the middle start")
|
405 |
+
st.image(Image.fromarray(bgr_image))
|
406 |
+
print("still in the middle end")
|
407 |
+
left_over_content =max_height_image[y_start:max_height_image.shape[0], 0:max_height_image.shape[1]]
|
408 |
+
if debug:
|
409 |
+
print("left over start")
|
410 |
+
st.image(Image.fromarray(left_over_content))
|
411 |
+
print("left over end")
|
412 |
+
max_height_image_copy=max_height_image.copy()
|
413 |
+
cv2.rectangle(max_height_image_copy, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), (255, 255, 255), cv2.FILLED)
|
414 |
+
if debug:
|
415 |
+
print("no left over start")
|
416 |
+
st.image(Image.fromarray(max_height_image_copy))
|
417 |
+
print("no left over end")
|
418 |
+
text=extract_two_columns_text(max_height_image_copy,debug)
|
419 |
+
if text == "error":
|
420 |
+
return("error")
|
421 |
+
else:
|
422 |
+
return text
|
423 |
+
else:
|
424 |
+
return text
|