zmbfeng committed on
Commit
15cd602
1 Parent(s): d6482c4

Figures, tables, and other text boxes: the text above each box is recognized (OCR), and the text below each box is included in that box's cropped image

Browse files
Files changed (3) hide show
  1. app.py +1 -0
  2. packages.txt +2 -1
  3. utils.py +147 -1
app.py CHANGED
@@ -7,6 +7,7 @@ import time
7
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
8
  # poppler-utils:
9
  # Installed: 22.02.0-2ubuntu0.4
 
10
  #page extraction disabled
11
  big_text = """
12
  <div style='text-align: center;'>
 
7
  # get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
8
  # poppler-utils:
9
  # Installed: 22.02.0-2ubuntu0.4
10
+ # install https://github.com/UB-Mannheim/tesseract/wiki
11
  #page extraction disabled
12
  big_text = """
13
  <div style='text-align: center;'>
packages.txt CHANGED
@@ -1 +1,2 @@
1
- poppler-utils
 
 
1
+ poppler-utils
2
+ tesseract-ocr
utils.py CHANGED
@@ -3,6 +3,7 @@ import streamlit as st
3
  import cv2
4
  import numpy as np
5
  from PIL import Image
 
6
  def get_pdf_page_count(pdf_path):
7
  try:
8
  # Running pdfinfo command to get information about the PDF
@@ -82,6 +83,131 @@ def draw_colored_boxes_on_image_np(image, boxes_list,color_tuple):
82
  for x, y, w, h in boxes_list:
83
  #x, y, w, h = box[0]
84
  cv2.rectangle(image, (x, y), (x + w, y + h), color_tuple, thickness=5)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
86
  bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
87
  bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
@@ -90,4 +216,24 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
90
  bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
91
  color_tuple = (0, 255, 0)
92
  draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, color_tuple)
93
- st.image(Image.fromarray(bgr_image))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import cv2
4
  import numpy as np
5
  from PIL import Image
6
+ import pytesseract
7
  def get_pdf_page_count(pdf_path):
8
  try:
9
  # Running pdfinfo command to get information about the PDF
 
83
  for x, y, w, h in boxes_list:
84
  #x, y, w, h = box[0]
85
  cv2.rectangle(image, (x, y), (x + w, y + h), color_tuple, thickness=5)
86
+
87
def is_filled_rectangle(image, rect, background_threshold=10, variance_threshold=0.1):
    """Return True when every interior pixel of ``rect`` equals 0.

    Args:
        image: NumPy image array indexed as ``image[row, column]``.
        rect: ``(x, y, w, h)`` bounding box.
        background_threshold: Unused; kept so existing callers keep working.
        variance_threshold: Unused; kept so existing callers keep working.

    Returns:
        A builtin ``bool``. The one-pixel border of the rectangle is
        excluded from the check. A rectangle whose interior is empty
        (``w <= 2`` or ``h <= 2``) is reported as filled, because
        ``np.all`` over an empty array is vacuously true.
    """
    x, y, w, h = rect
    # Skip the outline itself: only the pixels strictly inside the box count.
    interior = image[y + 1:y + h - 1, x + 1:x + w - 1]
    # np.all returns numpy.bool_; convert so callers get a plain bool.
    return bool(np.all(interior == 0))
93
def get_below_box(image_np, x, y, width, step=15):
    """Find where the content directly below row ``y`` ends.

    Starting at row ``y``, the columns ``[x, x + width)`` are examined in
    horizontal strips of ``step`` rows, moving downward, until a strip is
    found whose pixels all equal 255.

    Args:
        image_np: NumPy image array indexed as ``image_np[row, column]``.
        x: Left column of the scanned band.
        y: Row at which scanning starts.
        width: Width of the scanned band in columns.
        step: Height of each examined strip in rows.

    Returns:
        ``-1`` when not even one full strip fits between ``y`` and the
        bottom of the image. Otherwise the top row of the first all-255
        strip, or, if no such strip exists, the row just past the last
        full strip that was examined.
    """
    height = image_np.shape[0]

    # A strip [y, y + step) is valid as long as y + step <= height; the
    # strict comparison used previously skipped the last strip that fits
    # exactly against the bottom edge.
    if y + step > height:
        return -1

    index_y = y
    while index_y + step <= height:
        strip = image_np[index_y:index_y + step, x:x + width]
        if np.all(strip == 255):
            break
        index_y += step
    return index_y
114
def get_above_box(image_np, x, y, width, step=15):
    """Find where the content directly above row ``y`` starts.

    Starting at row ``y``, the columns ``[x, x + width)`` are examined in
    horizontal strips of ``step`` rows, moving upward, until a strip is
    found whose pixels all equal 255.

    Args:
        image_np: NumPy image array indexed as ``image_np[row, column]``.
        x: Left column of the scanned band.
        y: Row at which scanning starts (exclusive bottom of the first strip).
        width: Width of the scanned band in columns.
        step: Height of each examined strip in rows.

    Returns:
        ``-1`` when not even one full strip fits between the top of the
        image and ``y``. Otherwise the bottom row (exclusive) of the first
        all-255 strip, or, if no such strip exists, the top row of the last
        full strip that was examined. A return value equal to ``y`` means
        the strip immediately above ``y`` is already all 255.
    """
    # A strip [y - step, y) is valid as long as y - step >= 0; the strict
    # comparison used previously skipped the strip touching row 0.
    if y - step < 0:
        return -1

    index_y = y
    while index_y - step >= 0:
        strip = image_np[index_y - step:index_y, x:x + width]
        if np.all(strip == 255):
            break
        index_y -= step
    return index_y
136
def is_note_rectangle(image_np, rect):
    """Return True when the OCR text inside ``rect`` starts with "note".

    The interior of the rectangle (its one-pixel border excluded) is passed
    to Tesseract and the recognized text is compared case-insensitively.

    Args:
        image_np: NumPy image array indexed as ``image_np[row, column]``;
            either a 2-D single-channel image or a multi-channel BGR image.
        rect: ``(x, y, w, h)`` bounding box.

    Returns:
        ``True`` if the stripped, lower-cased OCR text starts with
        ``"note"``; ``False`` otherwise, including when the rectangle has
        no interior pixels to recognize.
    """
    x, y, w, h = rect
    roi = image_np[y + 1:y + h - 1, x + 1:x + w - 1]

    if roi.size == 0:
        # Nothing to recognize; avoid handing an empty image to OpenCV/PIL.
        is_note = False
    else:
        if roi.ndim == 2:
            # Single-channel input: BGR->RGB conversion needs 3 or 4
            # channels, so build the PIL image directly.
            roi_converted = Image.fromarray(roi)
        else:
            roi_converted = Image.fromarray(cv2.cvtColor(roi, cv2.COLOR_BGR2RGB))
        text = pytesseract.image_to_string(roi_converted).strip()
        is_note = text.lower().startswith("note")

    print("is note text box=" + str(is_note))
    return is_note
145
def _ocr_region_text(region):
    """Run Tesseract on an image region and return the stripped text."""
    if region.size == 0:
        return ""
    if region.ndim == 2:
        # Single-channel input: BGR->RGB conversion needs 3 or 4 channels,
        # so build the PIL image directly.
        pil_image = Image.fromarray(region)
    else:
        pil_image = Image.fromarray(cv2.cvtColor(region, cv2.COLOR_BGR2RGB))
    return pytesseract.image_to_string(pil_image).strip()


def extract_bounding_boxes_from_image_np(image_np, bounding_boxes_list, above_check_offset, above_caption_offset, color_tuple):
    """Crop each bounding box (plus adjoining text) and classify it by caption.

    For every box that ``is_filled_rectangle`` does not report as filled:

    * the crop is extended downward to the row returned by
      ``get_below_box``, unless that returns -1 or ``is_note_rectangle``
      reports the box as a note, in which case only the box is cropped;
    * the strip above the box (as located by ``get_above_box``) is OCR'd and
      the box is filed as a figure, a table, or a plain text box depending
      on whether that text starts with "figure" or "table"
      (case-insensitive);
    * the cropped area and the caption strip are painted over with
      ``color_tuple`` on a copy of the page.

    Args:
        image_np: Page image as a NumPy array; crops are slices of this array.
        bounding_boxes_list: Iterable of ``(x, y, w, h)`` boxes.
        above_check_offset: Unused; kept so existing callers keep working.
        above_caption_offset: Unused; kept so existing callers keep working.
        color_tuple: Color passed to ``cv2.rectangle`` when painting over
            processed regions on the copy.

    Returns:
        A 5-tuple ``(rect_content_list, above_rect_content_list,
        figures_image_list, tables_image_list, image_np_copy)``:

        * ``rect_content_list``: crops of boxes that are neither figures
          nor tables.
        * ``above_rect_content_list``: parallel to ``rect_content_list``;
          ``None`` when no caption strip was found, otherwise
          ``(caption_text, crop)``.
        * ``figures_image_list`` / ``tables_image_list``:
          ``(caption_text, crop)`` pairs.
        * ``image_np_copy``: copy of the page with processed regions
          painted over.
    """
    image_np_copy = image_np.copy()
    rect_content_list = []
    above_rect_content_list = []
    figures_image_list = []
    tables_image_list = []

    for box in bounding_boxes_list:
        x, y, w, h = box

        # NOTE(review): is_filled_rectangle tests for all-zero pixels while
        # regions here are painted with the caller-supplied color_tuple;
        # confirm the two are meant to agree.
        if is_filled_rectangle(image_np_copy, box):
            continue

        # Decide how far down the crop reaches: just the box, or the box
        # plus the text block directly beneath it.
        y_index = get_below_box(image_np, x, y + h, w)
        if y_index == -1 or is_note_rectangle(image_np_copy, box):
            bottom = y + h
        else:
            bottom = y_index
        rect_content = image_np[y:bottom, x:x + w]
        cv2.rectangle(image_np_copy, (x, y), (x + w, bottom), color_tuple, cv2.FILLED)

        above_box_y = get_above_box(image_np, x, y, w)
        if above_box_y == -1 or above_box_y == y:
            # No caption strip above this box.
            above_rect_content_list.append(None)
            rect_content_list.append(rect_content)
            continue

        text = _ocr_region_text(image_np[above_box_y:y, x:x + w])
        lowered = text.lower()
        if lowered.startswith("figure"):
            print(text)
            figures_image_list.append((text, rect_content))
        elif lowered.startswith("table"):
            print(text)
            tables_image_list.append((text, rect_content))
        else:
            above_rect_content_list.append((text, rect_content))
            rect_content_list.append(rect_content)

        # Paint over the caption strip as well so it is not picked up again.
        cv2.rectangle(image_np_copy, (x, above_box_y), (x + w, y), color_tuple, cv2.FILLED)

    return rect_content_list, above_rect_content_list, figures_image_list, tables_image_list, image_np_copy
211
  def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
212
  bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
213
  bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
 
216
  bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
217
  color_tuple = (0, 255, 0)
218
  draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, color_tuple)
219
+ # st.image(Image.fromarray(bgr_image)) #to_be_displayed
220
+
221
+ text_box_list, above_test_box_list,figures_image_list,tables_image_list, cropped_image = extract_bounding_boxes_from_image_np(gray_pdf_image_np,
222
+ bounding_boxes_list, 30,
223
+ 50, (255, 255, 255))
224
+ if debug:
225
+ debug_text_box_index = 0
226
+ for text_box, above_text_box in zip(text_box_list, above_test_box_list):
227
+ print("text box start")
228
+ if above_text_box is not None:
229
+ st.write(above_text_box[0])
230
+ st.image(Image.fromarray(above_text_box[1]))
231
+ # st.write(text)
232
+ st.image(Image.fromarray(text_box))
233
+ debug_text_box_index = debug_text_box_index + 1
234
+ for figure in figures_image_list:
235
+ st.write(figure[0])
236
+ st.image(Image.fromarray(figure[1]))
237
+ for table in tables_image_list:
238
+ st.write(table[0])
239
+ st.image(Image.fromarray(table[1]))