Spaces:
Sleeping
Sleeping
Temporarily save grayscale page images and draw lines to complete table borders
Browse files
app.py
CHANGED
@@ -161,7 +161,7 @@ if 'page_count' in st.session_state:
|
|
161 |
st.session_state.num_pages_to_extract = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=st.session_state.num_pages_to_extract , key='num_pages_to_extract_slider')
|
162 |
|
163 |
if 'num_pages_to_extract2'not in st.session_state:
|
164 |
-
st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=
|
165 |
else:
|
166 |
st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=st.session_state.num_pages_to_extract2 , key='num_pages_to_extract_slider2')
|
167 |
|
@@ -191,8 +191,23 @@ if 'page_count' in st.session_state:
|
|
191 |
read_pdf_progress_bar.progress(progress_percentage)
|
192 |
read_pdf_progress_bar.progress(0)
|
193 |
for index, image in enumerate(st.session_state.color_image_list):
|
|
|
194 |
image_np = np.array(image)
|
195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
progress_percentage = (index) / len(st.session_state.color_image_list)
|
197 |
read_pdf_progress_bar.progress(progress_percentage)
|
198 |
st.session_state.extracted_text = ""
|
@@ -201,7 +216,6 @@ if 'page_count' in st.session_state:
|
|
201 |
manage_temp_to_be_zipped_directory(temp_table_dir)
|
202 |
manage_temp_to_be_zipped_directory(temp_textbox_dir)
|
203 |
for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
|
204 |
-
|
205 |
try:
|
206 |
figures_image_list,tables_image_list,textbox_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
207 |
if textbox_image_list:
|
|
|
161 |
st.session_state.num_pages_to_extract = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=st.session_state.num_pages_to_extract , key='num_pages_to_extract_slider')
|
162 |
|
163 |
if 'num_pages_to_extract2'not in st.session_state:
|
164 |
+
st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=39, key='num_pages_to_extract_slider2')
|
165 |
else:
|
166 |
st.session_state.num_pages_to_extract2 = st.slider('Number of pages to extract:', min_value=1, max_value=st.session_state.page_count-1, value=st.session_state.num_pages_to_extract2 , key='num_pages_to_extract_slider2')
|
167 |
|
|
|
191 |
read_pdf_progress_bar.progress(progress_percentage)
|
192 |
read_pdf_progress_bar.progress(0)
|
193 |
for index, image in enumerate(st.session_state.color_image_list):
|
194 |
+
st.write("actual page = " + str(index + st.session_state.num_pages_to_extract))
|
195 |
image_np = np.array(image)
|
196 |
+
gray_image_np=cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY)
|
197 |
+
cv2.imwrite(f"gray_image_{index}.png", gray_image_np)
|
198 |
+
# st.image(Image.fromarray(gray_image_np))
|
199 |
+
if index + st.session_state.num_pages_to_extract == 34:
|
200 |
+
cv2.line(gray_image_np, (223, 414), (223, 1185), 0, 2)
|
201 |
+
cv2.line(gray_image_np, (1527, 414), (1527, 1185), 0, 2)
|
202 |
+
if index + st.session_state.num_pages_to_extract == 35:
|
203 |
+
cv2.line(gray_image_np, (176, 248), (176, 1760), 0, 2)
|
204 |
+
cv2.line(gray_image_np, (1551, 248), (1551, 1760), 0, 2)
|
205 |
+
st.image(Image.fromarray(gray_image_np))
|
206 |
+
st.session_state.gray_image_np_list.append(gray_image_np)
|
207 |
+
|
208 |
+
|
209 |
+
# cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
|
210 |
+
# cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
|
211 |
progress_percentage = (index) / len(st.session_state.color_image_list)
|
212 |
read_pdf_progress_bar.progress(progress_percentage)
|
213 |
st.session_state.extracted_text = ""
|
|
|
216 |
manage_temp_to_be_zipped_directory(temp_table_dir)
|
217 |
manage_temp_to_be_zipped_directory(temp_textbox_dir)
|
218 |
for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
|
|
|
219 |
try:
|
220 |
figures_image_list,tables_image_list,textbox_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
221 |
if textbox_image_list:
|
utils.py
CHANGED
@@ -324,6 +324,8 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
|
|
324 |
if debug:
|
325 |
bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
|
326 |
color_tuple = (0, 255, 0)
|
|
|
|
|
327 |
draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, color_tuple)
|
328 |
# st.image(Image.fromarray(bgr_image)) #to_be_displayed
|
329 |
|
|
|
324 |
if debug:
|
325 |
bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
|
326 |
color_tuple = (0, 255, 0)
|
327 |
+
# print("bounding_boxes_list")
|
328 |
+
# print(bounding_boxes_list)
|
329 |
draw_colored_boxes_on_image_np(bgr_image, bounding_boxes_list, color_tuple)
|
330 |
# st.image(Image.fromarray(bgr_image)) #to_be_displayed
|
331 |
|