Spaces:
Sleeping
Sleeping
upload to display text working
Browse files
app.py
CHANGED
@@ -3,12 +3,33 @@ import pdf2image
|
|
3 |
import utils
|
4 |
import numpy as np
|
5 |
import cv2
|
|
|
|
|
|
|
6 |
import time
|
7 |
# get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
|
8 |
# poppler-utils:
|
9 |
# Installed: 22.02.0-2ubuntu0.4
|
10 |
# install https://github.com/UB-Mannheim/tesseract/wiki
|
11 |
#page extraction disabled
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
big_text = """
|
13 |
<div style='text-align: center;'>
|
14 |
<h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
|
@@ -18,41 +39,124 @@ big_text = """
|
|
18 |
st.markdown(big_text, unsafe_allow_html=True)
|
19 |
|
20 |
|
|
|
21 |
if 'is_initialized' not in st.session_state:
|
22 |
pdf_path = 'uploaded_pdf/data_sheet.pdf'
|
23 |
st.session_state['is_initialized'] = True
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
read_pdf_progress_bar = st.progress(0)
|
29 |
-
st.session_state.
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
#if 'img_index' not in st.session_state:
|
57 |
|
58 |
# if st.button("Stop"):
|
|
|
3 |
import utils
|
4 |
import numpy as np
|
5 |
import cv2
|
6 |
+
import os
|
7 |
+
import shutil
|
8 |
+
|
9 |
import time
|
10 |
# get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
|
11 |
# poppler-utils:
|
12 |
# Installed: 22.02.0-2ubuntu0.4
|
13 |
# install https://github.com/UB-Mannheim/tesseract/wiki
|
14 |
#page extraction disabled
|
15 |
+
|
16 |
+
def is_new_pdf_upload(uploaded_file):
    """Report whether *uploaded_file* differs from the last recorded upload.

    An upload is identified by its ``name`` and ``size`` attributes. The
    identity of the most recent upload is kept in
    ``st.session_state.last_pdf_uploaded_file`` as a dict with the keys
    ``'name'`` and ``'size'``.

    Args:
        uploaded_file: object exposing ``name`` and ``size`` attributes.

    Returns:
        True when nothing has been recorded yet, or when the name or the
        size differs from the recorded upload (the record is then replaced
        by the current file); False when both match.
    """
    current = {'name': uploaded_file.name, 'size': uploaded_file.size}

    # First upload seen in this session: remember it and treat it as new.
    if 'last_pdf_uploaded_file' not in st.session_state:
        st.session_state.last_pdf_uploaded_file = current
        return True

    previous = st.session_state.last_pdf_uploaded_file
    unchanged = (current['name'] == previous['name']
                 and current['size'] == previous['size'])
    if unchanged:
        # Same file re-submitted: leave the stored record untouched.
        return False

    # A different file: replace the stored record.
    st.session_state.last_pdf_uploaded_file = current
    return True
|
31 |
+
# Store current file details in session state
|
32 |
+
|
33 |
big_text = """
|
34 |
<div style='text-align: center;'>
|
35 |
<h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
|
|
|
39 |
st.markdown(big_text, unsafe_allow_html=True)
|
40 |
|
41 |
|
42 |
+
|
43 |
if 'is_initialized' not in st.session_state:
|
44 |
pdf_path = 'uploaded_pdf/data_sheet.pdf'
|
45 |
st.session_state['is_initialized'] = True
|
46 |
+
# page_count = utils.get_pdf_page_count(pdf_path)
|
47 |
+
# print("page_count=",page_count)
|
48 |
+
# page_count=5
|
49 |
+
# print("new page_count=",page_count)
|
50 |
+
# read_pdf_progress_bar = st.progress(0)
|
51 |
+
# st.session_state.color_image_list = []
|
52 |
+
# st.session_state.gray_image_np_list = []
|
53 |
+
# for page_number in range(page_count):
|
54 |
+
# image = pdf2image.convert_from_path(pdf_path, first_page=page_number+1, last_page=page_number+1)
|
55 |
+
# st.session_state.color_image_list.append(image[0])
|
56 |
+
# progress_percentage = (page_number) / (page_count-1)
|
57 |
+
# read_pdf_progress_bar.progress(progress_percentage)
|
58 |
+
# gray_pdf_image_np_list = []
|
59 |
+
# read_pdf_progress_bar.progress(0)
|
60 |
+
# for index, image in enumerate(st.session_state.color_image_list):
|
61 |
+
# image_np = np.array(image)
|
62 |
+
# st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
|
63 |
+
# progress_percentage = (index) / (page_count - 1)
|
64 |
+
# read_pdf_progress_bar.progress(progress_percentage)
|
65 |
+
# # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
|
66 |
+
# # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
|
67 |
+
# # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
|
68 |
+
# # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
|
69 |
+
# st.session_state.img_index = 0
|
70 |
+
# st.session_state.stop_button_clicked=False
|
71 |
+
# # st.image(st.session_state.gray_image_np_list[38])
|
72 |
+
|
73 |
+
# Upload a PDF, let the user choose how many leading pages to process, then
# rasterise those pages, convert them to grayscale and run the text
# extraction helper from utils on each page. Results are kept in
# st.session_state so they survive Streamlit reruns.
# NOTE(review): the nesting of the statements below was reconstructed from a
# listing without indentation - confirm it against the checked-in app.py.
uploaded_locked_pdf_file = st.file_uploader("Upload a locked pdf",
                                            type=['pdf'])
st.markdown(
    '<a href="https://ikmtechnology.github.io/ikmtechnology/data_sheet.pdf" target="_blank">Sample 1 download and then upload to above</a>',
    unsafe_allow_html=True)

if uploaded_locked_pdf_file is not None:
    if is_new_pdf_upload(uploaded_locked_pdf_file):
        save_path = './uploaded_videos'
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_path, exist_ok=True)
        # The file name comes from the client; keep only its final component
        # so that it cannot point outside save_path.
        saved_name = os.path.basename(uploaded_locked_pdf_file.name)
        saved_pdf_path = os.path.join(save_path, saved_name)
        with open(saved_pdf_path, "wb") as f:
            f.write(uploaded_locked_pdf_file.getbuffer())
        # Report the name the file was actually written under.
        st.success(f'Saved file {saved_name} in {save_path}')
        st.session_state.uploaded_pdf_path = saved_pdf_path
        st.session_state.page_count = utils.get_pdf_page_count(saved_pdf_path)
        print("page_count=", st.session_state.page_count)

        # Text extracted from a previously uploaded document is stale now.
        if 'extracted_text' in st.session_state:
            del st.session_state.extracted_text
        st.rerun()

if 'page_count' in st.session_state:
    page_count = st.session_state.page_count
    st.write(f"total page count = {page_count}")
    if page_count > 1:
        # Clamp the default so it never exceeds max_value: the initial 5, or
        # a value remembered from a longer document, may be larger than the
        # current page count.
        default_pages = st.session_state.get('num_pages_to_extract', 5)
        st.session_state.num_pages_to_extract = st.slider(
            'Number of pages to extract:',
            min_value=1,
            max_value=page_count,
            value=min(default_pages, page_count),
            key='num_pages_to_extract_slider')
    else:
        # Nothing to choose from when there is at most one page.
        st.session_state.num_pages_to_extract = page_count

    st.write(f"num of pages to extract {st.session_state.num_pages_to_extract}")
    # Mirror the button state stored under its widget key; this flag is what
    # disables the button below.
    st.session_state.running = bool(st.session_state.get('run_button'))
    read_pdf_progress_bar = st.progress(0)
    if st.button('Extract Pages', disabled=st.session_state.running, key='run_button'):
        pages_to_extract = st.session_state.num_pages_to_extract
        st.session_state.color_image_list = []
        st.session_state.gray_image_np_list = []

        # Pass 1: rasterise one page per call so progress can be reported.
        # Progress is (pages done) / (total), which is well defined even when
        # a single page is requested (index / (total - 1) divided by zero).
        for page_number in range(pages_to_extract):
            image = pdf2image.convert_from_path(
                st.session_state.uploaded_pdf_path,
                first_page=page_number + 1,
                last_page=page_number + 1)
            st.session_state.color_image_list.append(image[0])
            read_pdf_progress_bar.progress((page_number + 1) / pages_to_extract)

        # Pass 2: grayscale conversion.
        # NOTE(review): the conversion code assumes BGR channel order; confirm
        # the channel order of the images returned by pdf2image before
        # changing it, since downstream thresholds may depend on the result.
        read_pdf_progress_bar.progress(0)
        for index, image in enumerate(st.session_state.color_image_list):
            st.session_state.gray_image_np_list.append(
                cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY))
            read_pdf_progress_bar.progress((index + 1) / pages_to_extract)

        # Pass 3: text extraction, one delimited section per page.
        page_texts = []
        for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
            print("index=" + str(index))
            text = utils.gray_pdf_image_np_to_text(index, gray_pdf_image_np, debug=True)
            # No trailing '>' after the end marker: it used to start the next
            # page's line with '>', which Markdown renders as a blockquote.
            page_texts.append(
                f"<Page {index+1} start>\n{text}\n<Page {index+1} end>\n")
            read_pdf_progress_bar.progress((index + 1) / pages_to_extract)
        st.session_state.extracted_text = "".join(page_texts)
        st.rerun()

    if 'extracted_text' in st.session_state:
        st.write(st.session_state.extracted_text)
|
140 |
+
# for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
|
141 |
+
# print("index="+str(index))
|
142 |
+
#
|
143 |
+
# text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
144 |
+
# st.write(text)
|
145 |
+
|
146 |
+
|
147 |
+
|
148 |
+
|
149 |
+
|
150 |
+
|
151 |
+
|
152 |
+
|
153 |
+
|
154 |
+
|
155 |
+
|
156 |
+
|
157 |
+
|
158 |
+
|
159 |
+
|
160 |
#if 'img_index' not in st.session_state:
|
161 |
|
162 |
# if st.button("Stop"):
|
utils.py
CHANGED
@@ -287,11 +287,11 @@ def extract_two_columns_text(image_index,image_np,debug):
|
|
287 |
if debug:
|
288 |
print("left column image start")
|
289 |
# display(left_column_img)
|
290 |
-
st.image(Image.fromarray(left_column_array_bgr_image)) # to_be_displayed
|
291 |
print("left column image end")
|
292 |
print("right column image start")
|
293 |
# display(right_column_img)
|
294 |
-
st.image(Image.fromarray(right_column_array_bgr_image)) # to_be_displayed
|
295 |
print("right column image end")
|
296 |
left_text = pytesseract.image_to_string(left_column_img)
|
297 |
# with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_left_column_text.txt", 'w') as file:
|
@@ -347,7 +347,7 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
|
|
347 |
print(table[0])
|
348 |
# st.write(table[0])#to_be_displayed
|
349 |
# st.image(Image.fromarray(table[1]))#to_be_displayed
|
350 |
-
st.image(Image.fromarray(cropped_image))#to_be_displayed
|
351 |
found_hor_lines_list = find_hor_lines_in_image_np(1050, 5, cropped_image)
|
352 |
if found_hor_lines_list is not None:
|
353 |
bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
|
@@ -371,7 +371,7 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
|
|
371 |
max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
|
372 |
else:
|
373 |
max_height_image = cropped_image.copy()
|
374 |
-
st.write("selected segment")
|
375 |
# print("max height image start")
|
376 |
# st.image(Image.fromarray(max_height_image))#to_be_displayed
|
377 |
# print("max height image end")
|
@@ -403,18 +403,18 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
|
|
403 |
color_tuple=(0, 255, 0)
|
404 |
cv2.rectangle(bgr_image, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), color_tuple, thickness=5)
|
405 |
print("still in the middle start")
|
406 |
-
st.image(Image.fromarray(bgr_image))
|
407 |
print("still in the middle end")
|
408 |
left_over_content =max_height_image[y_start:max_height_image.shape[0], 0:max_height_image.shape[1]]
|
409 |
if debug:
|
410 |
print("left over start")
|
411 |
-
st.image(Image.fromarray(left_over_content))
|
412 |
print("left over end")
|
413 |
max_height_image_copy=max_height_image.copy()
|
414 |
cv2.rectangle(max_height_image_copy, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), (255, 255, 255), cv2.FILLED)
|
415 |
if debug:
|
416 |
print("no left over start")
|
417 |
-
st.image(Image.fromarray(max_height_image_copy))
|
418 |
print("no left over end")
|
419 |
text=extract_two_columns_text(max_height_image_copy,debug)
|
420 |
if text == "error":
|
|
|
287 |
if debug:
|
288 |
print("left column image start")
|
289 |
# display(left_column_img)
|
290 |
+
# st.image(Image.fromarray(left_column_array_bgr_image)) # to_be_displayed
|
291 |
print("left column image end")
|
292 |
print("right column image start")
|
293 |
# display(right_column_img)
|
294 |
+
# st.image(Image.fromarray(right_column_array_bgr_image)) # to_be_displayed
|
295 |
print("right column image end")
|
296 |
left_text = pytesseract.image_to_string(left_column_img)
|
297 |
# with open("/content/gdrive/MyDrive/Avatar/demo_pdf_ingestion_steps/page_"+formatted_index_string + "step9_left_column_text.txt", 'w') as file:
|
|
|
347 |
print(table[0])
|
348 |
# st.write(table[0])#to_be_displayed
|
349 |
# st.image(Image.fromarray(table[1]))#to_be_displayed
|
350 |
+
# st.image(Image.fromarray(cropped_image))#to_be_displayed
|
351 |
found_hor_lines_list = find_hor_lines_in_image_np(1050, 5, cropped_image)
|
352 |
if found_hor_lines_list is not None:
|
353 |
bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
|
|
|
371 |
max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
|
372 |
else:
|
373 |
max_height_image = cropped_image.copy()
|
374 |
+
# st.write("selected segment")
|
375 |
# print("max height image start")
|
376 |
# st.image(Image.fromarray(max_height_image))#to_be_displayed
|
377 |
# print("max height image end")
|
|
|
403 |
color_tuple=(0, 255, 0)
|
404 |
cv2.rectangle(bgr_image, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), color_tuple, thickness=5)
|
405 |
print("still in the middle start")
|
406 |
+
# st.image(Image.fromarray(bgr_image))
|
407 |
print("still in the middle end")
|
408 |
left_over_content =max_height_image[y_start:max_height_image.shape[0], 0:max_height_image.shape[1]]
|
409 |
if debug:
|
410 |
print("left over start")
|
411 |
+
# st.image(Image.fromarray(left_over_content))
|
412 |
print("left over end")
|
413 |
max_height_image_copy=max_height_image.copy()
|
414 |
cv2.rectangle(max_height_image_copy, (0, y_start), (max_height_image.shape[1], max_height_image.shape[0]), (255, 255, 255), cv2.FILLED)
|
415 |
if debug:
|
416 |
print("no left over start")
|
417 |
+
# st.image(Image.fromarray(max_height_image_copy))
|
418 |
print("no left over end")
|
419 |
text=extract_two_columns_text(max_height_image_copy,debug)
|
420 |
if text == "error":
|