Spaces:
Sleeping
Sleeping
display page text one by one
Browse files
app.py
CHANGED
@@ -114,7 +114,7 @@ if 'page_count' in st.session_state:
|
|
114 |
st.session_state.gray_image_np_list = []
|
115 |
pdf_figures_image_list=[]
|
116 |
pdf_tables_image_list=[]
|
117 |
-
pdf_text_list=[]
|
118 |
|
119 |
for page_number in range(st.session_state.num_pages_to_extract):
|
120 |
image = pdf2image.convert_from_path(st.session_state.uploaded_pdf_path, first_page=page_number+1, last_page=page_number+1)
|
@@ -128,13 +128,14 @@ if 'page_count' in st.session_state:
|
|
128 |
progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
|
129 |
read_pdf_progress_bar.progress(progress_percentage)
|
130 |
st.session_state.extracted_text = ""
|
|
|
131 |
for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
|
132 |
print("index="+str(index))
|
133 |
|
134 |
figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
135 |
pdf_figures_image_list.append(figures_image_list)
|
136 |
pdf_tables_image_list.append(tables_image_list)
|
137 |
-
pdf_text_list.append(text)
|
138 |
st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
|
139 |
# st.write(text)
|
140 |
# print(text)
|
@@ -150,7 +151,8 @@ if 'page_count' in st.session_state:
|
|
150 |
data=string_buffer.getvalue(),
|
151 |
file_name=txt_file_path,
|
152 |
mime="text/plain")
|
153 |
-
|
|
|
154 |
|
155 |
# for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
|
156 |
# print("index="+str(index))
|
|
|
114 |
st.session_state.gray_image_np_list = []
|
115 |
pdf_figures_image_list=[]
|
116 |
pdf_tables_image_list=[]
|
117 |
+
st.session_state.pdf_text_list=[]
|
118 |
|
119 |
for page_number in range(st.session_state.num_pages_to_extract):
|
120 |
image = pdf2image.convert_from_path(st.session_state.uploaded_pdf_path, first_page=page_number+1, last_page=page_number+1)
|
|
|
128 |
progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
|
129 |
read_pdf_progress_bar.progress(progress_percentage)
|
130 |
st.session_state.extracted_text = ""
|
131 |
+
|
132 |
for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
|
133 |
print("index="+str(index))
|
134 |
|
135 |
figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
|
136 |
pdf_figures_image_list.append(figures_image_list)
|
137 |
pdf_tables_image_list.append(tables_image_list)
|
138 |
+
st.session_state.pdf_text_list.append(text)
|
139 |
st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
|
140 |
# st.write(text)
|
141 |
# print(text)
|
|
|
151 |
data=string_buffer.getvalue(),
|
152 |
file_name=txt_file_path,
|
153 |
mime="text/plain")
|
154 |
+
for index,pdf_text in enumerate(st.session_state.pdf_text_list):
|
155 |
+
st.write(f"Page {index+1} \n\n {pdf_text}\n")
|
156 |
|
157 |
# for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
|
158 |
# print("index="+str(index))
|