zmbfeng committed on
Commit
d792040
·
1 Parent(s): f9cf3d0

display page text one by one

Browse files
Files changed (1) hide show
  1. app.py +5 -3
app.py CHANGED
@@ -114,7 +114,7 @@ if 'page_count' in st.session_state:
114
  st.session_state.gray_image_np_list = []
115
  pdf_figures_image_list=[]
116
  pdf_tables_image_list=[]
117
- pdf_text_list=[]
118
 
119
  for page_number in range(st.session_state.num_pages_to_extract):
120
  image = pdf2image.convert_from_path(st.session_state.uploaded_pdf_path, first_page=page_number+1, last_page=page_number+1)
@@ -128,13 +128,14 @@ if 'page_count' in st.session_state:
128
  progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
129
  read_pdf_progress_bar.progress(progress_percentage)
130
  st.session_state.extracted_text = ""
 
131
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
132
  print("index="+str(index))
133
 
134
  figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
135
  pdf_figures_image_list.append(figures_image_list)
136
  pdf_tables_image_list.append(tables_image_list)
137
- pdf_text_list.append(text)
138
  st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
139
  # st.write(text)
140
  # print(text)
@@ -150,7 +151,8 @@ if 'page_count' in st.session_state:
150
  data=string_buffer.getvalue(),
151
  file_name=txt_file_path,
152
  mime="text/plain")
153
- st.write(st.session_state.extracted_text)
 
154
 
155
  # for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
156
  # print("index="+str(index))
 
114
  st.session_state.gray_image_np_list = []
115
  pdf_figures_image_list=[]
116
  pdf_tables_image_list=[]
117
+ st.session_state.pdf_text_list=[]
118
 
119
  for page_number in range(st.session_state.num_pages_to_extract):
120
  image = pdf2image.convert_from_path(st.session_state.uploaded_pdf_path, first_page=page_number+1, last_page=page_number+1)
 
128
  progress_percentage = (index) / (st.session_state.num_pages_to_extract - 1)
129
  read_pdf_progress_bar.progress(progress_percentage)
130
  st.session_state.extracted_text = ""
131
+
132
  for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list):
133
  print("index="+str(index))
134
 
135
  figures_image_list,tables_image_list,text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
136
  pdf_figures_image_list.append(figures_image_list)
137
  pdf_tables_image_list.append(tables_image_list)
138
+ st.session_state.pdf_text_list.append(text)
139
  st.session_state.extracted_text=st.session_state.extracted_text+f"<Page {index+1} start>\n" + text + f"\n<Page {index+1} end>\n>"
140
  # st.write(text)
141
  # print(text)
 
151
  data=string_buffer.getvalue(),
152
  file_name=txt_file_path,
153
  mime="text/plain")
154
+ for index,pdf_text in enumerate(st.session_state.pdf_text_list):
155
+ st.write(f"Page {index+1} \n\n {pdf_text}\n")
156
 
157
  # for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
158
  # print("index="+str(index))