# Streamlit page: rasterise the first pages of a PDF, convert them to
# grayscale and show the text that utils.gray_pdf_image_np_to_text returns
# for each page.
#
# External tools referenced by the original setup notes:
#   get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
#   poppler-utils:
#     Installed: 22.02.0-2ubuntu0.4
#   install https://github.com/UB-Mannheim/tesseract/wiki
#
# page extraction disabled
import time

import cv2
import numpy as np
import pdf2image
import streamlit as st

import utils

# PDF that is ingested when the session starts.
PDF_PATH = 'uploaded_pdf/data_sheet.pdf'
# Upper bound on the number of pages that are rasterised and OCR'd.
MAX_PAGES = 5

big_text = """

Locked PDF Ingestion

"""

# Display the styled text
st.markdown(big_text, unsafe_allow_html=True)

# One-time (per session) load: rasterise the pages and cache both the colour
# and the grayscale versions in st.session_state so reruns can reuse them.
if 'is_initialized' not in st.session_state:
    page_count = utils.get_pdf_page_count(PDF_PATH)
    print("page_count=", page_count)
    # Clamp instead of overwriting: a PDF shorter than MAX_PAGES must not make
    # the loop below request pages that do not exist.
    page_count = min(page_count, MAX_PAGES)
    print("new page_count=", page_count)

    read_pdf_progress_bar = st.progress(0)

    # Phase 1: render each page separately so progress can be reported.
    color_images = []
    for page_number in range(1, page_count + 1):
        pages = pdf2image.convert_from_path(
            PDF_PATH, first_page=page_number, last_page=page_number
        )
        color_images.append(pages[0])
        # (done / total) reaches 1.0 on the last page and cannot divide by
        # zero for a single-page document.
        read_pdf_progress_bar.progress(page_number / page_count)

    # Phase 2: grayscale conversion, reusing the same progress bar.
    read_pdf_progress_bar.progress(0)
    gray_images = []
    for done, image in enumerate(color_images, start=1):
        # pdf2image yields PIL images, whose arrays are in RGB channel order,
        # so the RGB (not BGR) conversion code is the correct one.
        gray_images.append(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY))
        read_pdf_progress_bar.progress(done / len(color_images))

    # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
    # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
    # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
    # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)

    st.session_state.color_image_list = color_images
    st.session_state.gray_image_np_list = gray_images
    st.session_state.img_index = 0
    st.session_state.stop_button_clicked = False
    # Set the flag last so that a failed load is retried on the next run
    # instead of leaving the session half-initialised.
    st.session_state['is_initialized'] = True

# st.image(st.session_state.gray_image_np_list[38])

# Convert each cached grayscale page to text and render the result.
for index, gray_pdf_image_np in enumerate(
    st.session_state.gray_image_np_list[:MAX_PAGES]
):
    print("index=" + str(index))
    text = utils.gray_pdf_image_np_to_text(index, gray_pdf_image_np, debug=True)
    st.write(text)

# --- disabled: auto-advancing viewer --------------------------------------
#if 'img_index' not in st.session_state:
# if st.button("Stop"):
#     st.session_state.stop_button_clicked = True
# st.write(str(st.session_state.img_index+1) +"/" + str(len(st.session_state.color_image_list)))
#

# Show the currently selected grayscale page (skipped when nothing was loaded).
if st.session_state.gray_image_np_list:
    st.image(
        st.session_state.gray_image_np_list[st.session_state.img_index],
        use_column_width=True,
    )

# if not st.session_state.stop_button_clicked:
#     if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
#         st.session_state.img_index += 1
#         time.sleep(3)
#         st.rerun()

# --- disabled: manual Previous / Next navigation ---------------------------
# col1, col2 = st.columns(2)
# with col1:
#     if st.button("Previous"):
#         print("Previous pressed")
#         # Decrease index, wrap around if it goes below 0
#         print("st.session_state.img_index =", str(st.session_state.img_index))
#         if st.session_state.img_index > 0:
#             print("case 1 before st.session_state.img_index =",str(st.session_state.img_index))
#             st.session_state.img_index -= 1
#             print("case 2 after st.session_state.img_index =", str(st.session_state.img_index))
#         else:
#             print("case 2 st.session_state.img_index =", str(st.session_state.img_index))
#             st.session_state.img_index = len(st.session_state.color_image_list) - 1
# with col2:
#     if st.button("Next"):
#         # print("Next pressed")
#         # Increase index, wrap around if it goes past the last image
#         if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
#             st.session_state.img_index += 1
#         # else:
#         #     st.session_state.img_index = 0

# --- disabled: page-by-page conversion experiment --------------------------
# # total_pages = 100
# print(f"total_pages = {total_pages}")
# st.write(f"total_pages = {total_pages}")
# for page_number in range(total_pages):
#     pdf_image_list = convert_from_path(pdf_path)
#     images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1)
#     progress = (page_number + 1) / total_pages * 100
#     print(f"Progress: {progress:.2f}%")
# print("done")