Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pdf2image | |
import utils | |
import numpy as np | |
import cv2 | |
import time | |
# Prerequisites:
# - Poppler for Windows: https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
# - On Ubuntu, the poppler-utils package (installed version: 22.02.0-2ubuntu0.4).
# - Tesseract OCR: install from https://github.com/UB-Mannheim/tesseract/wiki
# Page extraction is disabled.
# Page title, written as raw HTML so it can be centred and sized inline.
# The font size previously read '30x', which is not a valid CSS length and
# is ignored by browsers; '30px' is the evident intent.
big_text = """
<div style='text-align: center;'>
<h1 style='font-size: 30px;'>Locked PDF Ingestion</h1>
</div>
"""
# Display the styled text (unsafe_allow_html is required for the inline styles).
st.markdown(big_text, unsafe_allow_html=True)
# Upper bound on how many pages are rasterised and OCR'd. This replaces the
# former hard-coded `page_count=5` override and the matching `[0:5]` slice.
MAX_PAGES = 5

# One-time initialisation per Streamlit session: rasterise the PDF pages,
# build grayscale copies, and run OCR on them. Results are kept in
# st.session_state so script reruns do not repeat the conversion.
# NOTE(review): the original indentation was lost in this copy of the file;
# everything below is assumed to belong to this guard -- confirm.
if 'is_initialized' not in st.session_state:
    pdf_path = 'uploaded_pdf/data_sheet.pdf'

    page_count = utils.get_pdf_page_count(pdf_path)
    print("page_count=", page_count)
    # Cap instead of overwrite, so a PDF with fewer than MAX_PAGES pages does
    # not request pages that do not exist (which made `pages[0]` fail).
    # Assumes get_pdf_page_count returns an int -- TODO confirm.
    page_count = min(page_count, MAX_PAGES)
    print("new page_count=", page_count)

    read_pdf_progress_bar = st.progress(0)
    st.session_state.color_image_list = []
    st.session_state.gray_image_np_list = []

    # Convert one page per call so the progress bar can advance per page.
    for page_number in range(page_count):
        pages = pdf2image.convert_from_path(
            pdf_path,
            first_page=page_number + 1,
            last_page=page_number + 1,
        )
        st.session_state.color_image_list.append(pages[0])
        # (done / total) reaches 1.0 on the last page and cannot divide by
        # zero for a single-page document, unlike index / (page_count - 1).
        read_pdf_progress_bar.progress((page_number + 1) / page_count)

    # Second pass: grayscale conversion, reusing the same progress bar.
    read_pdf_progress_bar.progress(0)
    total_images = len(st.session_state.color_image_list)
    for index, image in enumerate(st.session_state.color_image_list):
        # pdf2image yields PIL images, whose arrays are in RGB channel order,
        # so the RGB (not BGR) conversion code gives the correct luminance.
        gray_image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
        st.session_state.gray_image_np_list.append(gray_image_np)
        read_pdf_progress_bar.progress((index + 1) / total_images)

    # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
    # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
    # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
    # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)

    st.session_state.img_index = 0
    st.session_state.stop_button_clicked = False
    # Mark the session initialised only once the state above is fully built,
    # so a failed conversion is retried on the next rerun instead of leaving
    # half-populated lists behind.
    st.session_state['is_initialized'] = True

    # st.image(st.session_state.gray_image_np_list[38])

    # Run OCR on the prepared pages; `text` holds the last page's result.
    for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[:MAX_PAGES], start=0):
        print("index="+str(index))
        text = utils.gray_pdf_image_np_to_text(index, gray_pdf_image_np, debug=True)
#if 'img_index' not in st.session_state: | |
# if st.button("Stop"): | |
# st.session_state.stop_button_clicked = True | |
# st.write(str(st.session_state.img_index+1) +"/" + str(len(st.session_state.color_image_list))) | |
# st.image(st.session_state.gray_image_np_list[st.session_state.img_index], use_column_width=True) | |
# if not st.session_state.stop_button_clicked: | |
# if st.session_state.img_index < len(st.session_state.color_image_list) - 1: | |
# st.session_state.img_index += 1 | |
# time.sleep(3) | |
# st.rerun() | |
# col1, col2 = st.columns(2) | |
# with col1: | |
# if st.button("Previous"): | |
# print("Previous pressed") | |
# # Decrease index, wrap around if it goes below 0 | |
# print("st.session_state.img_index =", str(st.session_state.img_index)) | |
# if st.session_state.img_index > 0: | |
# print("case 1 before st.session_state.img_index =",str(st.session_state.img_index)) | |
# st.session_state.img_index -= 1 | |
# print("case 2 after st.session_state.img_index =", str(st.session_state.img_index)) | |
# else: | |
# print("case 2 st.session_state.img_index =", str(st.session_state.img_index)) | |
# st.session_state.img_index = len(st.session_state.color_image_list) - 1 | |
# with col2: | |
# if st.button("Next"): | |
# | |
# print("Next pressed") | |
# # Increase index, wrap around if it goes past the last image | |
# if st.session_state.img_index < len(st.session_state.color_image_list) - 1: | |
# st.session_state.img_index += 1 | |
# | |
# else: | |
# st.session_state.img_index = 0 | |
# # | |
# total_pages = 100 | |
# print(f"total_pages = {total_pages}") | |
# st.write(f"total_pages = {total_pages}") | |
# for page_number in range(total_pages): | |
# pdf_image_list = convert_from_path(pdf_path) | |
# images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1) | |
# progress = (page_number + 1) / total_pages * 100 | |
# print(f"Progress: {progress:.2f}%") | |
# print("done") |