Spaces:
Sleeping
Sleeping
File size: 4,366 Bytes
5a99cc1 ad5ca2c 365522e 6a8bd53 15cd602 ad5ca2c 5a99cc1 6a8bd53 ad5ca2c d6482c4 ad5ca2c d6482c4 ad5ca2c 0196fd4 d6482c4 ad5ca2c d6482c4 229ebda ad5ca2c d6482c4 365522e ad5ca2c 6a8bd53 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import streamlit as st
import pdf2image
import utils
import numpy as np
import cv2
import time
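# External tools this script depends on:
#   Poppler (used by pdf2image)
#     Windows: https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
#     Ubuntu:  poppler-utils (installed: 22.02.0-2ubuntu0.4)
#   Tesseract
#     Install from https://github.com/UB-Mannheim/tesseract/wiki
# Note: page extraction is currently disabled.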
# Centered page title, rendered as raw HTML.
# The font size must carry a valid CSS unit ("px"); the previous value "30x"
# is not valid CSS, so the browser silently dropped the declaration.
big_text = """
<div style='text-align: center;'>
<h1 style='font-size: 30px;'>Locked PDF Ingestion</h1>
</div>
"""
# unsafe_allow_html is required for the inline styles to be honoured; the
# markup is a static literal, so no untrusted input reaches the page.
st.markdown(big_text, unsafe_allow_html=True)
if 'is_initialized' not in st.session_state:
    # One-time setup per Streamlit session: rasterize the leading pages of the
    # PDF and cache both the colour images and their grayscale arrays in
    # st.session_state so later reruns of the script can reuse them.
    pdf_path = 'uploaded_pdf/data_sheet.pdf'
    max_pages = 5  # upper bound on the number of pages ingested

    # NOTE(review): assumes utils.get_pdf_page_count returns an int - confirm.
    page_count = utils.get_pdf_page_count(pdf_path)
    print("page_count=", page_count)
    # Cap the count instead of overwriting it with a constant, so documents
    # shorter than max_pages do not request pages that do not exist.
    page_count = min(page_count, max_pages)
    print("new page_count=", page_count)

    read_pdf_progress_bar = st.progress(0)
    st.session_state.color_image_list = []
    st.session_state.gray_image_np_list = []

    # Pass 1: render one page per call (pdf2image page numbers are 1-based).
    for page_number in range(page_count):
        image = pdf2image.convert_from_path(
            pdf_path, first_page=page_number + 1, last_page=page_number + 1
        )
        st.session_state.color_image_list.append(image[0])
        # (done / total) reaches exactly 1.0 on the last page and cannot
        # divide by zero for a single-page document.
        read_pdf_progress_bar.progress((page_number + 1) / page_count)

    # Pass 2: grayscale conversion, reusing the same progress bar from zero.
    read_pdf_progress_bar.progress(0)
    color_image_total = len(st.session_state.color_image_list)
    for index, image in enumerate(st.session_state.color_image_list):
        # pdf2image yields PIL images, whose arrays are in RGB channel order,
        # so the RGB (not BGR) conversion code gives the correct luminance.
        gray_image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
        st.session_state.gray_image_np_list.append(gray_image_np)
        read_pdf_progress_bar.progress((index + 1) / color_image_total)

    # Disabled: draws vertical lines on pages 37 and 38, which are beyond the
    # current max_pages cap.
    # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
    # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
    # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
    # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
    st.session_state.img_index = 0
    st.session_state.stop_button_clicked = False
    # st.image(st.session_state.gray_image_np_list[38])

    # Mark the session as initialized only after everything above succeeded,
    # so a failed load is retried on the next rerun instead of leaving the
    # session with missing or partial image lists.
    st.session_state['is_initialized'] = True
# Pass each of the first five cached grayscale pages (fewer if fewer were
# loaded) to utils.gray_pdf_image_np_to_text and write the returned value to
# the page.
# NOTE(review): the original nesting of this loop is not recoverable from the
# source; it is written at top level here - confirm.
leading_gray_pages = st.session_state.gray_image_np_list[:5]
for page_idx, gray_page_np in enumerate(leading_gray_pages):
    print(f"index={page_idx}")
    page_text = utils.gray_pdf_image_np_to_text(page_idx, gray_page_np, debug=True)
    st.write(page_text)
#if 'img_index' not in st.session_state:
# if st.button("Stop"):
# st.session_state.stop_button_clicked = True
# st.write(str(st.session_state.img_index+1) +"/" + str(len(st.session_state.color_image_list)))
# st.image(st.session_state.gray_image_np_list[st.session_state.img_index], use_column_width=True)
# if not st.session_state.stop_button_clicked:
# if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
# st.session_state.img_index += 1
# time.sleep(3)
# st.rerun()
# col1, col2 = st.columns(2)
# with col1:
# if st.button("Previous"):
# print("Previous pressed")
# # Decrease index, wrap around if it goes below 0
# print("st.session_state.img_index =", str(st.session_state.img_index))
# if st.session_state.img_index > 0:
# print("case 1 before st.session_state.img_index =",str(st.session_state.img_index))
# st.session_state.img_index -= 1
# print("case 2 after st.session_state.img_index =", str(st.session_state.img_index))
# else:
# print("case 2 st.session_state.img_index =", str(st.session_state.img_index))
# st.session_state.img_index = len(st.session_state.color_image_list) - 1
# with col2:
# if st.button("Next"):
#
# print("Next pressed")
# # Increase index, wrap around if it goes past the last image
# if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
# st.session_state.img_index += 1
#
# else:
# st.session_state.img_index = 0
# #
# total_pages = 100
# print(f"total_pages = {total_pages}")
# st.write(f"total_pages = {total_pages}")
# for page_number in range(total_pages):
# pdf_image_list = convert_from_path(pdf_path)
# images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1)
# progress = (page_number + 1) / total_pages * 100
# print(f"Progress: {progress:.2f}%")
# print("done") |