zmbfeng's picture
figures, tables, and other text book with above text recognized and below text included in the block text image
15cd602
raw
history blame
4.35 kB
import streamlit as st
import pdf2image
import utils
import numpy as np
import cv2
import time
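# External tools required to run this script:
# - Poppler (needed by pdf2image)
#     Windows: https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
#     Ubuntu:  poppler-utils (installed version: 22.02.0-2ubuntu0.4)
# - Tesseract (presumably used by the OCR helpers in utils)
#     Windows: see https://github.com/UB-Mannheim/tesseract/wiki
# Note: page extraction is currently disabled.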
# Page title, rendered as centred raw HTML. The inline font size must carry a
# valid CSS unit ("px"); without one the browser drops the declaration.
big_text = """
<div style='text-align: center;'>
<h1 style='font-size: 30px;'>Locked PDF Ingestion</h1>
</div>
"""
# unsafe_allow_html=True makes Streamlit render the tags instead of escaping them.
st.markdown(big_text, unsafe_allow_html=True)
# One-time, per-session bootstrap: rasterise the first pages of the PDF, keep
# colour and grayscale copies in st.session_state, then pass each grayscale
# page to utils.gray_pdf_image_np_to_text.
if 'is_initialized' not in st.session_state:
    pdf_path = 'uploaded_pdf/data_sheet.pdf'
    # Upper bound on the number of pages handled per session.
    max_pages = 5
    st.session_state['is_initialized'] = True

    page_count = utils.get_pdf_page_count(pdf_path)
    print("page_count=", page_count)
    # Apply the page limit, but never ask for more pages than the PDF has:
    # convert_from_path yields an empty list for a page past the end, which
    # would make the [0] lookup below fail.
    page_count = min(page_count, max_pages)
    print("new page_count=", page_count)

    read_pdf_progress_bar = st.progress(0)
    st.session_state.color_image_list = []
    st.session_state.gray_image_np_list = []

    # Pass 1: rasterise one page per call so the progress bar can advance.
    for page_number in range(page_count):
        page_images = pdf2image.convert_from_path(
            pdf_path,
            first_page=page_number + 1,
            last_page=page_number + 1,
        )
        st.session_state.color_image_list.append(page_images[0])
        # (done / total) stays within (0, 1] and cannot divide by zero, even
        # when only a single page is processed.
        read_pdf_progress_bar.progress((page_number + 1) / page_count)

    # Pass 2: grayscale conversion; the same progress bar is reused.
    read_pdf_progress_bar.progress(0)
    for index, image in enumerate(st.session_state.color_image_list):
        # pdf2image returns PIL images, whose arrays are in RGB channel
        # order, so the RGB->gray conversion code is the correct one.
        gray_image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
        st.session_state.gray_image_np_list.append(gray_image_np)
        read_pdf_progress_bar.progress((index + 1) / page_count)

    # Disabled debug aids (draw vertical guide lines on specific pages):
    # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
    # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
    # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
    # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
    st.session_state.img_index = 0
    st.session_state.stop_button_clicked = False
    # st.image(st.session_state.gray_image_np_list[38])

    # NOTE(review): the file's indentation was lost; this loop is assumed to
    # belong to the one-time initialisation -- confirm against the original.
    for index, gray_pdf_image_np in enumerate(
        st.session_state.gray_image_np_list[:max_pages]
    ):
        print("index=" + str(index))
        text = utils.gray_pdf_image_np_to_text(index, gray_pdf_image_np, debug=True)
#if 'img_index' not in st.session_state:
# if st.button("Stop"):
# st.session_state.stop_button_clicked = True
# st.write(str(st.session_state.img_index+1) +"/" + str(len(st.session_state.color_image_list)))
# st.image(st.session_state.gray_image_np_list[st.session_state.img_index], use_column_width=True)
# if not st.session_state.stop_button_clicked:
# if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
# st.session_state.img_index += 1
# time.sleep(3)
# st.rerun()
# col1, col2 = st.columns(2)
# with col1:
# if st.button("Previous"):
# print("Previous pressed")
# # Decrease index, wrap around if it goes below 0
# print("st.session_state.img_index =", str(st.session_state.img_index))
# if st.session_state.img_index > 0:
# print("case 1 before st.session_state.img_index =",str(st.session_state.img_index))
# st.session_state.img_index -= 1
# print("case 2 after st.session_state.img_index =", str(st.session_state.img_index))
# else:
# print("case 2 st.session_state.img_index =", str(st.session_state.img_index))
# st.session_state.img_index = len(st.session_state.color_image_list) - 1
# with col2:
# if st.button("Next"):
#
# print("Next pressed")
# # Increase index, wrap around if it goes past the last image
# if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
# st.session_state.img_index += 1
#
# else:
# st.session_state.img_index = 0
# #
# total_pages = 100
# print(f"total_pages = {total_pages}")
# st.write(f"total_pages = {total_pages}")
# for page_number in range(total_pages):
# pdf_image_list = convert_from_path(pdf_path)
# images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1)
# progress = (page_number + 1) / total_pages * 100
# print(f"Progress: {progress:.2f}%")
# print("done")