import streamlit as st
import pdf2image
import utils
import numpy as np
import cv2
import os
import io
import shutil
import time
# Setup notes:
# - Poppler for Windows: https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
# - poppler-utils (Linux package), installed: 22.02.0-2ubuntu0.4
# - Tesseract OCR installer: https://github.com/UB-Mannheim/tesseract/wiki
# page extraction disabled
def is_new_pdf_upload(uploaded_file):
    """Report whether ``uploaded_file`` differs from the last upload seen.

    The file's ``name`` and ``size`` are compared with the
    ``last_pdf_uploaded_file`` entry kept in ``st.session_state``. The entry
    is created on the first call and replaced whenever a different file is
    detected.

    Args:
        uploaded_file: Object exposing ``name`` and ``size`` attributes.

    Returns:
        True on the first upload of the session or when name/size changed;
        False when both match the remembered upload.
    """
    snapshot = {'name': uploaded_file.name, 'size': uploaded_file.size}

    # First upload in this session: remember it and report it as new.
    if 'last_pdf_uploaded_file' not in st.session_state:
        st.session_state.last_pdf_uploaded_file = snapshot
        return True

    remembered = st.session_state.last_pdf_uploaded_file
    unchanged = (uploaded_file.name == remembered['name']
                 and uploaded_file.size == remembered['size'])
    if unchanged:
        return False

    # A different file arrived: replace the remembered details.
    st.session_state.last_pdf_uploaded_file = snapshot
    return True
# Store current file details in session state
# Heading rendered at the top of the page.
big_text = "\nLocked PDF Ingestion\n"
st.markdown(big_text, unsafe_allow_html=True)

# One-time setup, executed only on the first script run of a session.
if 'is_initialized' not in st.session_state:
    st.session_state.is_initialized = True
    pdf_path = 'uploaded_pdf/data_sheet.pdf'
# page_count = utils.get_pdf_page_count(pdf_path)
# print("page_count=",page_count)
# page_count=5
# print("new page_count=",page_count)
# read_pdf_progress_bar = st.progress(0)
# st.session_state.color_image_list = []
# st.session_state.gray_image_np_list = []
# for page_number in range(page_count):
# image = pdf2image.convert_from_path(pdf_path, first_page=page_number+1, last_page=page_number+1)
# st.session_state.color_image_list.append(image[0])
# progress_percentage = (page_number) / (page_count-1)
# read_pdf_progress_bar.progress(progress_percentage)
# gray_pdf_image_np_list = []
# read_pdf_progress_bar.progress(0)
# for index, image in enumerate(st.session_state.color_image_list):
# image_np = np.array(image)
# st.session_state.gray_image_np_list.append(cv2.cvtColor(np.array(image_np), cv2.COLOR_BGR2GRAY))
# progress_percentage = (index) / (page_count - 1)
# read_pdf_progress_bar.progress(progress_percentage)
# # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
# # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
# # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
# # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
# st.session_state.img_index = 0
# st.session_state.stop_button_clicked=False
# # st.image(st.session_state.gray_image_np_list[38])
# Upload widget restricted to PDF files; the result is checked against None
# further down before it is used.
uploaded_locked_pdf_file = st.file_uploader("Upload a locked pdf", type=['pdf'])

# Hint telling the user how to try the app with the sample document.
st.markdown('Sample 1 download and then upload to above', unsafe_allow_html=True)
# Main flow for an uploaded PDF:
#   1. On a new upload, save the file, record its page count and rerun.
#   2. Let the user pick how many leading pages to extract.
#   3. On "Extract Pages", render each page, convert it to grayscale, run the
#      text extraction helper and store the combined text.
#   4. Offer the extracted text for download and display it.


def _progress_fraction(completed, total):
    """Return ``completed / total`` clamped to [0.0, 1.0].

    A non-positive ``total`` yields 1.0 so that a single-item (or empty) job
    never divides by zero.
    """
    if total <= 0:
        return 1.0
    return min(completed / total, 1.0)


if uploaded_locked_pdf_file is not None:
    if is_new_pdf_upload(uploaded_locked_pdf_file):
        save_path = './uploaded_videos'
        os.makedirs(save_path, exist_ok=True)

        # The file name is client-supplied: keep only its last path component
        # so the write cannot land outside save_path.
        safe_name = os.path.basename(uploaded_locked_pdf_file.name)
        saved_pdf_path = os.path.join(save_path, safe_name)
        with open(saved_pdf_path, "wb") as f:
            f.write(uploaded_locked_pdf_file.getbuffer())
        st.success(f'Saved file {safe_name} in {save_path}')

        st.session_state.uploaded_pdf_path = saved_pdf_path
        st.session_state.page_count = utils.get_pdf_page_count(saved_pdf_path)
        print("page_count=", st.session_state.page_count)

        # Drop the previous PDF's results and the slider's widget state so the
        # slider value is re-derived (and clamped) against the new page count.
        for stale_key in ('extracted_text', 'num_pages_to_extract_slider'):
            if stale_key in st.session_state:
                del st.session_state[stale_key]
        st.rerun()

    if 'page_count' in st.session_state:
        page_count = st.session_state.page_count
        st.write(f"total page count = {page_count}")

        if page_count > 1:
            # Default to 5 pages (or the previous choice), never more than the
            # document actually has and never less than one.
            previous_choice = st.session_state.get('num_pages_to_extract', 5)
            default_pages = max(1, min(previous_choice, page_count))
            st.session_state.num_pages_to_extract = st.slider(
                'Number of pages to extract:',
                min_value=1,
                max_value=page_count,
                value=default_pages,
                key='num_pages_to_extract_slider',
            )
        else:
            # Nothing to choose from, and a slider with min == max is
            # degenerate, so skip the widget.
            st.session_state.num_pages_to_extract = page_count
        st.write(f"num of pages to extract {st.session_state.num_pages_to_extract}")

        # The button's own state is True only during the run triggered by a
        # click; use it to render the button disabled while extraction runs.
        st.session_state.running = bool(st.session_state.get('run_button', False))

        read_pdf_progress_bar = st.progress(0)
        if st.button('Extract Pages', disabled=st.session_state.running, key='run_button'):
            pages_requested = st.session_state.num_pages_to_extract
            st.session_state.color_image_list = []
            st.session_state.gray_image_np_list = []

            # Phase 1: render the requested pages one at a time so the
            # progress bar can advance per page.
            for page_number in range(1, pages_requested + 1):
                rendered = pdf2image.convert_from_path(
                    st.session_state.uploaded_pdf_path,
                    first_page=page_number,
                    last_page=page_number,
                )
                st.session_state.color_image_list.append(rendered[0])
                read_pdf_progress_bar.progress(
                    _progress_fraction(page_number, pages_requested))

            # Phase 2: grayscale conversion. PIL images become RGB-ordered
            # arrays, hence COLOR_RGB2GRAY rather than COLOR_BGR2GRAY.
            read_pdf_progress_bar.progress(0)
            color_images = st.session_state.color_image_list
            for index, image in enumerate(color_images):
                gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
                st.session_state.gray_image_np_list.append(gray)
                read_pdf_progress_bar.progress(
                    _progress_fraction(index + 1, len(color_images)))

            # Phase 3: text extraction; each page's text is wrapped in the
            # same "\n...\n\n>" delimiters and joined once at the end.
            read_pdf_progress_bar.progress(0)
            gray_images = st.session_state.gray_image_np_list
            page_texts = []
            for index, gray_pdf_image_np in enumerate(gray_images):
                print("index=" + str(index))
                text = utils.gray_pdf_image_np_to_text(index, gray_pdf_image_np, debug=True)
                page_texts.append("\n" + text + "\n\n>")
                read_pdf_progress_bar.progress(
                    _progress_fraction(index + 1, len(gray_images)))
            st.session_state.extracted_text = "".join(page_texts)

            # Rerun so the button is drawn enabled again and results appear.
            st.rerun()

        if 'extracted_text' in st.session_state:
            # Swap only the final extension; str.replace would also rewrite
            # ".pdf" elsewhere in the name and miss upper-case extensions.
            txt_file_path = os.path.splitext(uploaded_locked_pdf_file.name)[0] + ".txt"
            st.download_button(label="Download Extraction txt File",
                               data=st.session_state.extracted_text,
                               file_name=txt_file_path,
                               mime="text/plain")
            st.write(st.session_state.extracted_text)
# for index, gray_pdf_image_np in enumerate(st.session_state.gray_image_np_list[0:5], start=0):
# print("index="+str(index))
#
# text=utils.gray_pdf_image_np_to_text(index,gray_pdf_image_np, debug=True)
# st.write(text)
#if 'img_index' not in st.session_state:
# if st.button("Stop"):
# st.session_state.stop_button_clicked = True
# st.write(str(st.session_state.img_index+1) +"/" + str(len(st.session_state.color_image_list)))
# st.image(st.session_state.gray_image_np_list[st.session_state.img_index], use_column_width=True)
# if not st.session_state.stop_button_clicked:
# if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
# st.session_state.img_index += 1
# time.sleep(3)
# st.rerun()
# col1, col2 = st.columns(2)
# with col1:
# if st.button("Previous"):
# print("Previous pressed")
# # Decrease index, wrap around if it goes below 0
# print("st.session_state.img_index =", str(st.session_state.img_index))
# if st.session_state.img_index > 0:
# print("case 1 before st.session_state.img_index =",str(st.session_state.img_index))
# st.session_state.img_index -= 1
# print("case 2 after st.session_state.img_index =", str(st.session_state.img_index))
# else:
# print("case 2 st.session_state.img_index =", str(st.session_state.img_index))
# st.session_state.img_index = len(st.session_state.color_image_list) - 1
# with col2:
# if st.button("Next"):
#
# print("Next pressed")
# # Increase index, wrap around if it goes past the last image
# if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
# st.session_state.img_index += 1
#
# else:
# st.session_state.img_index = 0
# #
# total_pages = 100
# print(f"total_pages = {total_pages}")
# st.write(f"total_pages = {total_pages}")
# for page_number in range(total_pages):
# pdf_image_list = convert_from_path(pdf_path)
# images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1)
# progress = (page_number + 1) / total_pages * 100
# print(f"Progress: {progress:.2f}%")
# print("done")