File size: 4,349 Bytes
5a99cc1
ad5ca2c
 
 
 
365522e
6a8bd53
 
 
15cd602
ad5ca2c
5a99cc1
 
 
 
 
 
6a8bd53
ad5ca2c
 
 
 
 
 
 
d6482c4
ad5ca2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6482c4
 
 
 
ad5ca2c
0196fd4
d6482c4
ad5ca2c
d6482c4
 
 
 
ad5ca2c
 
d6482c4
 
 
 
 
 
 
 
 
365522e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad5ca2c
6a8bd53
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import streamlit as st
import pdf2image
import utils
import numpy as np
import cv2
import time
# External tools this script relies on:
#   Poppler
#     Windows: get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
#     Ubuntu:  poppler-utils (installed version: 22.02.0-2ubuntu0.4)
#   Tesseract
#     install from https://github.com/UB-Mannheim/tesseract/wiki
# Page extraction is disabled.
# Centered page title, written as raw HTML so the heading size can be set inline.
# Fix: the font size was "30x", which is not a valid CSS length; it is now "30px".
big_text = """
    <div style='text-align: center;'>
        <h1 style='font-size: 30px;'>Locked PDF Ingestion</h1>
    </div>
    """
# Display the styled text (unsafe_allow_html lets the inline HTML through).
st.markdown(big_text, unsafe_allow_html=True)


# One-time setup per Streamlit session: convert the first pages of the PDF to
# images, derive a grayscale array for each page, and cache both lists in
# st.session_state so that script reruns do not repeat the conversion.
if 'is_initialized' not in st.session_state:
    pdf_path = 'uploaded_pdf/data_sheet.pdf'
    page_count = utils.get_pdf_page_count(pdf_path)
    print("page_count=", page_count)
    # Only the first 5 pages are processed, but never ask for more pages than
    # the PDF has (a missing page would give an empty list and image[0] fails).
    page_count = min(page_count, 5)
    print("new page_count=", page_count)
    read_pdf_progress_bar = st.progress(0)

    # Pass 1: convert one page at a time so the progress bar can advance.
    color_image_list = []
    for page_number in range(page_count):
        image = pdf2image.convert_from_path(
            pdf_path, first_page=page_number + 1, last_page=page_number + 1
        )
        color_image_list.append(image[0])
        # (done / total) stays within 0..1 and cannot divide by zero for a
        # single-page document, unlike i / (page_count - 1).
        read_pdf_progress_bar.progress((page_number + 1) / page_count)

    # Pass 2: grayscale conversion; the same progress bar is reused from 0.
    read_pdf_progress_bar.progress(0)
    gray_image_np_list = []
    for index, image in enumerate(color_image_list):
        # pdf2image returns PIL images, whose arrays are in RGB channel order,
        # so RGB2GRAY (not BGR2GRAY) applies the correct channel weights.
        gray_image_np_list.append(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY))
        read_pdf_progress_bar.progress((index + 1) / page_count)

    st.session_state.color_image_list = color_image_list
    st.session_state.gray_image_np_list = gray_image_np_list
    # cv2.line(st.session_state.gray_image_np_list[37], (174, 227), (174, 1790), 0, 2)
    # cv2.line(st.session_state.gray_image_np_list[37], (1550, 227), (1550, 1790), 0, 2)
    # cv2.line(st.session_state.gray_image_np_list[38], (226,227),(226,1444), 0,3)
    # cv2.line(st.session_state.gray_image_np_list[38], (1601,227),(1601,1444), 0,2)
    st.session_state.img_index = 0
    st.session_state.stop_button_clicked = False
    # Set the flag last: if anything above raises, the next rerun retries the
    # setup instead of running with the cached lists missing.
    st.session_state['is_initialized'] = True
# st.image(st.session_state.gray_image_np_list[38])

# Pass each of the first five cached grayscale pages to the text-extraction
# helper (debug mode on), logging the page index to the console first.
for index, page_gray_np in enumerate(st.session_state.gray_image_np_list[:5]):
    print(f"index={index}")

    text = utils.gray_pdf_image_np_to_text(index, page_gray_np, debug=True)
#if 'img_index' not in st.session_state:

# if st.button("Stop"):
#     st.session_state.stop_button_clicked = True
# st.write(str(st.session_state.img_index+1) +"/" + str(len(st.session_state.color_image_list)))
# st.image(st.session_state.gray_image_np_list[st.session_state.img_index], use_column_width=True)
# if not st.session_state.stop_button_clicked:
#     if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
#         st.session_state.img_index += 1
#         time.sleep(3)
#         st.rerun()
# col1, col2 = st.columns(2)
# with col1:
#     if st.button("Previous"):
#         print("Previous pressed")
#         # Decrease index, wrap around if it goes below 0
#         print("st.session_state.img_index =", str(st.session_state.img_index))
#         if st.session_state.img_index > 0:
#             print("case 1 before st.session_state.img_index =",str(st.session_state.img_index))
#             st.session_state.img_index -= 1
#             print("case 2 after  st.session_state.img_index =", str(st.session_state.img_index))
#         else:
#             print("case 2 st.session_state.img_index =", str(st.session_state.img_index))
#             st.session_state.img_index = len(st.session_state.color_image_list) - 1
# with col2:
#     if st.button("Next"):
#
#         print("Next pressed")
#         # Increase index, wrap around if it goes past the last image
#         if st.session_state.img_index < len(st.session_state.color_image_list) - 1:
#             st.session_state.img_index += 1
#
#         else:
#             st.session_state.img_index = 0
# #
# total_pages = 100
# print(f"total_pages = {total_pages}")
# st.write(f"total_pages = {total_pages}")
# for page_number in range(total_pages):
#     pdf_image_list = convert_from_path(pdf_path)
#     images = convert_from_path(pdf_path, first_page=page_number + 1, last_page=page_number + 1)
#     progress = (page_number + 1) / total_pages * 100
#     print(f"Progress: {progress:.2f}%")
# print("done")