Spaces:
Sleeping
Sleeping
slide show implemented
Browse files- app.py +61 -9
- requirements.txt +1 -0
- utils.py +13 -0
app.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1 |
import streamlit as st
|
2 |
-
|
3 |
-
|
4 |
-
|
|
|
5 |
# get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
|
6 |
# poppler-utils:
|
7 |
# Installed: 22.02.0-2ubuntu0.4
|
8 |
-
|
9 |
big_text = """
|
10 |
<div style='text-align: center;'>
|
11 |
<h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
|
@@ -13,11 +14,62 @@ big_text = """
|
|
13 |
"""
|
14 |
# Display the styled text
|
15 |
st.markdown(big_text, unsafe_allow_html=True)
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
# total_pages = 100
|
22 |
# print(f"total_pages = {total_pages}")
|
23 |
# st.write(f"total_pages = {total_pages}")
|
|
|
1 |
import streamlit as st
|
2 |
+
import pdf2image
|
3 |
+
import utils
|
4 |
+
import numpy as np
|
5 |
+
import cv2
|
6 |
# get https://github.com/oschwartz10612/poppler-windows/releases/tag/v22.01.0-0
|
7 |
# poppler-utils:
|
8 |
# Installed: 22.02.0-2ubuntu0.4
|
9 |
+
#page extraction disabled
|
10 |
big_text = """
|
11 |
<div style='text-align: center;'>
|
12 |
<h1 style='font-size: 30x;'>Locked PDF Ingestion</h1>
|
|
|
14 |
"""
|
15 |
# Display the styled text
|
16 |
st.markdown(big_text, unsafe_allow_html=True)
|
17 |
+
|
18 |
+
|
19 |
+
# Upper bound on the number of pages rendered per session. The original code
# forced page_count to 50 unconditionally, which crashed on shorter PDFs.
MAX_PAGES = 50


def _show_previous_page():
    """Step back one page, wrapping from the first page to the last."""
    print("Previous pressed")
    total = len(st.session_state.color_image_list)
    st.session_state.img_index = (st.session_state.img_index - 1) % total


def _show_next_page():
    """Step forward one page, wrapping from the last page to the first."""
    print("Next pressed")
    total = len(st.session_state.color_image_list)
    st.session_state.img_index = (st.session_state.img_index + 1) % total


# One-time load: render the PDF pages and cache them in the session state so
# that reruns triggered by the buttons do not re-render the document.
if 'is_initialized' not in st.session_state:
    pdf_path = 'uploaded_pdf/data_sheet.pdf'
    page_count = utils.get_pdf_page_count(pdf_path)
    print("page_count=", page_count)
    if not page_count:
        # get_pdf_page_count returns None on failure; nothing to display.
        st.error("Could not determine the page count of " + pdf_path)
        st.stop()
    page_count = min(page_count, MAX_PAGES)
    print("new page_count=", page_count)

    read_pdf_progress_bar = st.progress(0)

    # Pass 1: render each page to a PIL image (pdf2image pages are 1-based).
    color_images = []
    for page_number in range(1, page_count + 1):
        pages = pdf2image.convert_from_path(
            pdf_path, first_page=page_number, last_page=page_number
        )
        if not pages:
            # The document ended earlier than reported; keep what we have.
            break
        color_images.append(pages[0])
        # (done / total) stays within [0, 1] and is safe for one-page PDFs.
        read_pdf_progress_bar.progress(page_number / page_count)

    if not color_images:
        st.error("No pages could be rendered from " + pdf_path)
        st.stop()

    # Pass 2: grayscale copies. PIL images convert to RGB-ordered arrays, so
    # the RGB (not BGR) conversion code is the correct one.
    read_pdf_progress_bar.progress(0)
    gray_images = []
    for index, image in enumerate(color_images, start=1):
        gray_images.append(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY))
        read_pdf_progress_bar.progress(index / len(color_images))

    st.session_state.color_image_list = color_images
    st.session_state.gray_image_np_list = gray_images
    st.session_state.img_index = 0
    # Mark as initialized only after everything loaded, so a failed load is
    # retried on the next rerun instead of leaving the session half-built.
    st.session_state['is_initialized'] = True


st.write(str(st.session_state.img_index+1) +"/" + str(len(st.session_state.color_image_list)))
st.image(st.session_state.gray_image_np_list[st.session_state.img_index], use_column_width=True)

# The buttons use on_click callbacks: Streamlit runs a callback before the
# rerun, so the counter and image above already reflect the new index.
col1, col2 = st.columns(2)
with col1:
    st.button("Previous", on_click=_show_previous_page)
with col2:
    st.button("Next", on_click=_show_next_page)
|
72 |
+
# #
|
73 |
# total_pages = 100
|
74 |
# print(f"total_pages = {total_pages}")
|
75 |
# st.write(f"total_pages = {total_pages}")
|
requirements.txt
CHANGED
@@ -4,3 +4,4 @@ opencv-python
|
|
4 |
pytesseract
|
5 |
pdf2image
|
6 |
Pillow
|
|
|
|
4 |
pytesseract
|
5 |
pdf2image
|
6 |
Pillow
|
7 |
+
numpy
|
utils.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess
|
2 |
+
|
3 |
+
def get_pdf_page_count(pdf_path):
    """Return the page count that ``pdfinfo`` reports for *pdf_path*.

    Runs the external ``pdfinfo`` command and reads the ``Pages:`` field
    from its standard output.

    Args:
        pdf_path: Path of the PDF file handed to ``pdfinfo``.

    Returns:
        The page count as an ``int``, or ``None`` when ``pdfinfo`` cannot be
        run or its output has no parsable ``Pages:`` field. Failures are
        reported on stdout; no exception is propagated to the caller.
    """
    try:
        result = subprocess.run(
            ['pdfinfo', pdf_path],
            stdout=subprocess.PIPE,
            text=True,
        )
    except (OSError, ValueError, TypeError, subprocess.SubprocessError) as e:
        # Missing binary, bad argument type, undecodable output, etc.
        print(f"An error occurred: {e}")
        return None

    page_count = _parse_page_count(result.stdout)
    if page_count is None:
        print(
            "An error occurred: no page count in pdfinfo output "
            f"(exit status {result.returncode})"
        )
    return page_count


def _parse_page_count(pdfinfo_output):
    """Extract the integer value of the ``Pages`` field, or None if absent."""
    for line in pdfinfo_output.splitlines():
        key, separator, value = line.partition(':')
        # Compare the field name exactly so that another field whose value
        # merely contains "Pages:" (e.g. a title) is not mistaken for it.
        if separator and key.strip() == 'Pages':
            try:
                return int(value.strip())
            except ValueError:
                return None
    return None
|