Update app.py, pages.py, and section_extract.py
Adding new features: session state for the pages, so you don't have to wait for processing if a page has already been processed.
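In Streamlit terms, the feature caches each extracted section's file path in st.session_state, so switching back to a page that has already been processed reuses the stored path instead of re-running the extraction. A minimal sketch of that pattern (the names below are illustrative, not the app's actual keys):

    import streamlit as st

    def get_or_extract(state_key, extract_fn, uploaded_file):
        # Run the expensive extraction only once; later visits reuse the cached path.
        if st.session_state.get(state_key) is None:
            st.session_state[state_key] = extract_fn(uploaded_file)
        return st.session_state[state_key]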
- __pycache__/pages.cpython-312.pyc +0 -0
- __pycache__/section_extract.cpython-312.pyc +0 -0
- app.py +3 -1
- pages.py +79 -24
- section_extract.py +210 -205
__pycache__/pages.cpython-312.pyc
CHANGED
Binary files a/__pycache__/pages.cpython-312.pyc and b/__pycache__/pages.cpython-312.pyc differ
__pycache__/section_extract.cpython-312.pyc
CHANGED
Binary files a/__pycache__/section_extract.cpython-312.pyc and b/__pycache__/section_extract.cpython-312.pyc differ
app.py
CHANGED
@@ -1,5 +1,7 @@
 import streamlit as st
-from pages import home, cover, underwriter, income_statement, balance_sheet, cash_flow
+from pages import home, cover, underwriter, income_statement, balance_sheet, cash_flow, uploader_sidebar
+
+uploader_sidebar()
 
 # Define pages
 pages = {
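The hunk ends at the `pages = {` definition, so the rest of app.py is not shown in this commit. For context, a typical way to wire page functions like these into Streamlit's navigation API looks roughly like the sketch below (illustrative only, not the actual contents of app.py; the group names are made up):

    import streamlit as st
    from pages import home, cover, underwriter, income_statement, balance_sheet, cash_flow, uploader_sidebar

    uploader_sidebar()

    # Hypothetical grouping -- the real app.py may organize its pages differently.
    pages = {
        "Overview": [st.Page(home, title="Home"),
                     st.Page(cover, title="Cover"),
                     st.Page(underwriter, title="Underwriter")],
        "Financials": [st.Page(income_statement, title="Income Statement"),
                       st.Page(balance_sheet, title="Balance Sheet"),
                       st.Page(cash_flow, title="Cash Flow")],
    }

    pg = st.navigation(pages)
    pg.run()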
pages.py
CHANGED
@@ -2,44 +2,99 @@ import streamlit as st
 from section_extract import find_cover, find_underwriter, find_financial
 from streamlit_pdf_viewer import pdf_viewer
 
+def uploader_sidebar():
+    uploaded_file = st.sidebar.file_uploader("Upload your Prospectus File", accept_multiple_files=False, type=["pdf"])
+    st.sidebar.caption("Made with ❤️ by @michael_sr24")
+
+    if uploaded_file:
+        # Initialize session state for processing flags and paths
+        if "uploaded_file" not in st.session_state:
+            st.session_state["uploaded_file"] = uploaded_file
+            st.session_state["cover_path"] = None
+            st.session_state["underwriter_path"] = None
+            st.session_state["income_statement_path"] = None
+            st.session_state["balance_sheet_path"] = None
+            st.session_state["cash_flow_path"] = None
+            st.session_state["processing"] = {
+                "cover_path": False,
+                "underwriter_path": False,
+                "income_statement_path": False,
+                "balance_sheet_path": False,
+                "cash_flow_path": False,
+            }
+            st.session_state["all_processed"] = False
+        else:
+            st.session_state["uploaded_file"] = uploaded_file
+        process_sections()
+
+def process_sections():
+    """Continuously process all sections in the background."""
+    if "processing" in st.session_state and not st.session_state.get("all_processed", False):
+        for key, processed in st.session_state["processing"].items():
+            if not processed:
+                if key == "cover_path":
+                    st.session_state[key] = find_cover(st.session_state["uploaded_file"])
+                elif key == "underwriter_path":
+                    st.session_state[key] = find_underwriter(st.session_state["uploaded_file"])
+                elif key == "income_statement_path":
+                    st.session_state[key] = find_financial(st.session_state["uploaded_file"], "income_statement")
+                elif key == "balance_sheet_path":
+                    st.session_state[key] = find_financial(st.session_state["uploaded_file"], "balance_sheet")
+                elif key == "cash_flow_path":
+                    st.session_state[key] = find_financial(st.session_state["uploaded_file"], "cash_flow")
+
+                st.session_state["processing"][key] = True  # Mark as processed
+                break
+
+        # Check if all sections are processed
+        st.session_state["all_processed"] = all(st.session_state["processing"].values())
+
+def show_section(section_key):
+    """Display the section if available, otherwise inform the user."""
+    temp_path = st.session_state.get(section_key)
+    if temp_path:
+        pdf_viewer(temp_path)
+    else:
+        if not st.session_state["processing"].get(section_key, False):
+            st.info(f"{section_key.replace('_', ' ').capitalize()} is still being processed.")
+        else:
+            st.warning(f"Could not process {section_key.replace('_', ' ')}.")
+
 def home():
     st.title("Prospectus Lens")
-    st.write("Welcome to the Prospectus Lens! Upload the PDF of the prospectus
-    uploaded_file = st.file_uploader("Upload your Prospectus File", accept_multiple_files=False, type=["pdf"])
-    st.session_state["uploaded_file"] = uploaded_file
-    st.caption("Made with ❤️ by @michael_sr24")
+    st.write("Welcome to the Prospectus Lens! Upload the PDF of the prospectus on the left sidebar!")
 
 def cover():
+    st.title("Cover")
+    if "uploaded_file" in st.session_state:
+        show_section("cover_path")
     else:
+        st.warning("Please upload a file first!")
 
 def underwriter():
+    st.title("Underwriter")
+    if "uploaded_file" in st.session_state:
+        show_section("underwriter_path")
     else:
+        st.warning("Please upload a file first!")
 
 def income_statement():
+    st.title("Income Statement")
+    if "uploaded_file" in st.session_state:
+        show_section("income_statement_path")
     else:
+        st.warning("Please upload a file first!")
 
 def balance_sheet():
+    st.title("Balance Sheet")
+    if "uploaded_file" in st.session_state:
+        show_section("balance_sheet_path")
     else:
+        st.warning("Please upload a file first!")
 
 def cash_flow():
+    st.title("Cash Flow")
+    if "uploaded_file" in st.session_state:
+        show_section("cash_flow_path")
     else:
+        st.warning("Please upload a file first!")
section_extract.py
CHANGED
@@ -1,206 +1,211 @@
|
|
1 |
-
import os
|
2 |
-
import re
|
3 |
-
from PyPDF2 import PdfReader, PdfWriter
|
4 |
-
import streamlit as st
|
5 |
-
from config import keywords_dict, stop_keywords, anti_keywords
|
6 |
-
|
7 |
-
def find_cover(uploaded_file):
|
8 |
-
"""
|
9 |
-
Extracts and saves the first page of a PDF to a temporary file.
|
10 |
-
|
11 |
-
Parameters:
|
12 |
-
uploaded_file: The uploaded PDF file.
|
13 |
-
|
14 |
-
Returns:
|
15 |
-
str: Path to the temporary file containing the first page of the PDF.
|
16 |
-
"""
|
17 |
-
section_title = "cover"
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
text
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
st.
|
86 |
-
return None
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
#
|
202 |
-
return
|
203 |
-
|
204 |
-
|
205 |
-
|
|
|
|
|
|
|
|
|
|
|
206 |
return False
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
from PyPDF2 import PdfReader, PdfWriter
|
4 |
+
import streamlit as st
|
5 |
+
from config import keywords_dict, stop_keywords, anti_keywords
|
6 |
+
|
7 |
+
def find_cover(uploaded_file):
|
8 |
+
"""
|
9 |
+
Extracts and saves the first page of a PDF to a temporary file.
|
10 |
+
|
11 |
+
Parameters:
|
12 |
+
uploaded_file: The uploaded PDF file.
|
13 |
+
|
14 |
+
Returns:
|
15 |
+
str: Path to the temporary file containing the first page of the PDF.
|
16 |
+
"""
|
17 |
+
section_title = "cover"
|
18 |
+
if uploaded_file:
|
19 |
+
try:
|
20 |
+
# Read the PDF and extract the first page
|
21 |
+
pdf_reader = PdfReader(uploaded_file)
|
22 |
+
first_page = pdf_reader.pages[0]
|
23 |
+
|
24 |
+
pdf_writer = PdfWriter()
|
25 |
+
temp_cover_page_path = os.path.join(f"temp_{section_title}_1.pdf")
|
26 |
+
with open(temp_cover_page_path, "wb") as f:
|
27 |
+
pdf_writer.add_page(first_page)
|
28 |
+
pdf_writer.write(f)
|
29 |
+
|
30 |
+
# Return the path to the temporary file
|
31 |
+
return temp_cover_page_path
|
32 |
+
except Exception as e:
|
33 |
+
st.error(f"An error occurred while processing the PDF: {e}")
|
34 |
+
return None
|
35 |
+
else:
|
36 |
+
st.warning("Please upload a PDF on the Home page first.")
|
37 |
+
return None
|
38 |
+
|
39 |
+
|
40 |
+
def find_underwriter(uploaded_file):
|
41 |
+
"""
|
42 |
+
Searches for pages in a PDF containing specific keywords for the 'underwriter' section and returns the extracted file path.
|
43 |
+
|
44 |
+
Parameters:
|
45 |
+
uploaded_file: The uploaded PDF file.
|
46 |
+
|
47 |
+
Returns:
|
48 |
+
str: Path to the temporary file containing the extracted 'underwriter' page(s).
|
49 |
+
"""
|
50 |
+
section_name = "underwriter"
|
51 |
+
|
52 |
+
keyword_sets = keywords_dict.get(section_name, [])
|
53 |
+
if not keyword_sets:
|
54 |
+
st.error(f"No keywords defined for section: {section_name}")
|
55 |
+
return None
|
56 |
+
|
57 |
+
if uploaded_file:
|
58 |
+
try:
|
59 |
+
pdf_reader = PdfReader(uploaded_file)
|
60 |
+
total_pages = len(pdf_reader.pages)
|
61 |
+
start_page = total_pages // 3 # Skip the first 1/3 of the PDF
|
62 |
+
pages = pdf_reader.pages[start_page:]
|
63 |
+
|
64 |
+
# Loop through the keyword sets
|
65 |
+
for keyword_set in keyword_sets:
|
66 |
+
for page_num, page in enumerate(pages, start=start_page + 1):
|
67 |
+
text = page.extract_text()
|
68 |
+
|
69 |
+
# Check if any keyword in the set is found on the page
|
70 |
+
if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
|
71 |
+
# Save the matched page to a temporary file
|
72 |
+
pdf_writer = PdfWriter()
|
73 |
+
pdf_writer.add_page(page)
|
74 |
+
|
75 |
+
temp_page_path = os.path.join(f"temp_{section_name}_{page_num}.pdf")
|
76 |
+
with open(temp_page_path, "wb") as f:
|
77 |
+
pdf_writer.write(f)
|
78 |
+
|
79 |
+
# Return the path of the extracted page
|
80 |
+
return temp_page_path
|
81 |
+
|
82 |
+
st.warning(f"No pages contain the specified keywords for {section_name}.")
|
83 |
+
return None
|
84 |
+
except Exception as e:
|
85 |
+
st.error(f"An error occurred while processing the PDF: {e}")
|
86 |
+
return None
|
87 |
+
else:
|
88 |
+
st.warning("Please upload a PDF on the Home page first.")
|
89 |
+
return None
|
90 |
+
|
91 |
+
def find_financial(uploaded_file, section_name):
|
92 |
+
"""
|
93 |
+
Extracts and displays sections of a PDF based on keyword matches.
|
94 |
+
|
95 |
+
Parameters:
|
96 |
+
uploaded_file: The uploaded PDF file (Streamlit file uploader object).
|
97 |
+
section_name: The name of the section to search for (e.g., "income_statement").
|
98 |
+
|
99 |
+
Returns:
|
100 |
+
bool: True if processing completed without interruptions; False if stopped or an error occurred.
|
101 |
+
"""
|
102 |
+
if uploaded_file:
|
103 |
+
try:
|
104 |
+
pdf_reader = PdfReader(uploaded_file)
|
105 |
+
total_pages = len(pdf_reader.pages)
|
106 |
+
|
107 |
+
# Step 1: Start from the second half of the PDF
|
108 |
+
start_page = total_pages // 2
|
109 |
+
pages = pdf_reader.pages[start_page:]
|
110 |
+
|
111 |
+
section_keywords = keywords_dict.get(section_name, [])
|
112 |
+
section_stop_keywords = stop_keywords.get(section_name, [])
|
113 |
+
section_anti_keywords = anti_keywords.get(section_name, [])
|
114 |
+
|
115 |
+
pdf_writer = PdfWriter() # Writer for the extracted pages
|
116 |
+
extraction_started = False # Flag to check if extraction has started
|
117 |
+
extraction_start_page = None # Track the starting page number
|
118 |
+
pages_extracted = 0 # Counter for extracted pages
|
119 |
+
|
120 |
+
for page_num, page in enumerate(pages, start=start_page + 1):
|
121 |
+
text = page.extract_text()
|
122 |
+
|
123 |
+
# Step 2: Find the keywords within the keywords_dict
|
124 |
+
if not extraction_started:
|
125 |
+
for keyword_set in section_keywords:
|
126 |
+
if all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
|
127 |
+
pdf_writer.add_page(page)
|
128 |
+
pages_extracted += 1
|
129 |
+
extraction_start_page = page_num # Set the starting page number
|
130 |
+
|
131 |
+
# Check for stop keywords on the same page
|
132 |
+
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
|
133 |
+
for stop_set in section_stop_keywords):
|
134 |
+
|
135 |
+
# Check for anti-keywords before stopping
|
136 |
+
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
|
137 |
+
for anti_set in section_anti_keywords):
|
138 |
+
pdf_writer.pages.pop() # Remove the last added page
|
139 |
+
pages_extracted -= 1
|
140 |
+
|
141 |
+
# Save and display the extracted pages (if any)
|
142 |
+
if len(pdf_writer.pages) > 0:
|
143 |
+
temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
|
144 |
+
with open(temp_section_path, "wb") as f:
|
145 |
+
pdf_writer.write(f)
|
146 |
+
return temp_section_path
|
147 |
+
else:
|
148 |
+
st.warning(f"No pages matched the criteria for {section_name}.")
|
149 |
+
|
150 |
+
# Stop extraction immediately and signal to stop all processing
|
151 |
+
return False
|
152 |
+
else:
|
153 |
+
# Continue extraction
|
154 |
+
extraction_started = True
|
155 |
+
break
|
156 |
+
elif extraction_started:
|
157 |
+
# Check if we've reached the 3-page limit
|
158 |
+
if pages_extracted >= 3:
|
159 |
+
if len(pdf_writer.pages) > 0:
|
160 |
+
temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num-1}.pdf")
|
161 |
+
with open(temp_section_path, "wb") as f:
|
162 |
+
pdf_writer.write(f)
|
163 |
+
return temp_section_path
|
164 |
+
return False
|
165 |
+
|
166 |
+
# Step 3: Add the page to the output
|
167 |
+
pdf_writer.add_page(page)
|
168 |
+
pages_extracted += 1
|
169 |
+
|
170 |
+
# Step 4: Check for stop keywords
|
171 |
+
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
|
172 |
+
for stop_set in section_stop_keywords):
|
173 |
+
|
174 |
+
# Step 5: After stopping, check for anti-keywords
|
175 |
+
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
|
176 |
+
for anti_set in section_anti_keywords):
|
177 |
+
pdf_writer.pages.pop() # Remove the last added page
|
178 |
+
pages_extracted -= 1
|
179 |
+
|
180 |
+
# Save and display the extracted pages (if any)
|
181 |
+
if len(pdf_writer.pages) > 0:
|
182 |
+
temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
|
183 |
+
with open(temp_section_path, "wb") as f:
|
184 |
+
pdf_writer.write(f)
|
185 |
+
return temp_section_path
|
186 |
+
else:
|
187 |
+
st.warning(f"No pages matched the criteria for {section_name}.")
|
188 |
+
|
189 |
+
# Stop extraction and signal to stop all processing
|
190 |
+
return False
|
191 |
+
|
192 |
+
# If extraction finished without hitting stop keywords, save and display the pages
|
193 |
+
if len(pdf_writer.pages) > 0:
|
194 |
+
temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
|
195 |
+
with open(temp_section_path, "wb") as f:
|
196 |
+
pdf_writer.write(f)
|
197 |
+
return temp_section_path
|
198 |
+
else:
|
199 |
+
st.warning(f"No pages matched the criteria for {section_name}.")
|
200 |
+
|
201 |
+
# Indicate that processing can continue
|
202 |
+
return True
|
203 |
+
|
204 |
+
except Exception as e:
|
205 |
+
st.error(f"An error occurred while processing the PDF: {e}")
|
206 |
+
# Stop processing due to an error
|
207 |
+
return False
|
208 |
+
else:
|
209 |
+
st.warning("Please upload a PDF on the Home page first.")
|
210 |
+
# Stop processing since no file is uploaded
|
211 |
return False
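find_underwriter and find_financial assume config.py exposes three dictionaries keyed by section name, each holding a list of keyword sets (find_underwriter matches a page if any keyword in a set is found; find_financial requires all keywords in a set, and the strings are used as regex patterns via re.search). config.py is not part of this commit, so the following is only an illustrative sketch of the expected shape, with made-up keywords:

    # Illustrative shapes only -- the actual keyword lists live in config.py and are not shown in this diff.
    keywords_dict = {
        "underwriter": [["underwriter", "lead manager"]],        # any() match per set
        "income_statement": [["income statement", "revenue"]],   # all() match per set
        "balance_sheet": [["balance sheet", "total assets"]],
        "cash_flow": [["cash flow", "operating activities"]],
    }

    # A fully matched stop set ends extraction for that section.
    stop_keywords = {
        "income_statement": [["balance sheet"]],
        "balance_sheet": [["cash flow"]],
        "cash_flow": [["notes to the financial statements"]],
    }

    # If an anti set also matches, the page that triggered the stop is dropped.
    anti_keywords = {
        "income_statement": [["table of contents"]],
        "balance_sheet": [["table of contents"]],
        "cash_flow": [["table of contents"]],
    }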
|