Spaces:
Running
Running
Update pages.py and section_extract.py to preview the pdf in the pages.py
Browse files- __pycache__/config.cpython-312.pyc +0 -0
- __pycache__/pages.cpython-312.pyc +0 -0
- __pycache__/section_extract.cpython-312.pyc +0 -0
- pages.py +26 -5
- section_extract.py +22 -23
__pycache__/config.cpython-312.pyc
ADDED
Binary file (1.61 kB). View file
|
|
__pycache__/pages.cpython-312.pyc
CHANGED
Binary files a/__pycache__/pages.cpython-312.pyc and b/__pycache__/pages.cpython-312.pyc differ
|
|
__pycache__/section_extract.cpython-312.pyc
CHANGED
Binary files a/__pycache__/section_extract.cpython-312.pyc and b/__pycache__/section_extract.cpython-312.pyc differ
|
|
pages.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import streamlit as st
|
2 |
from section_extract import find_cover, find_underwriter, find_financial
|
|
|
3 |
|
4 |
def home():
|
5 |
st.title("Prospectus Lens")
|
@@ -9,16 +10,36 @@ def home():
|
|
9 |
st.caption("Made with ❤️ by @michael_sr24")
|
10 |
|
11 |
def cover():
|
12 |
-
find_cover(uploaded_file=st.session_state.get("uploaded_file"))
|
|
|
|
|
|
|
|
|
13 |
|
14 |
def underwriter():
|
15 |
-
find_underwriter(uploaded_file=st.session_state.get("uploaded_file"))
|
|
|
|
|
|
|
|
|
16 |
|
17 |
def income_statement():
|
18 |
-
find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="income_statement")
|
|
|
|
|
|
|
|
|
19 |
|
20 |
def balance_sheet():
|
21 |
-
find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="balance_sheet")
|
|
|
|
|
|
|
|
|
22 |
|
23 |
def cash_flow():
|
24 |
-
find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="cash_flow")
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
from section_extract import find_cover, find_underwriter, find_financial
|
3 |
+
from streamlit_pdf_viewer import pdf_viewer
|
4 |
|
5 |
def home():
|
6 |
st.title("Prospectus Lens")
|
|
|
10 |
st.caption("Made with ❤️ by @michael_sr24")
|
11 |
|
12 |
def cover():
|
13 |
+
temp_cover_page_path = find_cover(uploaded_file=st.session_state.get("uploaded_file"))
|
14 |
+
if temp_cover_page_path:
|
15 |
+
pdf_viewer(temp_cover_page_path)
|
16 |
+
else:
|
17 |
+
st.warning("Could not process the PDF file.")
|
18 |
|
19 |
def underwriter():
|
20 |
+
temp_page_path = find_underwriter(uploaded_file=st.session_state.get("uploaded_file"))
|
21 |
+
if temp_page_path:
|
22 |
+
pdf_viewer(temp_page_path)
|
23 |
+
else:
|
24 |
+
st.warning("Could not extract the underwriter section.")
|
25 |
|
26 |
def income_statement():
|
27 |
+
temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="income_statement")
|
28 |
+
if temp_section_path:
|
29 |
+
pdf_viewer(temp_section_path)
|
30 |
+
else:
|
31 |
+
st.warning("Could not extract the income statement section.")
|
32 |
|
33 |
def balance_sheet():
|
34 |
+
temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="balance_sheet")
|
35 |
+
if temp_section_path:
|
36 |
+
pdf_viewer(temp_section_path)
|
37 |
+
else:
|
38 |
+
st.warning("Could not extract the balance sheet section.")
|
39 |
|
40 |
def cash_flow():
|
41 |
+
temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="cash_flow")
|
42 |
+
if temp_section_path:
|
43 |
+
pdf_viewer(temp_section_path)
|
44 |
+
else:
|
45 |
+
st.warning("Could not extract the cash flow section.")
|
section_extract.py
CHANGED
@@ -1,19 +1,18 @@
|
|
1 |
import os
|
2 |
import re
|
3 |
from PyPDF2 import PdfReader, PdfWriter
|
4 |
-
from streamlit_pdf_viewer import pdf_viewer
|
5 |
import streamlit as st
|
6 |
from config import keywords_dict, stop_keywords, anti_keywords
|
7 |
|
8 |
def find_cover(uploaded_file):
|
9 |
"""
|
10 |
-
Extracts and
|
11 |
|
12 |
Parameters:
|
13 |
uploaded_file: The uploaded PDF file.
|
14 |
|
15 |
Returns:
|
16 |
-
|
17 |
"""
|
18 |
section_title = "cover"
|
19 |
st.title(section_title.title())
|
@@ -25,31 +24,30 @@ def find_cover(uploaded_file):
|
|
25 |
first_page = pdf_reader.pages[0]
|
26 |
|
27 |
pdf_writer = PdfWriter()
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
temp_first_page_path = os.path.join(f"temp_{section_title}.pdf")
|
32 |
-
with open(temp_first_page_path, "wb") as f:
|
33 |
pdf_writer.write(f)
|
34 |
|
35 |
-
#
|
36 |
-
|
37 |
except Exception as e:
|
38 |
st.error(f"An error occurred while processing the PDF: {e}")
|
|
|
39 |
else:
|
40 |
st.warning("Please upload a PDF on the Home page first.")
|
|
|
41 |
|
42 |
|
43 |
def find_underwriter(uploaded_file):
|
44 |
"""
|
45 |
-
Searches for pages in a PDF containing specific keywords for the 'underwriter' section and
|
46 |
-
starting from the last 2/3 of the PDF to improve performance.
|
47 |
|
48 |
Parameters:
|
49 |
uploaded_file: The uploaded PDF file.
|
50 |
|
51 |
Returns:
|
52 |
-
|
53 |
"""
|
54 |
section_name = "underwriter"
|
55 |
st.title(section_name.title())
|
@@ -57,7 +55,7 @@ def find_underwriter(uploaded_file):
|
|
57 |
keyword_sets = keywords_dict.get(section_name, [])
|
58 |
if not keyword_sets:
|
59 |
st.error(f"No keywords defined for section: {section_name}")
|
60 |
-
return
|
61 |
|
62 |
if uploaded_file:
|
63 |
try:
|
@@ -73,7 +71,7 @@ def find_underwriter(uploaded_file):
|
|
73 |
|
74 |
# Check if any keyword in the set is found on the page
|
75 |
if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
|
76 |
-
#
|
77 |
pdf_writer = PdfWriter()
|
78 |
pdf_writer.add_page(page)
|
79 |
|
@@ -81,16 +79,17 @@ def find_underwriter(uploaded_file):
|
|
81 |
with open(temp_page_path, "wb") as f:
|
82 |
pdf_writer.write(f)
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
return # Exit after finding the first match
|
87 |
|
88 |
st.warning(f"No pages contain the specified keywords for {section_name}.")
|
|
|
89 |
except Exception as e:
|
90 |
st.error(f"An error occurred while processing the PDF: {e}")
|
|
|
91 |
else:
|
92 |
st.warning("Please upload a PDF on the Home page first.")
|
93 |
-
|
94 |
|
95 |
def find_financial(uploaded_file, section_name):
|
96 |
"""
|
@@ -148,7 +147,7 @@ def find_financial(uploaded_file, section_name):
|
|
148 |
temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
|
149 |
with open(temp_section_path, "wb") as f:
|
150 |
pdf_writer.write(f)
|
151 |
-
|
152 |
else:
|
153 |
st.warning(f"No pages matched the criteria for {section_name}.")
|
154 |
|
@@ -178,7 +177,7 @@ def find_financial(uploaded_file, section_name):
|
|
178 |
temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
|
179 |
with open(temp_section_path, "wb") as f:
|
180 |
pdf_writer.write(f)
|
181 |
-
|
182 |
else:
|
183 |
st.warning(f"No pages matched the criteria for {section_name}.")
|
184 |
|
@@ -190,7 +189,7 @@ def find_financial(uploaded_file, section_name):
|
|
190 |
temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
|
191 |
with open(temp_section_path, "wb") as f:
|
192 |
pdf_writer.write(f)
|
193 |
-
|
194 |
else:
|
195 |
st.warning(f"No pages matched the criteria for {section_name}.")
|
196 |
|
@@ -204,4 +203,4 @@ def find_financial(uploaded_file, section_name):
|
|
204 |
else:
|
205 |
st.warning("Please upload a PDF on the Home page first.")
|
206 |
# Stop processing since no file is uploaded
|
207 |
-
return False
|
|
|
1 |
import os
|
2 |
import re
|
3 |
from PyPDF2 import PdfReader, PdfWriter
|
|
|
4 |
import streamlit as st
|
5 |
from config import keywords_dict, stop_keywords, anti_keywords
|
6 |
|
7 |
def find_cover(uploaded_file):
|
8 |
"""
|
9 |
+
Extracts and saves the first page of a PDF to a temporary file.
|
10 |
|
11 |
Parameters:
|
12 |
uploaded_file: The uploaded PDF file.
|
13 |
|
14 |
Returns:
|
15 |
+
str: Path to the temporary file containing the first page of the PDF.
|
16 |
"""
|
17 |
section_title = "cover"
|
18 |
st.title(section_title.title())
|
|
|
24 |
first_page = pdf_reader.pages[0]
|
25 |
|
26 |
pdf_writer = PdfWriter()
|
27 |
+
temp_cover_page_path = os.path.join(f"temp_{section_title}.pdf")
|
28 |
+
with open(temp_cover_page_path, "wb") as f:
|
29 |
+
pdf_writer.add_page(first_page)
|
|
|
|
|
30 |
pdf_writer.write(f)
|
31 |
|
32 |
+
# Return the path to the temporary file
|
33 |
+
return temp_cover_page_path
|
34 |
except Exception as e:
|
35 |
st.error(f"An error occurred while processing the PDF: {e}")
|
36 |
+
return None
|
37 |
else:
|
38 |
st.warning("Please upload a PDF on the Home page first.")
|
39 |
+
return None
|
40 |
|
41 |
|
42 |
def find_underwriter(uploaded_file):
|
43 |
"""
|
44 |
+
Searches for pages in a PDF containing specific keywords for the 'underwriter' section and returns the extracted file path.
|
|
|
45 |
|
46 |
Parameters:
|
47 |
uploaded_file: The uploaded PDF file.
|
48 |
|
49 |
Returns:
|
50 |
+
str: Path to the temporary file containing the extracted 'underwriter' page(s).
|
51 |
"""
|
52 |
section_name = "underwriter"
|
53 |
st.title(section_name.title())
|
|
|
55 |
keyword_sets = keywords_dict.get(section_name, [])
|
56 |
if not keyword_sets:
|
57 |
st.error(f"No keywords defined for section: {section_name}")
|
58 |
+
return None
|
59 |
|
60 |
if uploaded_file:
|
61 |
try:
|
|
|
71 |
|
72 |
# Check if any keyword in the set is found on the page
|
73 |
if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
|
74 |
+
# Save the matched page to a temporary file
|
75 |
pdf_writer = PdfWriter()
|
76 |
pdf_writer.add_page(page)
|
77 |
|
|
|
79 |
with open(temp_page_path, "wb") as f:
|
80 |
pdf_writer.write(f)
|
81 |
|
82 |
+
# Return the path of the extracted page
|
83 |
+
return temp_page_path
|
|
|
84 |
|
85 |
st.warning(f"No pages contain the specified keywords for {section_name}.")
|
86 |
+
return None
|
87 |
except Exception as e:
|
88 |
st.error(f"An error occurred while processing the PDF: {e}")
|
89 |
+
return None
|
90 |
else:
|
91 |
st.warning("Please upload a PDF on the Home page first.")
|
92 |
+
return None
|
93 |
|
94 |
def find_financial(uploaded_file, section_name):
|
95 |
"""
|
|
|
147 |
temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
|
148 |
with open(temp_section_path, "wb") as f:
|
149 |
pdf_writer.write(f)
|
150 |
+
return temp_section_path
|
151 |
else:
|
152 |
st.warning(f"No pages matched the criteria for {section_name}.")
|
153 |
|
|
|
177 |
temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
|
178 |
with open(temp_section_path, "wb") as f:
|
179 |
pdf_writer.write(f)
|
180 |
+
return temp_section_path
|
181 |
else:
|
182 |
st.warning(f"No pages matched the criteria for {section_name}.")
|
183 |
|
|
|
189 |
temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
|
190 |
with open(temp_section_path, "wb") as f:
|
191 |
pdf_writer.write(f)
|
192 |
+
return temp_section_path
|
193 |
else:
|
194 |
st.warning(f"No pages matched the criteria for {section_name}.")
|
195 |
|
|
|
203 |
else:
|
204 |
st.warning("Please upload a PDF on the Home page first.")
|
205 |
# Stop processing since no file is uploaded
|
206 |
+
return False
|