msr2903 committed on
Commit
e9c51d1
1 Parent(s): 162a16c

Update pages.py and section_extract.py so that the PDF preview is rendered in pages.py

Browse files
__pycache__/config.cpython-312.pyc ADDED
Binary file (1.61 kB). View file
 
__pycache__/pages.cpython-312.pyc CHANGED
Binary files a/__pycache__/pages.cpython-312.pyc and b/__pycache__/pages.cpython-312.pyc differ
 
__pycache__/section_extract.cpython-312.pyc CHANGED
Binary files a/__pycache__/section_extract.cpython-312.pyc and b/__pycache__/section_extract.cpython-312.pyc differ
 
pages.py CHANGED
@@ -1,5 +1,6 @@
1
  import streamlit as st
2
  from section_extract import find_cover, find_underwriter, find_financial
 
3
 
4
  def home():
5
  st.title("Prospectus Lens")
@@ -9,16 +10,36 @@ def home():
9
  st.caption("Made with ❤️ by @michael_sr24")
10
 
11
  def cover():
12
- find_cover(uploaded_file=st.session_state.get("uploaded_file"))
 
 
 
 
13
 
14
  def underwriter():
15
- find_underwriter(uploaded_file=st.session_state.get("uploaded_file"))
 
 
 
 
16
 
17
  def income_statement():
18
- find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="income_statement")
 
 
 
 
19
 
20
  def balance_sheet():
21
- find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="balance_sheet")
 
 
 
 
22
 
23
  def cash_flow():
24
- find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="cash_flow")
 
 
 
 
 
1
  import streamlit as st
2
  from section_extract import find_cover, find_underwriter, find_financial
3
+ from streamlit_pdf_viewer import pdf_viewer
4
 
5
  def home():
6
  st.title("Prospectus Lens")
 
10
  st.caption("Made with ❤️ by @michael_sr24")
11
 
12
  def cover():
13
+ temp_cover_page_path = find_cover(uploaded_file=st.session_state.get("uploaded_file"))
14
+ if temp_cover_page_path:
15
+ pdf_viewer(temp_cover_page_path)
16
+ else:
17
+ st.warning("Could not process the PDF file.")
18
 
19
  def underwriter():
20
+ temp_page_path = find_underwriter(uploaded_file=st.session_state.get("uploaded_file"))
21
+ if temp_page_path:
22
+ pdf_viewer(temp_page_path)
23
+ else:
24
+ st.warning("Could not extract the underwriter section.")
25
 
26
  def income_statement():
27
+ temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="income_statement")
28
+ if temp_section_path:
29
+ pdf_viewer(temp_section_path)
30
+ else:
31
+ st.warning("Could not extract the income statement section.")
32
 
33
  def balance_sheet():
34
+ temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="balance_sheet")
35
+ if temp_section_path:
36
+ pdf_viewer(temp_section_path)
37
+ else:
38
+ st.warning("Could not extract the balance sheet section.")
39
 
40
  def cash_flow():
41
+ temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="cash_flow")
42
+ if temp_section_path:
43
+ pdf_viewer(temp_section_path)
44
+ else:
45
+ st.warning("Could not extract the cash flow section.")
section_extract.py CHANGED
@@ -1,19 +1,18 @@
1
  import os
2
  import re
3
  from PyPDF2 import PdfReader, PdfWriter
4
- from streamlit_pdf_viewer import pdf_viewer
5
  import streamlit as st
6
  from config import keywords_dict, stop_keywords, anti_keywords
7
 
8
  def find_cover(uploaded_file):
9
  """
10
- Extracts and displays the first page of a PDF.
11
 
12
  Parameters:
13
  uploaded_file: The uploaded PDF file.
14
 
15
  Returns:
16
- None
17
  """
18
  section_title = "cover"
19
  st.title(section_title.title())
@@ -25,31 +24,30 @@ def find_cover(uploaded_file):
25
  first_page = pdf_reader.pages[0]
26
 
27
  pdf_writer = PdfWriter()
28
- pdf_writer.add_page(first_page)
29
-
30
- # Save the first page to a temporary file
31
- temp_first_page_path = os.path.join(f"temp_{section_title}.pdf")
32
- with open(temp_first_page_path, "wb") as f:
33
  pdf_writer.write(f)
34
 
35
- # Display the first page using pdf_viewer
36
- pdf_viewer(temp_first_page_path)
37
  except Exception as e:
38
  st.error(f"An error occurred while processing the PDF: {e}")
 
39
  else:
40
  st.warning("Please upload a PDF on the Home page first.")
 
41
 
42
 
43
  def find_underwriter(uploaded_file):
44
  """
45
- Searches for pages in a PDF containing specific keywords for the 'underwriter' section and displays them,
46
- starting from the last 2/3 of the PDF to improve performance.
47
 
48
  Parameters:
49
  uploaded_file: The uploaded PDF file.
50
 
51
  Returns:
52
- None
53
  """
54
  section_name = "underwriter"
55
  st.title(section_name.title())
@@ -57,7 +55,7 @@ def find_underwriter(uploaded_file):
57
  keyword_sets = keywords_dict.get(section_name, [])
58
  if not keyword_sets:
59
  st.error(f"No keywords defined for section: {section_name}")
60
- return
61
 
62
  if uploaded_file:
63
  try:
@@ -73,7 +71,7 @@ def find_underwriter(uploaded_file):
73
 
74
  # Check if any keyword in the set is found on the page
75
  if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
76
- # Display the matched page
77
  pdf_writer = PdfWriter()
78
  pdf_writer.add_page(page)
79
 
@@ -81,16 +79,17 @@ def find_underwriter(uploaded_file):
81
  with open(temp_page_path, "wb") as f:
82
  pdf_writer.write(f)
83
 
84
- st.write(f"Keyword found on page {page_num}")
85
- pdf_viewer(temp_page_path)
86
- return # Exit after finding the first match
87
 
88
  st.warning(f"No pages contain the specified keywords for {section_name}.")
 
89
  except Exception as e:
90
  st.error(f"An error occurred while processing the PDF: {e}")
 
91
  else:
92
  st.warning("Please upload a PDF on the Home page first.")
93
-
94
 
95
  def find_financial(uploaded_file, section_name):
96
  """
@@ -148,7 +147,7 @@ def find_financial(uploaded_file, section_name):
148
  temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
149
  with open(temp_section_path, "wb") as f:
150
  pdf_writer.write(f)
151
- pdf_viewer(temp_section_path)
152
  else:
153
  st.warning(f"No pages matched the criteria for {section_name}.")
154
 
@@ -178,7 +177,7 @@ def find_financial(uploaded_file, section_name):
178
  temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
179
  with open(temp_section_path, "wb") as f:
180
  pdf_writer.write(f)
181
- pdf_viewer(temp_section_path)
182
  else:
183
  st.warning(f"No pages matched the criteria for {section_name}.")
184
 
@@ -190,7 +189,7 @@ def find_financial(uploaded_file, section_name):
190
  temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
191
  with open(temp_section_path, "wb") as f:
192
  pdf_writer.write(f)
193
- pdf_viewer(temp_section_path)
194
  else:
195
  st.warning(f"No pages matched the criteria for {section_name}.")
196
 
@@ -204,4 +203,4 @@ def find_financial(uploaded_file, section_name):
204
  else:
205
  st.warning("Please upload a PDF on the Home page first.")
206
  # Stop processing since no file is uploaded
207
- return False
 
1
  import os
2
  import re
3
  from PyPDF2 import PdfReader, PdfWriter
 
4
  import streamlit as st
5
  from config import keywords_dict, stop_keywords, anti_keywords
6
 
7
  def find_cover(uploaded_file):
8
  """
9
+ Extracts and saves the first page of a PDF to a temporary file.
10
 
11
  Parameters:
12
  uploaded_file: The uploaded PDF file.
13
 
14
  Returns:
15
+ str: Path to the temporary file containing the first page of the PDF.
16
  """
17
  section_title = "cover"
18
  st.title(section_title.title())
 
24
  first_page = pdf_reader.pages[0]
25
 
26
  pdf_writer = PdfWriter()
27
+ temp_cover_page_path = os.path.join(f"temp_{section_title}.pdf")
28
+ with open(temp_cover_page_path, "wb") as f:
29
+ pdf_writer.add_page(first_page)
 
 
30
  pdf_writer.write(f)
31
 
32
+ # Return the path to the temporary file
33
+ return temp_cover_page_path
34
  except Exception as e:
35
  st.error(f"An error occurred while processing the PDF: {e}")
36
+ return None
37
  else:
38
  st.warning("Please upload a PDF on the Home page first.")
39
+ return None
40
 
41
 
42
  def find_underwriter(uploaded_file):
43
  """
44
+ Searches for pages in a PDF containing specific keywords for the 'underwriter' section and returns the extracted file path.
 
45
 
46
  Parameters:
47
  uploaded_file: The uploaded PDF file.
48
 
49
  Returns:
50
+ str: Path to the temporary file containing the extracted 'underwriter' page(s).
51
  """
52
  section_name = "underwriter"
53
  st.title(section_name.title())
 
55
  keyword_sets = keywords_dict.get(section_name, [])
56
  if not keyword_sets:
57
  st.error(f"No keywords defined for section: {section_name}")
58
+ return None
59
 
60
  if uploaded_file:
61
  try:
 
71
 
72
  # Check if any keyword in the set is found on the page
73
  if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
74
+ # Save the matched page to a temporary file
75
  pdf_writer = PdfWriter()
76
  pdf_writer.add_page(page)
77
 
 
79
  with open(temp_page_path, "wb") as f:
80
  pdf_writer.write(f)
81
 
82
+ # Return the path of the extracted page
83
+ return temp_page_path
 
84
 
85
  st.warning(f"No pages contain the specified keywords for {section_name}.")
86
+ return None
87
  except Exception as e:
88
  st.error(f"An error occurred while processing the PDF: {e}")
89
+ return None
90
  else:
91
  st.warning("Please upload a PDF on the Home page first.")
92
+ return None
93
 
94
  def find_financial(uploaded_file, section_name):
95
  """
 
147
  temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
148
  with open(temp_section_path, "wb") as f:
149
  pdf_writer.write(f)
150
+ return temp_section_path
151
  else:
152
  st.warning(f"No pages matched the criteria for {section_name}.")
153
 
 
177
  temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
178
  with open(temp_section_path, "wb") as f:
179
  pdf_writer.write(f)
180
+ return temp_section_path
181
  else:
182
  st.warning(f"No pages matched the criteria for {section_name}.")
183
 
 
189
  temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
190
  with open(temp_section_path, "wb") as f:
191
  pdf_writer.write(f)
192
+ return temp_section_path
193
  else:
194
  st.warning(f"No pages matched the criteria for {section_name}.")
195
 
 
203
  else:
204
  st.warning("Please upload a PDF on the Home page first.")
205
  # Stop processing since no file is uploaded
206
+ return False