Spaces:

msr2903
/

ProspectusLens

Sleeping

App Files Files Community

msr2903 committed on Nov 28, 2024

Commit

e9c51d1

1 Parent(s): 162a16c

Update pages.py and section_extract.py so that the extracted PDF is previewed from pages.py

Browse files

Files changed (5) hide show

__pycache__/config.cpython-312.pyc +0 -0
__pycache__/pages.cpython-312.pyc +0 -0
__pycache__/section_extract.cpython-312.pyc +0 -0
pages.py +26 -5
section_extract.py +22 -23

__pycache__/config.cpython-312.pyc ADDED Viewed

Binary file (1.61 kB). View file

__pycache__/pages.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/pages.cpython-312.pyc and b/__pycache__/pages.cpython-312.pyc differ

__pycache__/section_extract.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/section_extract.cpython-312.pyc and b/__pycache__/section_extract.cpython-312.pyc differ

pages.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import streamlit as st
 from section_extract import find_cover, find_underwriter, find_financial
 def home():
     st.title("Prospectus Lens")
@@ -9,16 +10,36 @@ def home():
     st.caption("Made with ❤️ by @michael_sr24")
 def cover():
-    find_cover(uploaded_file=st.session_state.get("uploaded_file"))
 def underwriter():
-    find_underwriter(uploaded_file=st.session_state.get("uploaded_file"))
 def income_statement():
-    find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="income_statement")
 def balance_sheet():
-    find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="balance_sheet")
 def cash_flow():
-    find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="cash_flow")

 import streamlit as st
 from section_extract import find_cover, find_underwriter, find_financial
+from streamlit_pdf_viewer import pdf_viewer
 def home():
     st.title("Prospectus Lens")
     st.caption("Made with ❤️ by @michael_sr24")
 def cover():
+    temp_cover_page_path = find_cover(uploaded_file=st.session_state.get("uploaded_file"))
+    if temp_cover_page_path:
+        pdf_viewer(temp_cover_page_path)
+    else:
+        st.warning("Could not process the PDF file.")
 def underwriter():
+    temp_page_path = find_underwriter(uploaded_file=st.session_state.get("uploaded_file"))
+    if temp_page_path:
+        pdf_viewer(temp_page_path)
+    else:
+        st.warning("Could not extract the underwriter section.")
 def income_statement():
+    temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="income_statement")
+    if temp_section_path:
+        pdf_viewer(temp_section_path)
+    else:
+        st.warning("Could not extract the income statement section.")
 def balance_sheet():
+    temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="balance_sheet")
+    if temp_section_path:
+        pdf_viewer(temp_section_path)
+    else:
+        st.warning("Could not extract the balance sheet section.")
 def cash_flow():
+    temp_section_path = find_financial(uploaded_file=st.session_state.get("uploaded_file"), section_name="cash_flow")
+    if temp_section_path:
+        pdf_viewer(temp_section_path)
+    else:
+        st.warning("Could not extract the cash flow section.")

section_extract.py CHANGED Viewed

@@ -1,19 +1,18 @@
 import os
 import re
 from PyPDF2 import PdfReader, PdfWriter
-from streamlit_pdf_viewer import pdf_viewer
 import streamlit as st
 from config import keywords_dict, stop_keywords, anti_keywords
 def find_cover(uploaded_file):
     """
-    Extracts and displays the first page of a PDF.
     Parameters:
         uploaded_file: The uploaded PDF file.
     Returns:
-        None
     """
     section_title = "cover"
     st.title(section_title.title())
@@ -25,31 +24,30 @@ def find_cover(uploaded_file):
             first_page = pdf_reader.pages[0]
             pdf_writer = PdfWriter()
-            pdf_writer.add_page(first_page)
-            # Save the first page to a temporary file
-            temp_first_page_path = os.path.join(f"temp_{section_title}.pdf")
-            with open(temp_first_page_path, "wb") as f:
                 pdf_writer.write(f)
-            # Display the first page using pdf_viewer
-            pdf_viewer(temp_first_page_path)
         except Exception as e:
             st.error(f"An error occurred while processing the PDF: {e}")
     else:
         st.warning("Please upload a PDF on the Home page first.")
 def find_underwriter(uploaded_file):
     """
-    Searches for pages in a PDF containing specific keywords for the 'underwriter' section and displays them,
-    starting from the last 2/3 of the PDF to improve performance.
     Parameters:
         uploaded_file: The uploaded PDF file.
     Returns:
-        None
     """
     section_name = "underwriter"
     st.title(section_name.title())
@@ -57,7 +55,7 @@ def find_underwriter(uploaded_file):
     keyword_sets = keywords_dict.get(section_name, [])
     if not keyword_sets:
         st.error(f"No keywords defined for section: {section_name}")
-        return
     if uploaded_file:
         try:
@@ -73,7 +71,7 @@ def find_underwriter(uploaded_file):
                     # Check if any keyword in the set is found on the page
                     if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
-                        # Display the matched page
                         pdf_writer = PdfWriter()
                         pdf_writer.add_page(page)
@@ -81,16 +79,17 @@ def find_underwriter(uploaded_file):
                         with open(temp_page_path, "wb") as f:
                             pdf_writer.write(f)
-                        st.write(f"Keyword found on page {page_num}")
-                        pdf_viewer(temp_page_path)
-                        return  # Exit after finding the first match
             st.warning(f"No pages contain the specified keywords for {section_name}.")
         except Exception as e:
             st.error(f"An error occurred while processing the PDF: {e}")
     else:
         st.warning("Please upload a PDF on the Home page first.")
 def find_financial(uploaded_file, section_name):
     """
@@ -148,7 +147,7 @@ def find_financial(uploaded_file, section_name):
                                     temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
                                     with open(temp_section_path, "wb") as f:
                                         pdf_writer.write(f)
-                                    pdf_viewer(temp_section_path)
                                 else:
                                     st.warning(f"No pages matched the criteria for {section_name}.")
@@ -178,7 +177,7 @@ def find_financial(uploaded_file, section_name):
                             temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
                             with open(temp_section_path, "wb") as f:
                                 pdf_writer.write(f)
-                            pdf_viewer(temp_section_path)
                         else:
                             st.warning(f"No pages matched the criteria for {section_name}.")
@@ -190,7 +189,7 @@ def find_financial(uploaded_file, section_name):
                 temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
                 with open(temp_section_path, "wb") as f:
                     pdf_writer.write(f)
-                pdf_viewer(temp_section_path)
             else:
                 st.warning(f"No pages matched the criteria for {section_name}.")
@@ -204,4 +203,4 @@ def find_financial(uploaded_file, section_name):
     else:
         st.warning("Please upload a PDF on the Home page first.")
         # Stop processing since no file is uploaded
-        return False

 import os
 import re
 from PyPDF2 import PdfReader, PdfWriter
 import streamlit as st
 from config import keywords_dict, stop_keywords, anti_keywords
 def find_cover(uploaded_file):
     """
+    Extracts and saves the first page of a PDF to a temporary file.
     Parameters:
         uploaded_file: The uploaded PDF file.
     Returns:
+        str: Path to the temporary file containing the first page of the PDF.
     """
     section_title = "cover"
     st.title(section_title.title())
             first_page = pdf_reader.pages[0]
             pdf_writer = PdfWriter()
+            temp_cover_page_path = os.path.join(f"temp_{section_title}.pdf")
+            with open(temp_cover_page_path, "wb") as f:
+                pdf_writer.add_page(first_page)
                 pdf_writer.write(f)
+            # Return the path to the temporary file
+            return temp_cover_page_path
         except Exception as e:
             st.error(f"An error occurred while processing the PDF: {e}")
+            return None
     else:
         st.warning("Please upload a PDF on the Home page first.")
+        return None
 def find_underwriter(uploaded_file):
     """
+    Searches for pages in a PDF containing specific keywords for the 'underwriter' section and returns the extracted file path.
     Parameters:
         uploaded_file: The uploaded PDF file.
     Returns:
+        str: Path to the temporary file containing the extracted 'underwriter' page(s).
     """
     section_name = "underwriter"
     st.title(section_name.title())
     keyword_sets = keywords_dict.get(section_name, [])
     if not keyword_sets:
         st.error(f"No keywords defined for section: {section_name}")
+        return None
     if uploaded_file:
         try:
                     # Check if any keyword in the set is found on the page
                     if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
+                        # Save the matched page to a temporary file
                         pdf_writer = PdfWriter()
                         pdf_writer.add_page(page)
                         with open(temp_page_path, "wb") as f:
                             pdf_writer.write(f)
+                        # Return the path of the extracted page
+                        return temp_page_path
             st.warning(f"No pages contain the specified keywords for {section_name}.")
+            return None
         except Exception as e:
             st.error(f"An error occurred while processing the PDF: {e}")
+            return None
     else:
         st.warning("Please upload a PDF on the Home page first.")
+        return None
 def find_financial(uploaded_file, section_name):
     """
                                     temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
                                     with open(temp_section_path, "wb") as f:
                                         pdf_writer.write(f)
+                                    return temp_section_path
                                 else:
                                     st.warning(f"No pages matched the criteria for {section_name}.")
                             temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
                             with open(temp_section_path, "wb") as f:
                                 pdf_writer.write(f)
+                            return temp_section_path
                         else:
                             st.warning(f"No pages matched the criteria for {section_name}.")
                 temp_section_path = os.path.join(f"temp_{section_name}_section.pdf")
                 with open(temp_section_path, "wb") as f:
                     pdf_writer.write(f)
+                return temp_section_path
             else:
                 st.warning(f"No pages matched the criteria for {section_name}.")
     else:
         st.warning("Please upload a PDF on the Home page first.")
         # Stop processing since no file is uploaded
+        return False