Update app.py, pages.py, and section_extract.py
Adding new features: session state for the pages, so you don't have to wait for processing if a page has already been processed.
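In Streamlit terms, the feature caches each extracted section's file path in st.session_state, so switching back to a page that has already been processed reuses the stored path instead of re-running the extraction. A minimal sketch of that pattern (the names below are illustrative, not the app's actual keys):

    import streamlit as st

    def get_or_extract(state_key, extract_fn, uploaded_file):
        # Run the expensive extraction only once; later visits reuse the cached path.
        if st.session_state.get(state_key) is None:
            st.session_state[state_key] = extract_fn(uploaded_file)
        return st.session_state[state_key]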
- __pycache__/pages.cpython-312.pyc +0 -0
- __pycache__/section_extract.cpython-312.pyc +0 -0
- app.py +3 -1
- pages.py +79 -24
- section_extract.py +210 -205
__pycache__/pages.cpython-312.pyc
CHANGED
Binary files a/__pycache__/pages.cpython-312.pyc and b/__pycache__/pages.cpython-312.pyc differ
__pycache__/section_extract.cpython-312.pyc
CHANGED
Binary files a/__pycache__/section_extract.cpython-312.pyc and b/__pycache__/section_extract.cpython-312.pyc differ
app.py
CHANGED
@@ -1,5 +1,7 @@
 import streamlit as st
-from pages import home, cover, underwriter, income_statement, balance_sheet, cash_flow
+from pages import home, cover, underwriter, income_statement, balance_sheet, cash_flow, uploader_sidebar
+
+uploader_sidebar()
 
 # Define pages
 pages = {
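The hunk ends at the `pages = {` definition, so the rest of app.py is not shown in this commit. For context, a typical way to wire page functions like these into Streamlit's navigation API looks roughly like the sketch below (illustrative only, not the actual contents of app.py; the group names are made up):

    import streamlit as st
    from pages import home, cover, underwriter, income_statement, balance_sheet, cash_flow, uploader_sidebar

    uploader_sidebar()

    # Hypothetical grouping -- the real app.py may organize its pages differently.
    pages = {
        "Overview": [st.Page(home, title="Home"),
                     st.Page(cover, title="Cover"),
                     st.Page(underwriter, title="Underwriter")],
        "Financials": [st.Page(income_statement, title="Income Statement"),
                       st.Page(balance_sheet, title="Balance Sheet"),
                       st.Page(cash_flow, title="Cash Flow")],
    }

    pg = st.navigation(pages)
    pg.run()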
pages.py
CHANGED
@@ -2,44 +2,99 @@ import streamlit as st
 from section_extract import find_cover, find_underwriter, find_financial
 from streamlit_pdf_viewer import pdf_viewer
 
+def uploader_sidebar():
+    uploaded_file = st.sidebar.file_uploader("Upload your Prospectus File", accept_multiple_files=False, type=["pdf"])
+    st.sidebar.caption("Made with ❤️ by @michael_sr24")
+
+    if uploaded_file:
+        # Initialize session state for processing flags and paths
+        if "uploaded_file" not in st.session_state:
+            st.session_state["uploaded_file"] = uploaded_file
+            st.session_state["cover_path"] = None
+            st.session_state["underwriter_path"] = None
+            st.session_state["income_statement_path"] = None
+            st.session_state["balance_sheet_path"] = None
+            st.session_state["cash_flow_path"] = None
+            st.session_state["processing"] = {
+                "cover_path": False,
+                "underwriter_path": False,
+                "income_statement_path": False,
+                "balance_sheet_path": False,
+                "cash_flow_path": False,
+            }
+            st.session_state["all_processed"] = False
+        else:
+            st.session_state["uploaded_file"] = uploaded_file
+        process_sections()
+
+def process_sections():
+    """Continuously process all sections in the background."""
+    if "processing" in st.session_state and not st.session_state.get("all_processed", False):
+        for key, processed in st.session_state["processing"].items():
+            if not processed:
+                if key == "cover_path":
+                    st.session_state[key] = find_cover(st.session_state["uploaded_file"])
+                elif key == "underwriter_path":
+                    st.session_state[key] = find_underwriter(st.session_state["uploaded_file"])
+                elif key == "income_statement_path":
+                    st.session_state[key] = find_financial(st.session_state["uploaded_file"], "income_statement")
+                elif key == "balance_sheet_path":
+                    st.session_state[key] = find_financial(st.session_state["uploaded_file"], "balance_sheet")
+                elif key == "cash_flow_path":
+                    st.session_state[key] = find_financial(st.session_state["uploaded_file"], "cash_flow")
+
+                st.session_state["processing"][key] = True  # Mark as processed
+                break
+
+        # Check if all sections are processed
+        st.session_state["all_processed"] = all(st.session_state["processing"].values())
+
+def show_section(section_key):
+    """Display the section if available, otherwise inform the user."""
+    temp_path = st.session_state.get(section_key)
+    if temp_path:
+        pdf_viewer(temp_path)
+    else:
+        if not st.session_state["processing"].get(section_key, False):
+            st.info(f"{section_key.replace('_', ' ').capitalize()} is still being processed.")
+        else:
+            st.warning(f"Could not process {section_key.replace('_', ' ')}.")
+
 def home():
     st.title("Prospectus Lens")
-    st.write("Welcome to the Prospectus Lens! Upload the PDF of the prospectus
-    uploaded_file = st.file_uploader("Upload your Prospectus File", accept_multiple_files=False, type=["pdf"])
-    st.session_state["uploaded_file"] = uploaded_file
-    st.caption("Made with ❤️ by @michael_sr24")
+    st.write("Welcome to the Prospectus Lens! Upload the PDF of the prospectus on the left sidebar!")
 
 def cover():
+    st.title("Cover")
+    if "uploaded_file" in st.session_state:
+        show_section("cover_path")
     else:
+        st.warning("Please upload a file first!")
 
 def underwriter():
+    st.title("Underwriter")
+    if "uploaded_file" in st.session_state:
+        show_section("underwriter_path")
     else:
+        st.warning("Please upload a file first!")
 
 def income_statement():
+    st.title("Income Statement")
+    if "uploaded_file" in st.session_state:
+        show_section("income_statement_path")
     else:
+        st.warning("Please upload a file first!")
 
 def balance_sheet():
+    st.title("Balance Sheet")
+    if "uploaded_file" in st.session_state:
+        show_section("balance_sheet_path")
     else:
+        st.warning("Please upload a file first!")
 
 def cash_flow():
+    st.title("Cash Flow")
+    if "uploaded_file" in st.session_state:
+        show_section("cash_flow_path")
     else:
+        st.warning("Please upload a file first!")
section_extract.py
CHANGED
@@ -1,206 +1,211 @@
|
|
1 |
-
import os
|
2 |
-
import re
|
3 |
-
from PyPDF2 import PdfReader, PdfWriter
|
4 |
-
import streamlit as st
|
5 |
-
from config import keywords_dict, stop_keywords, anti_keywords
|
6 |
-
|
7 |
-
def find_cover(uploaded_file):
|
8 |
-
"""
|
9 |
-
Extracts and saves the first page of a PDF to a temporary file.
|
10 |
-
|
11 |
-
Parameters:
|
12 |
-
uploaded_file: The uploaded PDF file.
|
13 |
-
|
14 |
-
Returns:
|
15 |
-
str: Path to the temporary file containing the first page of the PDF.
|
16 |
-
"""
|
17 |
-
section_title = "cover"
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
text
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
st.
|
86 |
-
return None
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
#
|
202 |
-
return
|
203 |
-
|
204 |
-
|
205 |
-
|
|
|
|
|
|
|
|
|
|
|
206 |
return False
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
from PyPDF2 import PdfReader, PdfWriter
|
4 |
+
import streamlit as st
|
5 |
+
from config import keywords_dict, stop_keywords, anti_keywords
|
6 |
+
|
7 |
+
def find_cover(uploaded_file):
|
8 |
+
"""
|
9 |
+
Extracts and saves the first page of a PDF to a temporary file.
|
10 |
+
|
11 |
+
Parameters:
|
12 |
+
uploaded_file: The uploaded PDF file.
|
13 |
+
|
14 |
+
Returns:
|
15 |
+
str: Path to the temporary file containing the first page of the PDF.
|
16 |
+
"""
|
17 |
+
section_title = "cover"
|
18 |
+
if uploaded_file:
|
19 |
+
try:
|
20 |
+
# Read the PDF and extract the first page
|
21 |
+
pdf_reader = PdfReader(uploaded_file)
|
22 |
+
first_page = pdf_reader.pages[0]
|
23 |
+
|
24 |
+
pdf_writer = PdfWriter()
|
25 |
+
temp_cover_page_path = os.path.join(f"temp_{section_title}_1.pdf")
|
26 |
+
with open(temp_cover_page_path, "wb") as f:
|
27 |
+
pdf_writer.add_page(first_page)
|
28 |
+
pdf_writer.write(f)
|
29 |
+
|
30 |
+
# Return the path to the temporary file
|
31 |
+
return temp_cover_page_path
|
32 |
+
except Exception as e:
|
33 |
+
st.error(f"An error occurred while processing the PDF: {e}")
|
34 |
+
return None
|
35 |
+
else:
|
36 |
+
st.warning("Please upload a PDF on the Home page first.")
|
37 |
+
return None
|
38 |
+
|
39 |
+
|
40 |
+
def find_underwriter(uploaded_file):
|
41 |
+
"""
|
42 |
+
Searches for pages in a PDF containing specific keywords for the 'underwriter' section and returns the extracted file path.
|
43 |
+
|
44 |
+
Parameters:
|
45 |
+
uploaded_file: The uploaded PDF file.
|
46 |
+
|
47 |
+
Returns:
|
48 |
+
str: Path to the temporary file containing the extracted 'underwriter' page(s).
|
49 |
+
"""
|
50 |
+
section_name = "underwriter"
|
51 |
+
|
52 |
+
keyword_sets = keywords_dict.get(section_name, [])
|
53 |
+
if not keyword_sets:
|
54 |
+
st.error(f"No keywords defined for section: {section_name}")
|
55 |
+
return None
|
56 |
+
|
57 |
+
if uploaded_file:
|
58 |
+
try:
|
59 |
+
pdf_reader = PdfReader(uploaded_file)
|
60 |
+
total_pages = len(pdf_reader.pages)
|
61 |
+
start_page = total_pages // 3 # Skip the first 1/3 of the PDF
|
62 |
+
pages = pdf_reader.pages[start_page:]
|
63 |
+
|
64 |
+
# Loop through the keyword sets
|
65 |
+
for keyword_set in keyword_sets:
|
66 |
+
for page_num, page in enumerate(pages, start=start_page + 1):
|
67 |
+
text = page.extract_text()
|
68 |
+
|
69 |
+
# Check if any keyword in the set is found on the page
|
70 |
+
if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
|
71 |
+
# Save the matched page to a temporary file
|
72 |
+
pdf_writer = PdfWriter()
|
73 |
+
pdf_writer.add_page(page)
|
74 |
+
|
75 |
+
temp_page_path = os.path.join(f"temp_{section_name}_{page_num}.pdf")
|
76 |
+
with open(temp_page_path, "wb") as f:
|
77 |
+
pdf_writer.write(f)
|
78 |
+
|
79 |
+
# Return the path of the extracted page
|
80 |
+
return temp_page_path
|
81 |
+
|
82 |
+
st.warning(f"No pages contain the specified keywords for {section_name}.")
|
83 |
+
return None
|
84 |
+
except Exception as e:
|
85 |
+
st.error(f"An error occurred while processing the PDF: {e}")
|
86 |
+
return None
|
87 |
+
else:
|
88 |
+
st.warning("Please upload a PDF on the Home page first.")
|
89 |
+
return None
|
90 |
+
|
91 |
+
def find_financial(uploaded_file, section_name):
|
92 |
+
"""
|
93 |
+
Extracts and displays sections of a PDF based on keyword matches.
|
94 |
+
|
95 |
+
Parameters:
|
96 |
+
uploaded_file: The uploaded PDF file (Streamlit file uploader object).
|
97 |
+
section_name: The name of the section to search for (e.g., "income_statement").
|
98 |
+
|
99 |
+
Returns:
|
100 |
+
bool: True if processing completed without interruptions; False if stopped or an error occurred.
|
101 |
+
"""
|
102 |
+
if uploaded_file:
|
103 |
+
try:
|
104 |
+
pdf_reader = PdfReader(uploaded_file)
|
105 |
+
total_pages = len(pdf_reader.pages)
|
106 |
+
|
107 |
+
# Step 1: Start from the second half of the PDF
|
108 |
+
start_page = total_pages // 2
|
109 |
+
pages = pdf_reader.pages[start_page:]
|
110 |
+
|
111 |
+
section_keywords = keywords_dict.get(section_name, [])
|
112 |
+
section_stop_keywords = stop_keywords.get(section_name, [])
|
113 |
+
section_anti_keywords = anti_keywords.get(section_name, [])
|
114 |
+
|
115 |
+
pdf_writer = PdfWriter() # Writer for the extracted pages
|
116 |
+
extraction_started = False # Flag to check if extraction has started
|
117 |
+
extraction_start_page = None # Track the starting page number
|
118 |
+
pages_extracted = 0 # Counter for extracted pages
|
119 |
+
|
120 |
+
for page_num, page in enumerate(pages, start=start_page + 1):
|
121 |
+
text = page.extract_text()
|
122 |
+
|
123 |
+
# Step 2: Find the keywords within the keywords_dict
|
124 |
+
if not extraction_started:
|
125 |
+
for keyword_set in section_keywords:
|
126 |
+
if all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
|
127 |
+
pdf_writer.add_page(page)
|
128 |
+
pages_extracted += 1
|
129 |
+
extraction_start_page = page_num # Set the starting page number
|
130 |
+
|
131 |
+
# Check for stop keywords on the same page
|
132 |
+
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
|
133 |
+
for stop_set in section_stop_keywords):
|
134 |
+
|
135 |
+
# Check for anti-keywords before stopping
|
136 |
+
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
|
137 |
+
for anti_set in section_anti_keywords):
|
138 |
+
pdf_writer.pages.pop() # Remove the last added page
|
139 |
+
pages_extracted -= 1
|
140 |
+
|
141 |
+
# Save and display the extracted pages (if any)
|
142 |
+
if len(pdf_writer.pages) > 0:
|
143 |
+
temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
|
144 |
+
with open(temp_section_path, "wb") as f:
|
145 |
+
pdf_writer.write(f)
|
146 |
+
return temp_section_path
|
147 |
+
else:
|
148 |
+
st.warning(f"No pages matched the criteria for {section_name}.")
|
149 |
+
|
150 |
+
# Stop extraction immediately and signal to stop all processing
|
151 |
+
return False
|
152 |
+
else:
|
153 |
+
# Continue extraction
|
154 |
+
extraction_started = True
|
155 |
+
break
|
156 |
+
elif extraction_started:
|
157 |
+
# Check if we've reached the 3-page limit
|
158 |
+
if pages_extracted >= 3:
|
159 |
+
if len(pdf_writer.pages) > 0:
|
160 |
+
temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num-1}.pdf")
|
161 |
+
with open(temp_section_path, "wb") as f:
|
162 |
+
pdf_writer.write(f)
|
163 |
+
return temp_section_path
|
164 |
+
return False
|
165 |
+
|
166 |
+
# Step 3: Add the page to the output
|
167 |
+
pdf_writer.add_page(page)
|
168 |
+
pages_extracted += 1
|
169 |
+
|
170 |
+
# Step 4: Check for stop keywords
|
171 |
+
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
|
172 |
+
for stop_set in section_stop_keywords):
|
173 |
+
|
174 |
+
# Step 5: After stopping, check for anti-keywords
|
175 |
+
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
|
176 |
+
for anti_set in section_anti_keywords):
|
177 |
+
pdf_writer.pages.pop() # Remove the last added page
|
178 |
+
pages_extracted -= 1
|
179 |
+
|
180 |
+
# Save and display the extracted pages (if any)
|
181 |
+
if len(pdf_writer.pages) > 0:
|
182 |
+
temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
|
183 |
+
with open(temp_section_path, "wb") as f:
|
184 |
+
pdf_writer.write(f)
|
185 |
+
return temp_section_path
|
186 |
+
else:
|
187 |
+
st.warning(f"No pages matched the criteria for {section_name}.")
|
188 |
+
|
189 |
+
# Stop extraction and signal to stop all processing
|
190 |
+
return False
|
191 |
+
|
192 |
+
# If extraction finished without hitting stop keywords, save and display the pages
|
193 |
+
if len(pdf_writer.pages) > 0:
|
194 |
+
temp_section_path = os.path.join(f"temp_{section_name}_{extraction_start_page}-{page_num}.pdf")
|
195 |
+
with open(temp_section_path, "wb") as f:
|
196 |
+
pdf_writer.write(f)
|
197 |
+
return temp_section_path
|
198 |
+
else:
|
199 |
+
st.warning(f"No pages matched the criteria for {section_name}.")
|
200 |
+
|
201 |
+
# Indicate that processing can continue
|
202 |
+
return True
|
203 |
+
|
204 |
+
except Exception as e:
|
205 |
+
st.error(f"An error occurred while processing the PDF: {e}")
|
206 |
+
# Stop processing due to an error
|
207 |
+
return False
|
208 |
+
else:
|
209 |
+
st.warning("Please upload a PDF on the Home page first.")
|
210 |
+
# Stop processing since no file is uploaded
|
211 |
return False
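find_underwriter and find_financial assume config.py exposes three dictionaries keyed by section name, each holding a list of keyword sets (find_underwriter matches a page if any keyword in a set is found; find_financial requires all keywords in a set, and the strings are used as regex patterns via re.search). config.py is not part of this commit, so the following is only an illustrative sketch of the expected shape, with made-up keywords:

    # Illustrative shapes only -- the actual keyword lists live in config.py and are not shown in this diff.
    keywords_dict = {
        "underwriter": [["underwriter", "lead manager"]],        # any() match per set
        "income_statement": [["income statement", "revenue"]],   # all() match per set
        "balance_sheet": [["balance sheet", "total assets"]],
        "cash_flow": [["cash flow", "operating activities"]],
    }

    # A fully matched stop set ends extraction for that section.
    stop_keywords = {
        "income_statement": [["balance sheet"]],
        "balance_sheet": [["cash flow"]],
        "cash_flow": [["notes to the financial statements"]],
    }

    # If an anti set also matches, the page that triggered the stop is dropped.
    anti_keywords = {
        "income_statement": [["table of contents"]],
        "balance_sheet": [["table of contents"]],
        "cash_flow": [["table of contents"]],
    }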
|