Spaces:
Running
Running
File size: 9,706 Bytes
d58052d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
import os
import re
from PyPDF2 import PdfReader, PdfWriter
from streamlit_pdf_viewer import pdf_viewer
import streamlit as st
def find_cover(uploaded_file):
    """
    Extract the first page of an uploaded PDF and display it as the cover.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file-uploader object),
            or a falsy value when nothing has been uploaded yet.

    Returns:
        None
    """
    section_title = "Cover"
    st.title(section_title)
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return
    try:
        # Copy only the first page into a fresh single-page document.
        pdf_reader = PdfReader(uploaded_file)
        pdf_writer = PdfWriter()
        pdf_writer.add_page(pdf_reader.pages[0])
        # pdf_viewer renders from a file path, so write the page out first.
        # (Was os.path.join(single_arg) — a no-op; a plain f-string is clearer.)
        # NOTE(review): the temp file is never removed — consider tempfile +
        # cleanup if these accumulate.
        temp_first_page_path = f"temp_{section_title.lower()}.pdf"
        with open(temp_first_page_path, "wb") as f:
            pdf_writer.write(f)
        pdf_viewer(temp_first_page_path)
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
def find_underwriter(uploaded_file, section_name, keywords_dict):
    """
    Search the last 2/3 of a PDF for the 'underwriter' section and display
    the first page that matches any keyword, skipping the first third of the
    document for performance.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file-uploader object).
        section_name: The name of the section (e.g., "Underwriter").
        keywords_dict: Dictionary containing keyword sets for different sections.

    Returns:
        None
    """
    st.title(section_name.title())
    keyword_sets = keywords_dict.get(section_name, [])
    if not keyword_sets:
        st.error(f"No keywords defined for section: {section_name}")
        return
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return
    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        start_page = total_pages // 3  # skip the first third of the PDF
        pages = pdf_reader.pages[start_page:]
        # Extract each page's text once up front: extraction is the expensive
        # step, and the original re-extracted every page for every keyword set.
        page_texts = [page.extract_text() for page in pages]
        # Keyword sets are tried in priority order; within a set, pages are
        # scanned front to back and the first hit wins.
        for keyword_set in keyword_sets:
            for page_num, (page, text) in enumerate(
                zip(pages, page_texts), start=start_page + 1
            ):
                if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
                    pdf_writer = PdfWriter()
                    pdf_writer.add_page(page)
                    # BUGFIX: the filename previously used page_num + 1 while
                    # the on-screen message used page_num; enumerate already
                    # yields 1-based numbers, so use page_num for both.
                    temp_page_path = f"temp_{section_name.lower()}_page_{page_num}.pdf"
                    with open(temp_page_path, "wb") as f:
                        pdf_writer.write(f)
                    st.write(f"Keyword found on page {page_num}")
                    pdf_viewer(temp_page_path)
                    return  # Exit after finding the first match
        st.warning(f"No pages contain the specified keywords for {section_name}.")
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
def _save_or_warn(pdf_writer, section_name):
    """Write collected pages to a temp PDF and render them, or warn if none matched."""
    if len(pdf_writer.pages) > 0:
        temp_section_path = f"temp_{section_name}_section.pdf"
        with open(temp_section_path, "wb") as f:
            pdf_writer.write(f)
        pdf_viewer(temp_section_path)
    else:
        st.warning(f"No pages matched the criteria for {section_name}.")


def find_section(uploaded_file, section_name, keywords_dict, stop_keywords, anti_keywords):
    """
    Extract and display a section of a PDF delimited by keyword matches.

    Scans the second half of the PDF for a page on which every keyword of
    some keyword set matches, then collects pages until a stop-keyword set
    matches. A page matching an anti-keyword set at the stop point is
    excluded from the output.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).
        section_name: The name of the section to search for (e.g., "income_statement").
        keywords_dict: A dictionary containing keyword sets for different sections.
        stop_keywords: A dictionary of keywords to indicate where extraction should stop.
        anti_keywords: A dictionary of keywords to exclude specific pages from the results.

    Returns:
        bool: True if processing completed without interruptions; False if
        extraction was stopped by a stop-keyword match, an error occurred,
        or no file was uploaded.
    """
    st.title(section_name.replace("_", " ").title())
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        # Stop processing since no file is uploaded
        return False
    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        # Sections of interest live in the back half, so start at the midpoint.
        start_page = total_pages // 2
        pages = pdf_reader.pages[start_page:]
        section_keywords = keywords_dict.get(section_name, [])
        section_stop_keywords = stop_keywords.get(section_name, [])
        section_anti_keywords = anti_keywords.get(section_name, [])
        pdf_writer = PdfWriter()  # Writer for the extracted pages
        extraction_started = False

        def matches_any(text, keyword_sets):
            # True when every keyword of at least one set occurs in the text.
            return any(
                all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set)
                for keyword_set in keyword_sets
            )

        def finish(text, page_num):
            # Stop-point bookkeeping: drop the just-added page if it matches
            # an anti-keyword set, then save/display whatever was collected.
            if matches_any(text, section_anti_keywords):
                st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
                # NOTE(review): assumes writer.pages supports pop() — verify
                # against the installed PyPDF2 version.
                pdf_writer.pages.pop()  # Remove the last added page
            _save_or_warn(pdf_writer, section_name)

        for page_num, page in enumerate(pages, start=start_page + 1):
            text = page.extract_text()
            if not extraction_started:
                # Look for the section's opening page.
                if matches_any(text, section_keywords):
                    st.write(f"Keywords matched on page {page_num}. Starting extraction.")
                    pdf_writer.add_page(page)
                    # The opening page may itself carry the stop keywords.
                    if matches_any(text, section_stop_keywords):
                        st.warning(f"Stop keywords matched on starting page {page_num}. Stopping extraction.")
                        finish(text, page_num)
                        return False  # signal callers to stop all processing
                    extraction_started = True
            else:
                # Inside the section: keep collecting pages.
                pdf_writer.add_page(page)
                if matches_any(text, section_stop_keywords):
                    st.warning(f"Stopping extraction at page {page_num}. Stop keywords matched.")
                    finish(text, page_num)
                    return False  # signal callers to stop all processing
        # Reached the end of the document without hitting stop keywords.
        _save_or_warn(pdf_writer, section_name)
        # Indicate that processing can continue
        return True
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        # Stop processing due to an error
        return False
|