File size: 9,706 Bytes
d58052d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import os
import re
from PyPDF2 import PdfReader, PdfWriter
from streamlit_pdf_viewer import pdf_viewer
import streamlit as st

def find_cover(uploaded_file):
    """
    Render the first page of the uploaded PDF under a "Cover" heading.

    The first page is copied into a single-page PDF, written to
    ``temp_cover.pdf`` in the current working directory, and shown with
    ``pdf_viewer``. Any exception raised while reading, writing, or
    displaying the PDF is reported via ``st.error`` instead of propagating.

    Parameters:
        uploaded_file: The uploaded PDF file, passed straight to
            ``PdfReader``. When falsy, a warning asking the user to upload
            a file is shown and nothing else happens.

    Returns:
        None
    """
    heading = "Cover"
    st.title(heading)

    # Guard clause: nothing to render without an upload.
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return

    try:
        # Copy only the first page into a fresh single-page document.
        cover_page = PdfReader(uploaded_file).pages[0]
        single_page_pdf = PdfWriter()
        single_page_pdf.add_page(cover_page)

        # The viewer is given a path, so the page is persisted to disk first.
        output_path = f"temp_{heading.lower()}.pdf"
        with open(output_path, "wb") as out_file:
            single_page_pdf.write(out_file)

        pdf_viewer(output_path)
    except Exception as exc:
        st.error(f"An error occurred while processing the PDF: {exc}")


def find_underwriter(uploaded_file, section_name, keywords_dict):
    """
    Display the first page of a PDF that matches a keyword for the given section.

    The first third of the document is skipped. Keyword sets are tried in
    the order they appear in ``keywords_dict[section_name]``; for each set,
    the remaining pages are scanned front to back, and the first page on
    which ANY keyword of the set matches (case-insensitive ``re.search``)
    is written to a temporary single-page PDF and displayed. The search
    ends at the first match. If nothing matches, a warning is shown.

    Parameters:
        uploaded_file: The uploaded PDF file, passed straight to
            ``PdfReader``. When falsy, a warning is shown instead.
        section_name: The name of the section (e.g., "Underwriter"). Used as
            the key into ``keywords_dict``, as the page title, and
            (lower-cased) in the temporary file name.
        keywords_dict: Dictionary mapping section names to a sequence of
            keyword sets. Each keyword is used as a regular-expression
            pattern.

    Returns:
        None
    """
    st.title(section_name.title())

    keyword_sets = keywords_dict.get(section_name, [])
    if not keyword_sets:
        st.error(f"No keywords defined for section: {section_name}")
        return

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        start_page = total_pages // 3  # Skip the first 1/3 of the PDF
        pages = pdf_reader.pages[start_page:]

        # Text extraction is the expensive step and every keyword set walks
        # the same pages, so each page's text is extracted at most once.
        page_texts = {}

        # Earlier keyword sets take priority: a set is checked against all
        # pages before the next set is considered.
        for keyword_set in keyword_sets:
            # page_num is the 1-based page number within the full document.
            for page_num, page in enumerate(pages, start=start_page + 1):
                if page_num not in page_texts:
                    # Normalise a missing extraction result to an empty
                    # string so re.search never receives None.
                    page_texts[page_num] = page.extract_text() or ""
                text = page_texts[page_num]

                if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
                    pdf_writer = PdfWriter()
                    pdf_writer.add_page(page)

                    # File name carries the same 1-based page number that is
                    # reported to the user below.
                    temp_page_path = f"temp_{section_name.lower()}_page_{page_num}.pdf"
                    with open(temp_page_path, "wb") as f:
                        pdf_writer.write(f)

                    st.write(f"Keyword found on page {page_num}")
                    pdf_viewer(temp_page_path)
                    return  # Exit after finding the first match

        st.warning(f"No pages contain the specified keywords for {section_name}.")
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")


def _any_set_fully_matches(text, keyword_sets):
    """Return True if every keyword of at least one set is found in *text* (case-insensitive regex search)."""
    return any(
        all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set)
        for keyword_set in keyword_sets
    )


def _show_extracted_pages(pages, section_name):
    """Write *pages* to the section's temporary PDF and display it, or warn when *pages* is empty."""
    if not pages:
        st.warning(f"No pages matched the criteria for {section_name}.")
        return

    pdf_writer = PdfWriter()
    for page in pages:
        pdf_writer.add_page(page)

    temp_section_path = f"temp_{section_name}_section.pdf"
    with open(temp_section_path, "wb") as f:
        pdf_writer.write(f)
    pdf_viewer(temp_section_path)


def find_section(uploaded_file, section_name, keywords_dict, stop_keywords, anti_keywords):
    """
    Extracts and displays a run of PDF pages delimited by keyword matches.

    Only the second half of the document is scanned. Extraction starts on
    the first page where ALL keywords of at least one start set match, and
    every following page is collected. After a page is collected it is
    checked against the stop sets; when all keywords of one stop set match,
    extraction ends on that page. The stopping page is dropped from the
    result if it also fully matches one of the anti-keyword sets. The
    collected pages are written to ``temp_<section_name>_section.pdf`` and
    displayed; if none were collected, a warning is shown instead.

    Parameters:
        uploaded_file: The uploaded PDF file, passed straight to
            ``PdfReader``.
        section_name: The name of the section to search for
            (e.g., "income_statement"). Used as the lookup key in all three
            dictionaries, in the page title, and in the temporary file name.
        keywords_dict: A dictionary mapping section names to keyword sets
            that mark where extraction starts.
        stop_keywords: A dictionary mapping section names to keyword sets
            that mark where extraction stops.
        anti_keywords: A dictionary mapping section names to keyword sets
            that exclude the stopping page from the results.

    Returns:
        bool: True if the scan reached the end of the PDF without hitting
        stop keywords; False if extraction was stopped by stop keywords, no
        file was uploaded, or an error occurred.
    """
    st.title(section_name.replace("_", " ").title())

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        # Stop processing since no file is uploaded
        return False

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)

        # Start from the second half of the PDF
        start_page = total_pages // 2
        pages = pdf_reader.pages[start_page:]

        section_keywords = keywords_dict.get(section_name, [])
        section_stop_keywords = stop_keywords.get(section_name, [])
        section_anti_keywords = anti_keywords.get(section_name, [])

        # Pages are collected in a plain list and the PdfWriter is only
        # built when rendering: PyPDF2's PdfWriter.pages view has no
        # list-style pop(), so a page cannot be removed from a writer once
        # it has been added.
        extracted_pages = []
        extraction_started = False

        # page_num is the 1-based page number within the full document.
        for page_num, page in enumerate(pages, start=start_page + 1):
            # Normalise a missing extraction result so re.search never
            # receives None.
            text = page.extract_text() or ""

            if not extraction_started:
                if not _any_set_fully_matches(text, section_keywords):
                    continue
                st.write(f"Keywords matched on page {page_num}. Starting extraction.")
                extracted_pages.append(page)
                extraction_started = True
                stop_message = (
                    f"Stop keywords matched on starting page {page_num}. Stopping extraction."
                )
            else:
                extracted_pages.append(page)
                stop_message = (
                    f"Stopping extraction at page {page_num}. Stop keywords matched."
                )

            # The page just collected may itself end the section.
            if not _any_set_fully_matches(text, section_stop_keywords):
                continue

            st.warning(stop_message)

            # Anti-keywords are only consulted on the stopping page.
            if _any_set_fully_matches(text, section_anti_keywords):
                st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
                extracted_pages.pop()  # Remove the last added page

            _show_extracted_pages(extracted_pages, section_name)
            # Stop extraction and signal to stop all processing
            return False

        # The scan finished without hitting stop keywords.
        _show_extracted_pages(extracted_pages, section_name)
        # Indicate that processing can continue
        return True

    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        # Stop processing due to an error
        return False