Spaces:
Running
Running
File size: 9,706 Bytes
d58052d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
import os
import re
from PyPDF2 import PdfReader, PdfWriter
from streamlit_pdf_viewer import pdf_viewer
import streamlit as st
def find_cover(uploaded_file):
    """
    Extract the first page of an uploaded PDF and display it as the cover.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file-uploader object),
            or a falsy value when nothing has been uploaded yet.

    Returns:
        None
    """
    section_title = "Cover"
    st.title(section_title)
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return
    try:
        # Copy only the first page into a fresh single-page document.
        pdf_reader = PdfReader(uploaded_file)
        pdf_writer = PdfWriter()
        pdf_writer.add_page(pdf_reader.pages[0])
        # pdf_viewer renders from a file path, so write the page out first.
        # (Was os.path.join(single_arg) — a no-op; a plain f-string is clearer.)
        # NOTE(review): the temp file is never removed — consider tempfile +
        # cleanup if these accumulate.
        temp_first_page_path = f"temp_{section_title.lower()}.pdf"
        with open(temp_first_page_path, "wb") as f:
            pdf_writer.write(f)
        pdf_viewer(temp_first_page_path)
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
def find_underwriter(uploaded_file, section_name, keywords_dict):
    """
    Search the last 2/3 of a PDF for the 'underwriter' section and display
    the first page that matches any keyword, skipping the first third of the
    document for performance.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file-uploader object).
        section_name: The name of the section (e.g., "Underwriter").
        keywords_dict: Dictionary containing keyword sets for different sections.

    Returns:
        None
    """
    st.title(section_name.title())
    keyword_sets = keywords_dict.get(section_name, [])
    if not keyword_sets:
        st.error(f"No keywords defined for section: {section_name}")
        return
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return
    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        start_page = total_pages // 3  # skip the first third of the PDF
        pages = pdf_reader.pages[start_page:]
        # Extract each page's text once up front: extraction is the expensive
        # step, and the original re-extracted every page for every keyword set.
        page_texts = [page.extract_text() for page in pages]
        # Keyword sets are tried in priority order; within a set, pages are
        # scanned front to back and the first hit wins.
        for keyword_set in keyword_sets:
            for page_num, (page, text) in enumerate(
                zip(pages, page_texts), start=start_page + 1
            ):
                if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
                    pdf_writer = PdfWriter()
                    pdf_writer.add_page(page)
                    # BUGFIX: the filename previously used page_num + 1 while
                    # the on-screen message used page_num; enumerate already
                    # yields 1-based numbers, so use page_num for both.
                    temp_page_path = f"temp_{section_name.lower()}_page_{page_num}.pdf"
                    with open(temp_page_path, "wb") as f:
                        pdf_writer.write(f)
                    st.write(f"Keyword found on page {page_num}")
                    pdf_viewer(temp_page_path)
                    return  # Exit after finding the first match
        st.warning(f"No pages contain the specified keywords for {section_name}.")
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
def _save_or_warn(pdf_writer, section_name):
    """Write collected pages to a temp PDF and render them, or warn if none matched."""
    if len(pdf_writer.pages) > 0:
        temp_section_path = f"temp_{section_name}_section.pdf"
        with open(temp_section_path, "wb") as f:
            pdf_writer.write(f)
        pdf_viewer(temp_section_path)
    else:
        st.warning(f"No pages matched the criteria for {section_name}.")


def find_section(uploaded_file, section_name, keywords_dict, stop_keywords, anti_keywords):
    """
    Extract and display a section of a PDF delimited by keyword matches.

    Scans the second half of the PDF for a page on which every keyword of
    some keyword set matches, then collects pages until a stop-keyword set
    matches. A page matching an anti-keyword set at the stop point is
    excluded from the output.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).
        section_name: The name of the section to search for (e.g., "income_statement").
        keywords_dict: A dictionary containing keyword sets for different sections.
        stop_keywords: A dictionary of keywords to indicate where extraction should stop.
        anti_keywords: A dictionary of keywords to exclude specific pages from the results.

    Returns:
        bool: True if processing completed without interruptions; False if
        extraction was stopped by a stop-keyword match, an error occurred,
        or no file was uploaded.
    """
    st.title(section_name.replace("_", " ").title())
    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        # Stop processing since no file is uploaded
        return False
    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        # Sections of interest live in the back half, so start at the midpoint.
        start_page = total_pages // 2
        pages = pdf_reader.pages[start_page:]
        section_keywords = keywords_dict.get(section_name, [])
        section_stop_keywords = stop_keywords.get(section_name, [])
        section_anti_keywords = anti_keywords.get(section_name, [])
        pdf_writer = PdfWriter()  # Writer for the extracted pages
        extraction_started = False

        def matches_any(text, keyword_sets):
            # True when every keyword of at least one set occurs in the text.
            return any(
                all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set)
                for keyword_set in keyword_sets
            )

        def finish(text, page_num):
            # Stop-point bookkeeping: drop the just-added page if it matches
            # an anti-keyword set, then save/display whatever was collected.
            if matches_any(text, section_anti_keywords):
                st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
                # NOTE(review): assumes writer.pages supports pop() — verify
                # against the installed PyPDF2 version.
                pdf_writer.pages.pop()  # Remove the last added page
            _save_or_warn(pdf_writer, section_name)

        for page_num, page in enumerate(pages, start=start_page + 1):
            text = page.extract_text()
            if not extraction_started:
                # Look for the section's opening page.
                if matches_any(text, section_keywords):
                    st.write(f"Keywords matched on page {page_num}. Starting extraction.")
                    pdf_writer.add_page(page)
                    # The opening page may itself carry the stop keywords.
                    if matches_any(text, section_stop_keywords):
                        st.warning(f"Stop keywords matched on starting page {page_num}. Stopping extraction.")
                        finish(text, page_num)
                        return False  # signal callers to stop all processing
                    extraction_started = True
            else:
                # Inside the section: keep collecting pages.
                pdf_writer.add_page(page)
                if matches_any(text, section_stop_keywords):
                    st.warning(f"Stopping extraction at page {page_num}. Stop keywords matched.")
                    finish(text, page_num)
                    return False  # signal callers to stop all processing
        # Reached the end of the document without hitting stop keywords.
        _save_or_warn(pdf_writer, section_name)
        # Indicate that processing can continue
        return True
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {e}")
        # Stop processing due to an error
        return False
|