# ProspectusLens / section_extract.py
import os
import re

from PyPDF2 import PdfReader, PdfWriter
from streamlit_pdf_viewer import pdf_viewer
import streamlit as st


def find_cover(uploaded_file):
"""
Extracts and displays the first page of a PDF.
Parameters:
uploaded_file: The uploaded PDF file.
Returns:
None
"""
section_title = "Cover"
st.title(section_title)
if uploaded_file:
try:
# Read the PDF and extract the first page
pdf_reader = PdfReader(uploaded_file)
first_page = pdf_reader.pages[0]
pdf_writer = PdfWriter()
pdf_writer.add_page(first_page)
# Save the first page to a temporary file
            temp_first_page_path = f"temp_{section_title.lower()}.pdf"
with open(temp_first_page_path, "wb") as f:
pdf_writer.write(f)
# Display the first page using pdf_viewer
pdf_viewer(temp_first_page_path)
except Exception as e:
st.error(f"An error occurred while processing the PDF: {e}")
else:
st.warning("Please upload a PDF on the Home page first.")
def find_underwriter(uploaded_file, section_name, keywords_dict):
"""
    Searches the last 2/3 of a PDF for pages containing the keywords defined for the
    given section (e.g. the underwriter section) and displays the first matching page.
    The first third of the document is skipped to improve performance.
Parameters:
uploaded_file: The uploaded PDF file.
section_name: The name of the section (e.g., "Underwriter").
keywords_dict: Dictionary containing keyword sets for different sections.
Returns:
None
"""
st.title(section_name.title())
keyword_sets = keywords_dict.get(section_name, [])
if not keyword_sets:
st.error(f"No keywords defined for section: {section_name}")
return
if uploaded_file:
try:
pdf_reader = PdfReader(uploaded_file)
total_pages = len(pdf_reader.pages)
start_page = total_pages // 3 # Skip the first 1/3 of the PDF
pages = pdf_reader.pages[start_page:]
# Loop through the keyword sets
for keyword_set in keyword_sets:
for page_num, page in enumerate(pages, start=start_page + 1):
                    text = page.extract_text() or ""  # guard against pages with no extractable text
# Check if any keyword in the set is found on the page
if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
# Display the matched page
pdf_writer = PdfWriter()
pdf_writer.add_page(page)
                        temp_page_path = f"temp_{section_name.lower()}_page_{page_num}.pdf"
with open(temp_page_path, "wb") as f:
pdf_writer.write(f)
st.write(f"Keyword found on page {page_num}")
pdf_viewer(temp_page_path)
return # Exit after finding the first match
st.warning(f"No pages contain the specified keywords for {section_name}.")
except Exception as e:
st.error(f"An error occurred while processing the PDF: {e}")
else:
st.warning("Please upload a PDF on the Home page first.")
def find_section(uploaded_file, section_name, keywords_dict, stop_keywords, anti_keywords):
"""
Extracts and displays sections of a PDF based on keyword matches.
Parameters:
uploaded_file: The uploaded PDF file (Streamlit file uploader object).
section_name: The name of the section to search for (e.g., "income_statement").
keywords_dict: A dictionary containing keyword sets for different sections.
stop_keywords: A dictionary of keywords to indicate where extraction should stop.
anti_keywords: A dictionary of keywords to exclude specific pages from the results.
Returns:
bool: True if processing completed without interruptions; False if stopped or an error occurred.
"""
st.title(section_name.replace("_", " ").title())
if uploaded_file:
try:
pdf_reader = PdfReader(uploaded_file)
total_pages = len(pdf_reader.pages)
# Step 1: Start from the second half of the PDF
start_page = total_pages // 2
pages = pdf_reader.pages[start_page:]
section_keywords = keywords_dict.get(section_name, [])
section_stop_keywords = stop_keywords.get(section_name, [])
section_anti_keywords = anti_keywords.get(section_name, [])
pdf_writer = PdfWriter() # Writer for the extracted pages
extraction_started = False # Flag to check if extraction has started
for page_num, page in enumerate(pages, start=start_page + 1):
                text = page.extract_text() or ""  # guard against pages with no extractable text
# Step 2: Find the keywords within the keywords_dict
if not extraction_started:
for keyword_set in section_keywords:
if all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
st.write(f"Keywords matched on page {page_num}. Starting extraction.")
pdf_writer.add_page(page)
# Check for stop keywords on the same page
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
for stop_set in section_stop_keywords):
st.warning(f"Stop keywords matched on starting page {page_num}. Stopping extraction.")
# Check for anti-keywords before stopping
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
for anti_set in section_anti_keywords):
st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
pdf_writer.pages.pop() # Remove the last added page
# Save and display the extracted pages (if any)
if len(pdf_writer.pages) > 0:
                                    temp_section_path = f"temp_{section_name}_section.pdf"
with open(temp_section_path, "wb") as f:
pdf_writer.write(f)
pdf_viewer(temp_section_path)
else:
st.warning(f"No pages matched the criteria for {section_name}.")
# Stop extraction immediately and signal to stop all processing
return False
else:
# Continue extraction
extraction_started = True
break
                else:  # extraction_started is True
# Step 3: Add the page to the output
pdf_writer.add_page(page)
# Step 4: Check for stop keywords
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
for stop_set in section_stop_keywords):
st.warning(f"Stopping extraction at page {page_num}. Stop keywords matched.")
# Step 5: After stopping, check for anti-keywords
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
for anti_set in section_anti_keywords):
st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
pdf_writer.pages.pop() # Remove the last added page
# Save and display the extracted pages (if any)
if len(pdf_writer.pages) > 0:
                            temp_section_path = f"temp_{section_name}_section.pdf"
with open(temp_section_path, "wb") as f:
pdf_writer.write(f)
pdf_viewer(temp_section_path)
else:
st.warning(f"No pages matched the criteria for {section_name}.")
# Stop extraction and signal to stop all processing
return False
# If extraction finished without hitting stop keywords, save and display the pages
if len(pdf_writer.pages) > 0:
                temp_section_path = f"temp_{section_name}_section.pdf"
with open(temp_section_path, "wb") as f:
pdf_writer.write(f)
pdf_viewer(temp_section_path)
else:
st.warning(f"No pages matched the criteria for {section_name}.")
# Indicate that processing can continue
return True
except Exception as e:
st.error(f"An error occurred while processing the PDF: {e}")
# Stop processing due to an error
return False
else:
st.warning("Please upload a PDF on the Home page first.")
# Stop processing since no file is uploaded
return False
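

# Illustrative sketch of how find_section interprets its three dictionaries (the values
# below are made-up examples, not the app's real configuration): extraction starts on the
# first page where ALL patterns in some keyword set match, keeps adding pages, and stops
# when ALL patterns in some stop set match; if that stop page also matches an anti set,
# it is removed from the output before the section is saved and displayed.
#
#     example_keywords = {"income_statement": [[r"income statement", r"revenue"]]}
#     example_stop = {"income_statement": [[r"statement of cash flows"]]}
#     example_anti = {"income_statement": [[r"table of contents"]]}
#     ok = find_section(uploaded_file, "income_statement",
#                       example_keywords, example_stop, example_anti)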