import os
import re

import streamlit as st
from PyPDF2 import PdfReader, PdfWriter
from streamlit_pdf_viewer import pdf_viewer


def _matches_any(text, keywords):
    """Return True if at least one regex in *keywords* matches *text* (case-insensitive)."""
    return any(re.search(keyword, text, re.IGNORECASE) for keyword in keywords)


def _matches_all(text, keywords):
    """Return True if every regex in *keywords* matches *text* (case-insensitive)."""
    return all(re.search(keyword, text, re.IGNORECASE) for keyword in keywords)


def _matches_any_set(text, keyword_sets):
    """Return True if *text* fully matches at least one keyword set in *keyword_sets*."""
    return any(_matches_all(text, keyword_set) for keyword_set in keyword_sets)


def _write_and_show(pages, temp_path):
    """Write *pages* to a new PDF at *temp_path* and render that file with pdf_viewer."""
    pdf_writer = PdfWriter()
    for page in pages:
        pdf_writer.add_page(page)
    with open(temp_path, "wb") as f:
        pdf_writer.write(f)
    pdf_viewer(temp_path)


def _show_section(pages, section_name):
    """Display the extracted section pages, or warn when nothing was extracted."""
    if pages:
        _write_and_show(pages, f"temp_{section_name}_section.pdf")
    else:
        st.warning(f"No pages matched the criteria for {section_name}.")


def find_cover(uploaded_file):
    """
    Extracts and displays the first page of a PDF.

    The page is written to ``temp_cover.pdf`` in the current working
    directory and rendered with ``pdf_viewer``. Any processing error is
    reported through ``st.error`` instead of being raised.

    Parameters:
        uploaded_file: The uploaded PDF file.

    Returns:
        None
    """
    section_title = "Cover"
    st.title(section_title)

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return

    try:
        pdf_reader = PdfReader(uploaded_file)
        first_page = pdf_reader.pages[0]
        _write_and_show([first_page], f"temp_{section_title.lower()}.pdf")
    except Exception as e:
        # UI boundary: surface the failure to the user rather than crash the app.
        st.error(f"An error occurred while processing the PDF: {e}")


def find_underwriter(uploaded_file, section_name, keywords_dict):
    """
    Searches for the first page containing any keyword of a keyword set and
    displays it, scanning only the last 2/3 of the PDF to improve performance.

    Keyword sets are tried in order; for each set the pages are scanned from
    the start of the search range, and the first page matching ANY keyword in
    that set is displayed. Earlier keyword sets therefore take priority over
    later ones.

    Parameters:
        uploaded_file: The uploaded PDF file.
        section_name: The name of the section (e.g., "Underwriter").
        keywords_dict: Dictionary containing keyword sets for different sections.

    Returns:
        None
    """
    st.title(section_name.title())

    keyword_sets = keywords_dict.get(section_name, [])
    if not keyword_sets:
        st.error(f"No keywords defined for section: {section_name}")
        return

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        return

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
        start_page = total_pages // 3  # Skip the first 1/3 of the PDF
        pages = pdf_reader.pages[start_page:]

        # Text extraction is the expensive step; cache it lazily so each page
        # is extracted at most once even when several keyword sets are tried.
        page_texts = {}

        for keyword_set in keyword_sets:
            for page_num, page in enumerate(pages, start=start_page + 1):
                if page_num not in page_texts:
                    # extract_text() may yield None/empty for image-only pages.
                    page_texts[page_num] = page.extract_text() or ""

                if _matches_any(page_texts[page_num], keyword_set):
                    st.write(f"Keyword found on page {page_num}")
                    # page_num is already 1-based, so the file name matches
                    # the page number reported to the user.
                    _write_and_show(
                        [page],
                        f"temp_{section_name.lower()}_page_{page_num}.pdf",
                    )
                    return  # Exit after finding the first match

        st.warning(f"No pages contain the specified keywords for {section_name}.")
    except Exception as e:
        # UI boundary: surface the failure to the user rather than crash the app.
        st.error(f"An error occurred while processing the PDF: {e}")


def find_section(uploaded_file, section_name, keywords_dict, stop_keywords, anti_keywords):
    """
    Extracts and displays sections of a PDF based on keyword matches.

    Only the second half of the PDF is scanned. Extraction starts on the first
    page that matches every keyword of at least one keyword set for
    ``section_name`` and continues page by page until a page matches a stop
    keyword set. If that stopping page also matches an anti-keyword set, it is
    dropped from the result. The collected pages are written to
    ``temp_<section_name>_section.pdf`` and rendered with ``pdf_viewer``.

    Parameters:
        uploaded_file: The uploaded PDF file (Streamlit file uploader object).
        section_name: The name of the section to search for (e.g., "income_statement").
        keywords_dict: A dictionary containing keyword sets for different sections.
        stop_keywords: A dictionary of keywords to indicate where extraction should stop.
        anti_keywords: A dictionary of keywords to exclude specific pages from the results.

    Returns:
        bool: True if every page was scanned without hitting stop keywords;
        False if stop keywords ended the extraction, no file was uploaded,
        or an error occurred.
    """
    st.title(section_name.replace("_", " ").title())

    if not uploaded_file:
        st.warning("Please upload a PDF on the Home page first.")
        # Stop processing since no file is uploaded
        return False

    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)

        # Step 1: Start from the second half of the PDF
        start_page = total_pages // 2
        pages = pdf_reader.pages[start_page:]

        section_keywords = keywords_dict.get(section_name, [])
        section_stop_keywords = stop_keywords.get(section_name, [])
        section_anti_keywords = anti_keywords.get(section_name, [])

        # Pages are collected in a plain list and only turned into a PDF when
        # saving: PdfWriter.pages offers no way to remove a page, so the
        # anti-keyword exclusion below needs a mutable container.
        extracted_pages = []
        extraction_started = False

        for page_num, page in enumerate(pages, start=start_page + 1):
            # extract_text() may yield None/empty for image-only pages.
            text = page.extract_text() or ""

            if not extraction_started:
                # Step 2: Look for a keyword set that fully matches this page
                if not _matches_any_set(text, section_keywords):
                    continue
                st.write(f"Keywords matched on page {page_num}. Starting extraction.")
                extraction_started = True
                stop_message = (
                    f"Stop keywords matched on starting page {page_num}. "
                    "Stopping extraction."
                )
            else:
                stop_message = (
                    f"Stopping extraction at page {page_num}. Stop keywords matched."
                )

            # Step 3: Add the page to the output
            extracted_pages.append(page)

            # Step 4: Check for stop keywords (also on the starting page itself)
            if _matches_any_set(text, section_stop_keywords):
                st.warning(stop_message)

                # Step 5: After stopping, check for anti-keywords
                if _matches_any_set(text, section_anti_keywords):
                    st.write(
                        f"Page {page_num} contains anti-keywords. Excluding from results."
                    )
                    extracted_pages.pop()  # Remove the page that was just added

                _show_section(extracted_pages, section_name)
                # Stop extraction and signal to stop all processing
                return False

        # Extraction finished without hitting stop keywords
        _show_section(extracted_pages, section_name)
        # Indicate that processing can continue
        return True
    except Exception as e:
        # UI boundary: surface the failure to the user rather than crash the app.
        st.error(f"An error occurred while processing the PDF: {e}")
        # Stop processing due to an error
        return False