# ProspectusLens / section_extract.py
import os
import re

from PyPDF2 import PdfReader, PdfWriter
from streamlit_pdf_viewer import pdf_viewer
import streamlit as st


def find_cover(uploaded_file):
"""
Extracts and displays the first page of a PDF.
Parameters:
uploaded_file: The uploaded PDF file.
Returns:
None
"""
section_title = "Cover"
st.title(section_title)
if uploaded_file:
try:
# Read the PDF and extract the first page
pdf_reader = PdfReader(uploaded_file)
first_page = pdf_reader.pages[0]
pdf_writer = PdfWriter()
pdf_writer.add_page(first_page)
# Save the first page to a temporary file
            temp_first_page_path = f"temp_{section_title.lower()}.pdf"
with open(temp_first_page_path, "wb") as f:
pdf_writer.write(f)
# Display the first page using pdf_viewer
pdf_viewer(temp_first_page_path)
except Exception as e:
st.error(f"An error occurred while processing the PDF: {e}")
else:
st.warning("Please upload a PDF on the Home page first.")
def find_underwriter(uploaded_file, section_name, keywords_dict):
"""
    Searches the last 2/3 of a PDF for pages containing the keywords defined for the
    given section (e.g. the underwriter section) and displays the first matching page.
    The first third of the document is skipped to improve performance.
Parameters:
uploaded_file: The uploaded PDF file.
section_name: The name of the section (e.g., "Underwriter").
keywords_dict: Dictionary containing keyword sets for different sections.
Returns:
None
"""
st.title(section_name.title())
keyword_sets = keywords_dict.get(section_name, [])
if not keyword_sets:
st.error(f"No keywords defined for section: {section_name}")
return
if uploaded_file:
try:
pdf_reader = PdfReader(uploaded_file)
total_pages = len(pdf_reader.pages)
start_page = total_pages // 3 # Skip the first 1/3 of the PDF
pages = pdf_reader.pages[start_page:]
# Loop through the keyword sets
for keyword_set in keyword_sets:
for page_num, page in enumerate(pages, start=start_page + 1):
                    text = page.extract_text() or ""  # guard against pages with no extractable text
# Check if any keyword in the set is found on the page
if any(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
# Display the matched page
pdf_writer = PdfWriter()
pdf_writer.add_page(page)
                        temp_page_path = f"temp_{section_name.lower()}_page_{page_num}.pdf"
with open(temp_page_path, "wb") as f:
pdf_writer.write(f)
st.write(f"Keyword found on page {page_num}")
pdf_viewer(temp_page_path)
return # Exit after finding the first match
st.warning(f"No pages contain the specified keywords for {section_name}.")
except Exception as e:
st.error(f"An error occurred while processing the PDF: {e}")
else:
st.warning("Please upload a PDF on the Home page first.")
def find_section(uploaded_file, section_name, keywords_dict, stop_keywords, anti_keywords):
"""
Extracts and displays sections of a PDF based on keyword matches.
Parameters:
uploaded_file: The uploaded PDF file (Streamlit file uploader object).
section_name: The name of the section to search for (e.g., "income_statement").
keywords_dict: A dictionary containing keyword sets for different sections.
stop_keywords: A dictionary of keywords to indicate where extraction should stop.
anti_keywords: A dictionary of keywords to exclude specific pages from the results.
Returns:
bool: True if processing completed without interruptions; False if stopped or an error occurred.
"""
st.title(section_name.replace("_", " ").title())
if uploaded_file:
try:
pdf_reader = PdfReader(uploaded_file)
total_pages = len(pdf_reader.pages)
# Step 1: Start from the second half of the PDF
start_page = total_pages // 2
pages = pdf_reader.pages[start_page:]
section_keywords = keywords_dict.get(section_name, [])
section_stop_keywords = stop_keywords.get(section_name, [])
section_anti_keywords = anti_keywords.get(section_name, [])
pdf_writer = PdfWriter() # Writer for the extracted pages
extraction_started = False # Flag to check if extraction has started
for page_num, page in enumerate(pages, start=start_page + 1):
                text = page.extract_text() or ""  # guard against pages with no extractable text
# Step 2: Find the keywords within the keywords_dict
if not extraction_started:
for keyword_set in section_keywords:
if all(re.search(keyword, text, re.IGNORECASE) for keyword in keyword_set):
st.write(f"Keywords matched on page {page_num}. Starting extraction.")
pdf_writer.add_page(page)
# Check for stop keywords on the same page
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
for stop_set in section_stop_keywords):
st.warning(f"Stop keywords matched on starting page {page_num}. Stopping extraction.")
# Check for anti-keywords before stopping
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
for anti_set in section_anti_keywords):
st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
pdf_writer.pages.pop() # Remove the last added page
# Save and display the extracted pages (if any)
if len(pdf_writer.pages) > 0:
                                    temp_section_path = f"temp_{section_name}_section.pdf"
with open(temp_section_path, "wb") as f:
pdf_writer.write(f)
pdf_viewer(temp_section_path)
else:
st.warning(f"No pages matched the criteria for {section_name}.")
# Stop extraction immediately and signal to stop all processing
return False
else:
# Continue extraction
extraction_started = True
break
                else:  # extraction_started is True
# Step 3: Add the page to the output
pdf_writer.add_page(page)
# Step 4: Check for stop keywords
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in stop_set)
for stop_set in section_stop_keywords):
st.warning(f"Stopping extraction at page {page_num}. Stop keywords matched.")
# Step 5: After stopping, check for anti-keywords
if any(all(re.search(keyword, text, re.IGNORECASE) for keyword in anti_set)
for anti_set in section_anti_keywords):
st.write(f"Page {page_num} contains anti-keywords. Excluding from results.")
pdf_writer.pages.pop() # Remove the last added page
# Save and display the extracted pages (if any)
if len(pdf_writer.pages) > 0:
                            temp_section_path = f"temp_{section_name}_section.pdf"
with open(temp_section_path, "wb") as f:
pdf_writer.write(f)
pdf_viewer(temp_section_path)
else:
st.warning(f"No pages matched the criteria for {section_name}.")
# Stop extraction and signal to stop all processing
return False
# If extraction finished without hitting stop keywords, save and display the pages
if len(pdf_writer.pages) > 0:
                temp_section_path = f"temp_{section_name}_section.pdf"
with open(temp_section_path, "wb") as f:
pdf_writer.write(f)
pdf_viewer(temp_section_path)
else:
st.warning(f"No pages matched the criteria for {section_name}.")
# Indicate that processing can continue
return True
except Exception as e:
st.error(f"An error occurred while processing the PDF: {e}")
# Stop processing due to an error
return False
else:
st.warning("Please upload a PDF on the Home page first.")
# Stop processing since no file is uploaded
return False
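

# Illustrative sketch of how find_section interprets its three dictionaries (the values
# below are made-up examples, not the app's real configuration): extraction starts on the
# first page where ALL patterns in some keyword set match, keeps adding pages, and stops
# when ALL patterns in some stop set match; if that stop page also matches an anti set,
# it is removed from the output before the section is saved and displayed.
#
#     example_keywords = {"income_statement": [[r"income statement", r"revenue"]]}
#     example_stop = {"income_statement": [[r"statement of cash flows"]]}
#     example_anti = {"income_statement": [[r"table of contents"]]}
#     ok = find_section(uploaded_file, "income_statement",
#                       example_keywords, example_stop, example_anti)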