|
import pdfplumber |
|
from typing import Optional, Callable, Literal |
|
import base64 |
|
import io |
|
from PIL import Image |
|
from remittance_pdf_processing_utils import remittance_logger, format_amount_str_to_decimal |
|
from vertex_api_invoice_extractor import extract_invoice_numbers_with_vertex_ai, extract_invoice_numbers_from_text_with_vertex_ai, extract_payment_amounts_with_vertex_ai, extract_payment_amounts_from_text_with_vertex_ai |
|
|
|
from remittance_pdf_processing_types import InvoiceNumbers, InvoiceVerifier, DocumentType, ExtractorFunction, PaymentAmount, Candidate, ProcessedPDFResult, InvoiceListAndAmountVerifier |
|
from anthropic_api_invoice_extractor import extract_invoice_numbers_with_anthropic_ai, extract_payment_amounts_with_anthropic_ai |
|
|
|
def is_text_based_pdf(pdf: pdfplumber.PDF, text_threshold: int = 100) -> bool:
    """Report whether any page of *pdf* carries a usable text layer.

    A page counts as text-based when its extracted text is longer than
    ``text_threshold`` characters. The scan stops at the first such page.

    Args:
        pdf: An open pdfplumber document.
        text_threshold: Number of extracted characters a page must exceed
            to count as text. Defaults to 100, the previously hard-coded
            value.

    Returns:
        True if at least one page exceeds the threshold, otherwise False
        (including for a document without pages).
    """
    for page in pdf.pages:
        # extract_text() may hand back None for a page without characters
        # (depends on the pdfplumber version -- TODO confirm for the pinned
        # release). Treat that as empty text instead of letting len() raise.
        page_text = page.extract_text() or ""
        if len(page_text) > text_threshold:
            return True
    return False
|
|
|
def determine_document_type(pdf: pdfplumber.PDF) -> DocumentType:
    """Classify *pdf* by its page count.

    Returns:
        ``'single'`` when the document has exactly one page, ``'multi'``
        for every other page count (zero pages included).
    """
    page_count = len(pdf.pages)
    if page_count == 1:
        return 'single'
    return 'multi'
|
|
|
def extract_text_from_pdf(pdf_path: str, wrap_pages: bool = False) -> str:
    """Extract the text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to open.
        wrap_pages: When False (default) the page texts are joined with
            newlines. When True each page is wrapped in ``<page_N>`` /
            ``</page_N>`` tags (N starts at 1) and the whole document in
            ``<remittance>`` tags.

    Returns:
        The combined text of all pages in the requested layout.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without characters may give None rather than "" (depends on
        # the pdfplumber version -- TODO confirm). Normalise here so that
        # str.join cannot raise and the wrapped output never contains the
        # literal text "None".
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    if not wrap_pages:
        return "\n".join(page_texts)

    wrapped_pages = [
        f"<page_{number}>\n{page_text}\n</page_{number}>"
        for number, page_text in enumerate(page_texts, start=1)
    ]
    all_pages_text = "\n".join(wrapped_pages)
    return f"<remittance>\n{all_pages_text}\n</remittance>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_invoice_numbers_from_text(
    text: str,
    doc_type: DocumentType,
    multi_hop: bool = False,
) -> list[InvoiceNumbers]:
    """Extract candidate invoice-number lists from document text.

    Logs the request at INFO level and delegates to
    ``extract_invoice_numbers_from_text_with_vertex_ai``.

    Args:
        text: Document text to analyse.
        doc_type: Document type label; only used in the log message.
        multi_hop: Forwarded unchanged to the extractor.

    Returns:
        The extractor's result.
    """
    log_line = f"Extracting invoice numbers from {doc_type}-page text-based document (multi_hop: {multi_hop})"
    remittance_logger.info(log_line)

    candidates = extract_invoice_numbers_from_text_with_vertex_ai(text, multi_hop)
    return candidates
|
|
|
def extract_invoice_numbers_from_single_base64_image(base64_image: str, multi_hop: bool = False) -> list[InvoiceNumbers]:
    """Extract candidate invoice-number lists from one base64-encoded image.

    Logs at DEBUG level and delegates to
    ``extract_invoice_numbers_with_vertex_ai``.

    Args:
        base64_image: The base64-encoded page image.
        multi_hop: Forwarded unchanged to the extractor.

    Returns:
        The extractor's result.
    """
    debug_line = f"Extracting invoice numbers from a single base64 image using Vertex AI (multi_hop: {multi_hop})"
    remittance_logger.debug(debug_line)

    candidates = extract_invoice_numbers_with_vertex_ai(base64_image, multi_hop)
    return candidates
|
|
|
def extract_invoice_numbers_from_multi_page_images(base64_images: list[str], multi_hop: bool = False) -> list[InvoiceNumbers]:
    """Extract candidate invoice-number lists from several base64 images.

    Logs at DEBUG level and delegates to
    ``extract_invoice_numbers_with_anthropic_ai``, passing the whole list.

    Args:
        base64_images: Base64-encoded page images.
        multi_hop: Forwarded unchanged to the extractor.

    Returns:
        The extractor's result.
    """
    debug_line = f"Extracting invoice numbers from {len(base64_images)} base64 images using Anthropic AI (multi_hop: {multi_hop})"
    remittance_logger.debug(debug_line)

    candidates = extract_invoice_numbers_with_anthropic_ai(base64_images, multi_hop)
    return candidates
|
|
|
def extract_invoice_numbers_from_base64_images(base64_images: list[str], multi_hop: bool = False) -> list[InvoiceNumbers]:
    """Route base64 page images to the single-image or multi-image extractor.

    Args:
        base64_images: Base64-encoded page images.
        multi_hop: Forwarded unchanged to the chosen extractor.

    Returns:
        The result of the single-image extractor when exactly one image is
        given, otherwise the result of the multi-page extractor.
    """
    remittance_logger.info(f"Extracting invoice numbers from {len(base64_images)} base64 image(s) (multi_hop: {multi_hop})")

    # Any count other than one (an empty list included) takes the
    # multi-page route.
    if len(base64_images) != 1:
        return extract_invoice_numbers_from_multi_page_images(base64_images, multi_hop)

    only_image = base64_images[0]
    return extract_invoice_numbers_from_single_base64_image(only_image, multi_hop)
|
|
|
def extract_invoice_numbers_from_image(
    pdf: pdfplumber.PDF,
    multi_hop: bool = False,
    dpi: int = 257,
) -> list[InvoiceNumbers]:
    """Render every page of *pdf* to PNG and extract invoice numbers from the images.

    Args:
        pdf: An open pdfplumber document.
        multi_hop: Forwarded unchanged to the image extractor.
        dpi: Resolution passed to ``page.to_image`` for each page.

    Returns:
        The result of ``extract_invoice_numbers_from_base64_images`` for the
        rendered pages, in page order.
    """
    remittance_logger.info(f"Extracting invoice numbers from {len(pdf.pages)}-page image-based document (multi_hop: {multi_hop})")

    def render_page_as_base64(page) -> str:
        # PNG bytes are written to memory and then base64-encoded as text.
        buffer = io.BytesIO()
        page.to_image(resolution=dpi).save(buffer, format='PNG')
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    encoded_pages = [render_page_as_base64(page) for page in pdf.pages]
    return extract_invoice_numbers_from_base64_images(encoded_pages, multi_hop)
|
|
|
def extract_invoices_from_pdf(pdf_path: str, force_image_processing: bool = False, invoice_verifier: InvoiceVerifier | None = None, force_multi_hop: bool = False) -> tuple[list[InvoiceNumbers], list[InvoiceNumbers]]:
    """Extract invoice-number candidates from a PDF and optionally verify them.

    Extraction runs single-hop first and multi-hop second (multi-hop only
    when ``force_multi_hop`` is set). The function returns after the first
    pass that yields any candidates, whether or not any of them verified.

    Args:
        pdf_path: Path of the PDF file.
        force_image_processing: When True, always use image extraction.
            Otherwise text extraction is used if ``is_text_based_pdf`` says
            the document has a text layer, and image extraction if not.
        invoice_verifier: Optional callable applied to each candidate; its
            truthy results form the verified list. Without it the verified
            list is always empty.
        force_multi_hop: When True, skip the single-hop pass.

    Returns:
        A tuple ``(verified, candidates)``. Both are empty lists when no
        pass produced any candidate.
    """
    with pdfplumber.open(pdf_path) as pdf:
        doc_type = determine_document_type(pdf)

        # Neither the text/image decision nor the extracted text depends on
        # multi_hop, so work them out once instead of on every pass (the
        # text extraction re-opens and re-parses the PDF).
        use_text = not force_image_processing and is_text_based_pdf(pdf)
        text = extract_text_from_pdf(pdf_path, wrap_pages=True) if use_text else None

        hop_modes = [True] if force_multi_hop else [False, True]
        for multi_hop in hop_modes:
            if use_text:
                invoice_numbers_candidates = extract_invoice_numbers_from_text(text, doc_type, multi_hop=multi_hop)
            else:
                invoice_numbers_candidates = extract_invoice_numbers_from_image(pdf, multi_hop=multi_hop)

            if invoice_verifier:
                verified_invoices = [
                    invoice_verifier(invoice_numbers) or []
                    for invoice_numbers in invoice_numbers_candidates
                ]
                # Drop candidates for which the verifier returned nothing.
                verified_result = [invoices for invoices in verified_invoices if invoices]
            else:
                verified_result = []

            remittance_logger.info(f"Extracted invoice numbers (post verification, multi_hop={multi_hop}): {verified_result}")

            if verified_result or invoice_numbers_candidates:
                return verified_result, invoice_numbers_candidates

    remittance_logger.warning("No invoice numbers found after trying both single-hop and multi-hop processing.")
    return [], []
|
|
|
def extract_payment_amounts_from_single_base64_image(base64_image: str) -> list[PaymentAmount]:
    """Extract payment-amount candidates from one base64-encoded image.

    Logs at DEBUG level and delegates to
    ``extract_payment_amounts_with_vertex_ai``.

    Args:
        base64_image: The base64-encoded page image.

    Returns:
        The extractor's result.
    """
    remittance_logger.debug("Extracting payment amounts from a single base64 image using Vertex AI")

    amounts = extract_payment_amounts_with_vertex_ai(base64_image)
    return amounts
|
|
|
def extract_payment_amounts_from_multi_page_images(base64_images: list[str]) -> list[PaymentAmount]:
    """Extract payment-amount candidates from several base64 images.

    Logs at DEBUG level and delegates to
    ``extract_payment_amounts_with_anthropic_ai``, passing the whole list.

    Args:
        base64_images: Base64-encoded page images.

    Returns:
        The extractor's result.
    """
    debug_line = f"Extracting payment amounts from {len(base64_images)} base64 images using Anthropic AI"
    remittance_logger.debug(debug_line)

    amounts = extract_payment_amounts_with_anthropic_ai(base64_images)
    return amounts
|
|
|
def extract_payment_amounts_from_base64_images(base64_images: list[str]) -> list[PaymentAmount]:
    """Route base64 page images to the single-image or multi-image amount extractor.

    Args:
        base64_images: Base64-encoded page images.

    Returns:
        The result of the single-image extractor when exactly one image is
        given, otherwise the result of the multi-page extractor.
    """
    remittance_logger.info(f"Extracting payment amounts from {len(base64_images)} base64 image(s)")

    # Any count other than one (an empty list included) takes the
    # multi-page route.
    if len(base64_images) != 1:
        return extract_payment_amounts_from_multi_page_images(base64_images)

    only_image = base64_images[0]
    return extract_payment_amounts_from_single_base64_image(only_image)
|
|
|
def extract_payment_amounts_from_pdf(pdf_path: str, force_image_processing: bool = False, payment_amount_formatter: Callable[[str], str] | None = None) -> list[PaymentAmount]:
    """Extract payment-amount candidates from the PDF at *pdf_path*.

    Single-page documents and forced image processing always use image
    extraction. Multi-page documents use text extraction when
    ``is_text_based_pdf`` reports a text layer and image extraction if not.

    Args:
        pdf_path: Path of the PDF file.
        force_image_processing: When True, always use image extraction.
        payment_amount_formatter: Optional callable applied to every
            extracted amount before returning.

    Returns:
        The extracted amounts, formatted if a formatter was supplied.
    """
    with pdfplumber.open(pdf_path) as pdf:
        doc_type = determine_document_type(pdf)

        # Short-circuit order matters: is_text_based_pdf is only consulted
        # for multi-page documents that are not forced onto the image path.
        use_images = (
            doc_type == 'single'
            or force_image_processing
            or not is_text_based_pdf(pdf)
        )
        if use_images:
            payment_amounts = extract_payment_amounts_from_image(pdf)
        else:
            text = extract_text_from_pdf(pdf_path, wrap_pages=True)
            payment_amounts = extract_payment_amounts_from_text(text, doc_type)

        if payment_amount_formatter:
            payment_amounts = list(map(payment_amount_formatter, payment_amounts))

        return payment_amounts
|
|
|
def extract_payment_amounts_from_text(text: str, doc_type: DocumentType) -> list[PaymentAmount]:
    """Extract payment-amount candidates from document text.

    Logs at INFO level and delegates to
    ``extract_payment_amounts_from_text_with_vertex_ai``.

    Args:
        text: Document text to analyse.
        doc_type: Document type label; only used in the log message.

    Returns:
        The extractor's result.
    """
    log_line = f"Extracting payment amounts from {doc_type}-page text-based document"
    remittance_logger.info(log_line)

    amounts = extract_payment_amounts_from_text_with_vertex_ai(text)
    return amounts
|
|
|
def extract_payment_amounts_from_image(pdf: pdfplumber.PDF, dpi: int = 257) -> list[PaymentAmount]:
    """Render every page of *pdf* to PNG and extract payment amounts from the images.

    Args:
        pdf: An open pdfplumber document.
        dpi: Resolution passed to ``page.to_image`` for each page.

    Returns:
        The result of ``extract_payment_amounts_from_base64_images`` for the
        rendered pages, in page order.
    """
    remittance_logger.info(f"Extracting payment amounts from {len(pdf.pages)}-page image-based document")

    def render_page_as_base64(page) -> str:
        # PNG bytes are written to memory and then base64-encoded as text.
        buffer = io.BytesIO()
        page.to_image(resolution=dpi).save(buffer, format='PNG')
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    encoded_pages = [render_page_as_base64(page) for page in pdf.pages]
    return extract_payment_amounts_from_base64_images(encoded_pages)
|
|
|
|
|
def process_pdf(pdf_path: str, force_image_processing: bool = False, force_multi_hop: bool = False, invoice_verifier: InvoiceVerifier | None = None, invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None = None) -> ProcessedPDFResult:
    """Extract invoice numbers and payment amounts from a PDF and verify them.

    Args:
        pdf_path: Path of the PDF file.
        force_image_processing: Forwarded to both extraction steps.
        force_multi_hop: Forwarded to the invoice-number extraction.
        invoice_verifier: Optional verifier for invoice-number candidates.
        invoice_and_amount_verifier: Optional verifier for an
            (invoice list, amount) pair. It is only consulted when exactly
            one invoice list was verified.

    Returns:
        ``(verified_candidate, unverified_candidate)`` where each candidate
        is an ``(invoice_numbers, payment_amounts)`` tuple. The verified
        amounts hold at most one entry: the first amount the pair verifier
        accepted.
    """
    verified_invoice_numbers, unverified_invoice_numbers = extract_invoices_from_pdf(
        pdf_path,
        force_image_processing,
        invoice_verifier,
        force_multi_hop=force_multi_hop,
    )
    payment_amounts = extract_payment_amounts_from_pdf(pdf_path, force_image_processing, payment_amount_formatter=format_amount_str_to_decimal)
    remittance_logger.debug(f"Extracted payment amounts: {payment_amounts}")

    verified_payment_amounts = []
    if invoice_and_amount_verifier and len(verified_invoice_numbers) == 1:
        sole_invoice_list = verified_invoice_numbers[0]
        # Sentinel lets us tell "nothing accepted" apart from a falsy amount.
        nothing_accepted = object()
        first_accepted = next(
            (
                amount
                for amount in payment_amounts
                if invoice_and_amount_verifier(sole_invoice_list, amount)
            ),
            nothing_accepted,
        )
        if first_accepted is not nothing_accepted:
            verified_payment_amounts = [first_accepted]

    return (
        (verified_invoice_numbers, verified_payment_amounts),
        (unverified_invoice_numbers, payment_amounts),
    )
|
|
|
|
|
|
|
def process_pdf_with_flow(
    pdf_path: str,
    invoice_verifier: InvoiceVerifier | None = None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None = None
) -> ProcessedPDFResult:
    """
    Process a PDF file using a fixed sequence of extraction stages.

    Stages run in this order: single-hop text, multi-hop text (both only
    when the PDF is text-based), single-hop image, multi-hop image. The
    first stage that yields exactly one verified invoice list together with
    exactly one verified amount is returned immediately. Otherwise each
    stage's candidates are merged, without duplicates, into running lists.

    Args:
        pdf_path (str): Path to the PDF file.
        invoice_verifier (InvoiceVerifier | None): Function to verify invoice numbers.
        invoice_and_amount_verifier (InvoiceListAndAmountVerifier | None): Function to verify invoice numbers and amount pairs.

    Returns:
        ProcessedPDFResult: A tuple containing verified and unverified candidates.
        This is the result of the winning stage, or the accumulated
        candidates of all stages when no stage produced a single verified pair.
    """
    all_verified_invoices: list[InvoiceNumbers] = []
    all_verified_amounts: list[PaymentAmount] = []
    all_unverified_invoices: list[InvoiceNumbers] = []
    all_unverified_amounts: list[PaymentAmount] = []

    with pdfplumber.open(pdf_path) as pdf:
        # Each stage is (label for the debug log, processor, its input, multi_hop).
        stages = []
        if is_text_based_pdf(pdf):
            text = extract_text_from_pdf(pdf_path, wrap_pages=True)
            stages.append(("single hop text processing", process_text_based, text, False))
            stages.append(("multi hop text processing", process_text_based, text, True))
        # Image stages always run, also as a fallback for text-based PDFs.
        stages.append(("single hop image processing", process_image_based, pdf, False))
        stages.append(("multi hop image processing", process_image_based, pdf, True))

        for label, processor, source, multi_hop in stages:
            result = processor(source, invoice_verifier, invoice_and_amount_verifier, multi_hop=multi_hop)
            if has_single_verified_pair(result):
                return result
            accumulate_candidates(result, all_verified_invoices, all_verified_amounts, all_unverified_invoices, all_unverified_amounts)
            remittance_logger.debug(f"Result snapshot - {label}: {result}")

    return (all_verified_invoices, all_verified_amounts), (all_unverified_invoices, all_unverified_amounts)
|
|
|
def process_text_based(
    text: str,
    invoice_verifier: InvoiceVerifier | None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None,
    multi_hop: bool,
) -> ProcessedPDFResult:
    """Run text extraction for invoice numbers and amounts, then verify them.

    Both extractors are called with the document type ``'multi'``.

    Args:
        text: Document text to analyse.
        invoice_verifier: Optional verifier for invoice-number candidates.
        invoice_and_amount_verifier: Optional verifier for invoice/amount pairs.
        multi_hop: Forwarded to the invoice-number extraction.

    Returns:
        The result of ``verify_candidates`` for the extracted candidates.
    """
    # Arguments are evaluated left to right: invoice numbers first, then amounts.
    return verify_candidates(
        extract_invoice_numbers_from_text(text, 'multi', multi_hop),
        extract_payment_amounts_from_text(text, 'multi'),
        invoice_verifier,
        invoice_and_amount_verifier,
    )
|
|
|
def process_image_based(
    pdf: pdfplumber.PDF,
    invoice_verifier: InvoiceVerifier | None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None,
    multi_hop: bool,
) -> ProcessedPDFResult:
    """Run image extraction for invoice numbers and amounts, then verify them.

    Args:
        pdf: An open pdfplumber document.
        invoice_verifier: Optional verifier for invoice-number candidates.
        invoice_and_amount_verifier: Optional verifier for invoice/amount pairs.
        multi_hop: Forwarded to the invoice-number extraction.

    Returns:
        The result of ``verify_candidates`` for the extracted candidates.
    """
    # Arguments are evaluated left to right: invoice numbers first, then amounts.
    return verify_candidates(
        extract_invoice_numbers_from_image(pdf, multi_hop),
        extract_payment_amounts_from_image(pdf),
        invoice_verifier,
        invoice_and_amount_verifier,
    )
|
|
|
def verify_candidates(
    invoice_numbers: list[InvoiceNumbers],
    payment_amounts: list[PaymentAmount],
    invoice_verifier: InvoiceVerifier | None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None
) -> ProcessedPDFResult:
    """Verify extracted invoice-number and payment-amount candidates.

    Args:
        invoice_numbers: Candidate invoice-number lists.
        payment_amounts: Candidate payment amounts.
        invoice_verifier: Optional callable applied to each invoice
            candidate; its truthy return values form the verified list.
        invoice_and_amount_verifier: Optional callable taking the verified
            invoice list and an amount. It is only consulted when exactly
            one invoice candidate was verified.

    Returns:
        ``((verified_invoices, verified_amounts), (invoice_numbers,
        payment_amounts))``. ``verified_amounts`` holds at most one entry:
        the first amount accepted by the pair verifier.
    """
    verified_invoices = []
    verified_amounts = []

    if invoice_verifier:
        # Call the verifier exactly once per candidate and keep its result.
        # Calling it once to filter and again for the value doubles the
        # work and could disagree if the verifier is not deterministic.
        verified_invoices = [
            checked
            for candidate in invoice_numbers
            if (checked := invoice_verifier(candidate))
        ]

    if invoice_and_amount_verifier and len(verified_invoices) == 1:
        for amount in payment_amounts:
            if invoice_and_amount_verifier(verified_invoices[0], amount):
                # Only the first accepted amount is kept.
                verified_amounts = [amount]
                break

    return (verified_invoices, verified_amounts), (invoice_numbers, payment_amounts)
|
|
|
def has_single_verified_pair(result: ProcessedPDFResult) -> bool:
    """Tell whether *result* holds exactly one verified invoice list and one verified amount.

    Only the verified half of the result is inspected.
    """
    verified, _ = result
    if len(verified[0]) != 1:
        return False
    return len(verified[1]) == 1
|
|
|
def accumulate_candidates(
    result: ProcessedPDFResult,
    all_verified_invoices: list[InvoiceNumbers],
    all_verified_amounts: list[PaymentAmount],
    all_unverified_invoices: list[InvoiceNumbers],
    all_unverified_amounts: list[PaymentAmount],
) -> None:
    """Merge one processing result into the running candidate lists, in place.

    Items already present in a target list are skipped. A list-valued item
    counts as present when an existing entry has the same set of elements
    (order and repeats are ignored); any other item is compared with ``in``.

    Args:
        result: ``(verified, unverified)`` where each part is an
            ``(invoices, amounts)`` pair.
        all_verified_invoices: Target list for verified invoice lists.
        all_verified_amounts: Target list for verified amounts.
        all_unverified_invoices: Target list for unverified invoice lists.
        all_unverified_amounts: Target list for unverified amounts.
    """
    def is_known(candidate, pool: list) -> bool:
        if not isinstance(candidate, list):
            return candidate in pool
        for existing in pool:
            if set(candidate) == set(existing):
                return True
        return False

    def merge_into(target: list, incoming: list) -> None:
        # Checked one by one so duplicates inside `incoming` are dropped too.
        for candidate in incoming:
            if not is_known(candidate, target):
                target.append(candidate)

    verified, unverified = result

    merge_into(all_verified_invoices, verified[0])
    merge_into(all_verified_amounts, verified[1])
    merge_into(all_unverified_invoices, unverified[0])
    merge_into(all_unverified_amounts, unverified[1])