File size: 15,567 Bytes
7850a69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
import pdfplumber
from typing import Optional, Callable, Literal
import base64
import io
from PIL import Image
from remittance_pdf_processing_utils import remittance_logger, format_amount_str_to_decimal
from vertex_api_invoice_extractor import extract_invoice_numbers_with_vertex_ai, extract_invoice_numbers_from_text_with_vertex_ai, extract_payment_amounts_with_vertex_ai, extract_payment_amounts_from_text_with_vertex_ai
# from dspy_invoice_extractors import SinglePageInvoiceExtractor, MultiPageInvoiceExtractor
from remittance_pdf_processing_types import InvoiceNumbers, InvoiceVerifier, DocumentType, ExtractorFunction, PaymentAmount, Candidate, ProcessedPDFResult, InvoiceListAndAmountVerifier
from anthropic_api_invoice_extractor import extract_invoice_numbers_with_anthropic_ai, extract_payment_amounts_with_anthropic_ai

def is_text_based_pdf(pdf: pdfplumber.PDF) -> bool:
	"""Report whether any page of ``pdf`` carries a usable text layer.

	A single page whose extracted text is longer than 100 characters is
	enough for the whole document to count as text-based.

	Args:
		pdf: An open pdfplumber document; only ``pdf.pages`` and each page's
			``extract_text()`` are used.

	Returns:
		True if at least one page exceeds the threshold, otherwise False
		(this includes a document without pages).
	"""
	text_threshold = 100  # Minimum number of characters to consider it text-based
	# extract_text() may hand back None for a page without text on some
	# pdfplumber versions; treat that as empty instead of failing in len().
	return any(
		len(page.extract_text() or "") > text_threshold
		for page in pdf.pages
	)

def determine_document_type(pdf: pdfplumber.PDF) -> DocumentType:
	"""Classify ``pdf`` by its page count.

	Args:
		pdf: An open pdfplumber document; only ``len(pdf.pages)`` is used.

	Returns:
		'single' when the document has exactly one page, 'multi' for every
		other page count (zero pages included).
	"""
	page_count = len(pdf.pages)
	if page_count != 1:
		return 'multi'
	return 'single'

def extract_text_from_pdf(pdf_path: str, wrap_pages: bool = False) -> str:
	"""Extract the text of every page of the PDF at ``pdf_path``.

	Args:
		pdf_path: Path of the PDF file, opened with pdfplumber.
		wrap_pages: When False, the page texts are joined with newlines.
			When True, each page is wrapped in ``<page_N>`` tags (N starts
			at 1) and the whole document in a ``<remittance>`` element.

	Returns:
		The combined document text in the requested layout.
	"""
	with pdfplumber.open(pdf_path) as pdf:
		# A page without text may produce None on some pdfplumber versions;
		# normalise to "" so join() cannot fail and no literal "None" ends
		# up inside the wrapped output.
		page_texts = [page.extract_text() or "" for page in pdf.pages]

	if not wrap_pages:
		return "\n".join(page_texts)

	wrapped_pages = [
		f"<page_{i}>\n{page_text}\n</page_{i}>"
		for i, page_text in enumerate(page_texts, start=1)
	]
	all_pages_text = "\n".join(wrapped_pages)
	return f"<remittance>\n{all_pages_text}\n</remittance>"

# def InvoiceExtractor(doc_type: DocumentType) -> ExtractorFunction:
# 	if doc_type == 'single':
# 		def single_page_extractor(text: str) -> list[InvoiceNumbers]:
# 			return []
# 		return single_page_extractor
# 	else:
# 		def multi_page_extractor(text: str) -> list[InvoiceNumbers]:
# 			return []
# 		return multi_page_extractor

def extract_invoice_numbers_from_text(
	text: str,
	doc_type: DocumentType,
	multi_hop: bool = False
) -> list[InvoiceNumbers]:
	"""Extract invoice-number candidates from already-extracted PDF text.

	Args:
		text: Document text to analyse.
		doc_type: Page-count class of the document; only used in the log line.
		multi_hop: Flag forwarded unchanged to the Vertex AI text extractor.

	Returns:
		The result of ``extract_invoice_numbers_from_text_with_vertex_ai``.
	"""
	remittance_logger.info(f"Extracting invoice numbers from {doc_type}-page text-based document (multi_hop: {multi_hop})")

	# The extraction itself is delegated to the Vertex AI text extractor.
	candidates = extract_invoice_numbers_from_text_with_vertex_ai(text, multi_hop)
	return candidates

def extract_invoice_numbers_from_single_base64_image(base64_image: str, multi_hop: bool = False) -> list[InvoiceNumbers]:
	"""Extract invoice-number candidates from one base64-encoded page image.

	Args:
		base64_image: Base64 text of the page image.
		multi_hop: Flag forwarded unchanged to the Vertex AI image extractor.

	Returns:
		The result of ``extract_invoice_numbers_with_vertex_ai``.
	"""
	remittance_logger.debug(f"Extracting invoice numbers from a single base64 image using Vertex AI (multi_hop: {multi_hop})")
	candidates = extract_invoice_numbers_with_vertex_ai(base64_image, multi_hop)
	return candidates

def extract_invoice_numbers_from_multi_page_images(base64_images: list[str], multi_hop: bool = False) -> list[InvoiceNumbers]:
	"""Extract invoice-number candidates from several base64 page images.

	Args:
		base64_images: Base64 text of each page image.
		multi_hop: Flag forwarded unchanged to the Anthropic extractor.

	Returns:
		The result of ``extract_invoice_numbers_with_anthropic_ai``.
	"""
	remittance_logger.debug(f"Extracting invoice numbers from {len(base64_images)} base64 images using Anthropic AI (multi_hop: {multi_hop})")
	candidates = extract_invoice_numbers_with_anthropic_ai(base64_images, multi_hop)
	return candidates

def extract_invoice_numbers_from_base64_images(base64_images: list[str], multi_hop: bool = False) -> list[InvoiceNumbers]:
	"""Route page images to the single-image or multi-image extractor.

	Args:
		base64_images: Base64 text of each page image.
		multi_hop: Flag forwarded unchanged to the chosen extractor.

	Returns:
		The invoice-number candidates returned by the chosen extractor.
	"""
	remittance_logger.info(f"Extracting invoice numbers from {len(base64_images)} base64 image(s) (multi_hop: {multi_hop})")

	if len(base64_images) != 1:
		# Any count other than exactly one (an empty list too) takes the
		# multi-page route.
		return extract_invoice_numbers_from_multi_page_images(base64_images, multi_hop)

	only_image = base64_images[0]
	return extract_invoice_numbers_from_single_base64_image(only_image, multi_hop)

def extract_invoice_numbers_from_image(
	pdf: pdfplumber.PDF,
	multi_hop: bool = False,
	dpi: int = 257 # Number chosen for optimal resolution for Gemini Flash 1.5 model
) -> list[InvoiceNumbers]:
	"""Render every page of ``pdf`` to PNG and extract invoice numbers from the images.

	Args:
		pdf: An open pdfplumber document.
		multi_hop: Flag forwarded unchanged to the image extractor.
		dpi: Resolution passed to ``page.to_image``.

	Returns:
		The invoice-number candidates found in the rendered pages.
	"""
	remittance_logger.info(f"Extracting invoice numbers from {len(pdf.pages)}-page image-based document (multi_hop: {multi_hop})")

	def render_page_as_base64(page) -> str:
		# Page -> in-memory PNG -> base64 text.
		png_buffer = io.BytesIO()
		page.to_image(resolution=dpi).save(png_buffer, format='PNG')
		return base64.b64encode(png_buffer.getvalue()).decode('utf-8')

	encoded_pages = [render_page_as_base64(page) for page in pdf.pages]
	return extract_invoice_numbers_from_base64_images(encoded_pages, multi_hop)

def extract_invoices_from_pdf(pdf_path: str, force_image_processing: bool = False, invoice_verifier: InvoiceVerifier | None = None, force_multi_hop: bool = False) -> tuple[list[InvoiceNumbers], list[InvoiceNumbers]]:
	"""Extract invoice-number candidates from a PDF, escalating to multi-hop if needed.

	The document is read as text when it has a text layer and image
	processing is not forced; otherwise its pages are rendered to images.
	Single-hop extraction is tried first and multi-hop second, unless
	``force_multi_hop`` restricts the attempt to multi-hop only. The first
	attempt that yields any candidates ends the search.

	Args:
		pdf_path: Path of the PDF file.
		force_image_processing: Skip the text route even for text-based PDFs.
		invoice_verifier: Optional callable applied to each candidate; only
			truthy results are kept as verified.
		force_multi_hop: Only run the multi-hop attempt.

	Returns:
		``(verified, candidates)``. ``verified`` is empty when no verifier is
		supplied. Both are empty when no attempt produced candidates.
	"""
	with pdfplumber.open(pdf_path) as pdf:
		doc_type = determine_document_type(pdf)

		# The extraction route and the document text do not depend on the hop
		# mode, so work them out once instead of re-probing (and re-reading
		# the file) on every loop iteration.
		use_text = not force_image_processing and is_text_based_pdf(pdf)
		text = extract_text_from_pdf(pdf_path, wrap_pages=True) if use_text else None

		hop_modes = [True] if force_multi_hop else [False, True]
		for multi_hop in hop_modes:
			if use_text:
				invoice_numbers_candidates = extract_invoice_numbers_from_text(text, doc_type, multi_hop=multi_hop)
			else:
				invoice_numbers_candidates = extract_invoice_numbers_from_image(pdf, multi_hop=multi_hop)

			# When there's no verifier, the verified list stays empty.
			verified_result = []
			if invoice_verifier:
				for invoice_numbers in invoice_numbers_candidates:
					verified = invoice_verifier(invoice_numbers)
					if verified:  # drop candidates the verifier rejected
						verified_result.append(verified)

			remittance_logger.info(f"Extracted invoice numbers (post verification, multi_hop={multi_hop}): {verified_result}")
			# If we found invoices (either verified or unverified), return them
			if verified_result or invoice_numbers_candidates:
				return verified_result, invoice_numbers_candidates

		# Every attempted hop mode came back empty.
		remittance_logger.warning("No invoice numbers found after trying both single-hop and multi-hop processing.")
		return [], []

def extract_payment_amounts_from_single_base64_image(base64_image: str) -> list[PaymentAmount]:
	"""Extract payment-amount candidates from one base64-encoded page image.

	Args:
		base64_image: Base64 text of the page image.

	Returns:
		The result of ``extract_payment_amounts_with_vertex_ai``.
	"""
	remittance_logger.debug("Extracting payment amounts from a single base64 image using Vertex AI")
	amounts = extract_payment_amounts_with_vertex_ai(base64_image)
	return amounts

def extract_payment_amounts_from_multi_page_images(base64_images: list[str]) -> list[PaymentAmount]:
	"""Extract payment-amount candidates from several base64 page images.

	Args:
		base64_images: Base64 text of each page image.

	Returns:
		The result of ``extract_payment_amounts_with_anthropic_ai``.
	"""
	remittance_logger.debug(f"Extracting payment amounts from {len(base64_images)} base64 images using Anthropic AI")
	amounts = extract_payment_amounts_with_anthropic_ai(base64_images)
	return amounts

def extract_payment_amounts_from_base64_images(base64_images: list[str]) -> list[PaymentAmount]:
	"""Route page images to the single-image or multi-image amount extractor.

	Args:
		base64_images: Base64 text of each page image.

	Returns:
		The payment-amount candidates returned by the chosen extractor.
	"""
	remittance_logger.info(f"Extracting payment amounts from {len(base64_images)} base64 image(s)")

	if len(base64_images) != 1:
		# Any count other than exactly one (an empty list too) takes the
		# multi-page route.
		return extract_payment_amounts_from_multi_page_images(base64_images)

	only_image = base64_images[0]
	return extract_payment_amounts_from_single_base64_image(only_image)

def extract_payment_amounts_from_pdf(pdf_path: str, force_image_processing: bool = False, payment_amount_formatter: Callable[[str], str] | None = None) -> list[PaymentAmount]:
	"""Extract payment-amount candidates from the PDF at ``pdf_path``.

	Single-page documents, forced image processing, and documents without a
	text layer go through page images; multi-page text documents are read
	as text.

	Args:
		pdf_path: Path of the PDF file.
		force_image_processing: Use the image route regardless of text layer.
		payment_amount_formatter: Optional callable applied to every
			extracted amount before returning.

	Returns:
		The extracted amounts, formatted when a formatter is supplied.
	"""
	with pdfplumber.open(pdf_path) as pdf:
		doc_type = determine_document_type(pdf)

		# Short-circuit order matters: the text-layer probe only runs for
		# multi-page documents when image processing is not forced.
		go_via_image = (
			doc_type == 'single'
			or force_image_processing
			or not is_text_based_pdf(pdf)
		)
		if go_via_image:
			payment_amounts = extract_payment_amounts_from_image(pdf)
		else:
			text = extract_text_from_pdf(pdf_path, wrap_pages=True)
			payment_amounts = extract_payment_amounts_from_text(text, doc_type)

	if not payment_amount_formatter:
		return payment_amounts
	return [payment_amount_formatter(amount) for amount in payment_amounts]

def extract_payment_amounts_from_text(text: str, doc_type: DocumentType) -> list[PaymentAmount]:
	"""Extract payment-amount candidates from already-extracted PDF text.

	Args:
		text: Document text to analyse.
		doc_type: Page-count class of the document; only used in the log line.

	Returns:
		The result of ``extract_payment_amounts_from_text_with_vertex_ai``.
	"""
	remittance_logger.info(f"Extracting payment amounts from {doc_type}-page text-based document")

	# The extraction itself is delegated to the Vertex AI text extractor.
	amounts = extract_payment_amounts_from_text_with_vertex_ai(text)
	return amounts

def extract_payment_amounts_from_image(pdf: pdfplumber.PDF, dpi: int = 257) -> list[PaymentAmount]:
	"""Render every page of ``pdf`` to PNG and extract payment amounts from the images.

	Args:
		pdf: An open pdfplumber document.
		dpi: Resolution passed to ``page.to_image``.

	Returns:
		The payment-amount candidates found in the rendered pages.
	"""
	remittance_logger.info(f"Extracting payment amounts from {len(pdf.pages)}-page image-based document")

	def render_page_as_base64(page) -> str:
		# Page -> in-memory PNG -> base64 text.
		png_buffer = io.BytesIO()
		page.to_image(resolution=dpi).save(png_buffer, format='PNG')
		return base64.b64encode(png_buffer.getvalue()).decode('utf-8')

	encoded_pages = [render_page_as_base64(page) for page in pdf.pages]
	return extract_payment_amounts_from_base64_images(encoded_pages)


def process_pdf(pdf_path: str, force_image_processing: bool = False, force_multi_hop: bool = False, invoice_verifier: InvoiceVerifier | None = None, invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None = None) -> ProcessedPDFResult:
	"""Extract invoice numbers and payment amounts from a PDF and verify them.

	Args:
		pdf_path: Path of the PDF file.
		force_image_processing: Forwarded to both extraction steps.
		force_multi_hop: Forwarded to the invoice-number extraction.
		invoice_verifier: Optional verifier for invoice-number candidates.
		invoice_and_amount_verifier: Optional verifier for an (invoice list,
			amount) pair; only consulted when exactly one verified invoice
			list exists.

	Returns:
		``(verified_candidate, unverified_candidate)``, each an
		``(invoice_numbers, payment_amounts)`` tuple. The verified amounts
		hold at most one entry: the first amount the pair verifier accepts.
	"""
	verified_invoice_numbers, unverified_invoice_numbers = extract_invoices_from_pdf(
		pdf_path,
		force_image_processing,
		invoice_verifier,
		force_multi_hop=force_multi_hop
	)
	payment_amounts = extract_payment_amounts_from_pdf(
		pdf_path,
		force_image_processing,
		payment_amount_formatter=format_amount_str_to_decimal
	)
	remittance_logger.debug(f"Extracted payment amounts: {payment_amounts}")

	verified_payment_amounts = []
	if invoice_and_amount_verifier and len(verified_invoice_numbers) == 1:
		sole_invoice_list = verified_invoice_numbers[0]
		# Lazily scan for the first accepted amount; wrapping it in a list
		# keeps a falsy amount distinguishable from "no match".
		first_match = next(
			(
				[amount]
				for amount in payment_amounts
				if invoice_and_amount_verifier(sole_invoice_list, amount)
			),
			None,
		)
		if first_match is not None:
			verified_payment_amounts = first_match

	return (
		(verified_invoice_numbers, verified_payment_amounts),
		(unverified_invoice_numbers, payment_amounts),
	)

# from typing import list, tuple

def process_pdf_with_flow(
    pdf_path: str,
    invoice_verifier: InvoiceVerifier | None = None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None = None
) -> ProcessedPDFResult:
    """
    Process a PDF file using a fixed escalation of extraction methods.

    Stages run in this order, and the first stage that yields exactly one
    verified invoice list together with exactly one verified amount is
    returned immediately:

    1. single hop text processing (text-based PDFs only)
    2. multi hop text processing (text-based PDFs only)
    3. single hop image processing
    4. multi hop image processing

    Args:
    pdf_path (str): Path to the PDF file.
    invoice_verifier (InvoiceVerifier | None): Function to verify invoice numbers.
    invoice_and_amount_verifier (InvoiceListAndAmountVerifier | None): Function to verify invoice numbers and amount pairs.

    Returns:
    ProcessedPDFResult: A tuple containing verified and unverified candidates.
    If no stage produced a single verified pair, the de-duplicated
    candidates accumulated over all stages are returned.
    """
    all_verified_invoices: list[InvoiceNumbers] = []
    all_verified_amounts: list[PaymentAmount] = []
    all_unverified_invoices: list[InvoiceNumbers] = []
    all_unverified_amounts: list[PaymentAmount] = []

    with pdfplumber.open(pdf_path) as pdf:
        # Ordered (label, thunk) pairs. Each thunk is only invoked when its
        # stage is reached, so later stages cost nothing after an early return.
        stages = []

        if is_text_based_pdf(pdf):
            text = extract_text_from_pdf(pdf_path, wrap_pages=True)
            stages.append((
                "single hop text processing",
                lambda: process_text_based(text, invoice_verifier, invoice_and_amount_verifier, multi_hop=False),
            ))
            stages.append((
                "multi hop text processing",
                lambda: process_text_based(text, invoice_verifier, invoice_and_amount_verifier, multi_hop=True),
            ))

        stages.append((
            "single hop image processing",
            lambda: process_image_based(pdf, invoice_verifier, invoice_and_amount_verifier, multi_hop=False),
        ))
        stages.append((
            "multi hop image processing",
            lambda: process_image_based(pdf, invoice_verifier, invoice_and_amount_verifier, multi_hop=True),
        ))

        for label, run_stage in stages:
            result = run_stage()
            if has_single_verified_pair(result):
                return result
            accumulate_candidates(result, all_verified_invoices, all_verified_amounts, all_unverified_invoices, all_unverified_amounts)
            remittance_logger.debug(f"Result snapshot - {label}: {result}")

    # If no single verified pair is found, return all accumulated candidates
    return (all_verified_invoices, all_verified_amounts), (all_unverified_invoices, all_unverified_amounts)

def process_text_based(
    text: str,
    invoice_verifier: InvoiceVerifier | None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None,
    multi_hop: bool
) -> ProcessedPDFResult:
    """
    Extract invoice numbers and payment amounts from text, then verify them.

    Args:
    text (str): Document text to analyse.
    invoice_verifier (InvoiceVerifier | None): Function to verify invoice numbers.
    invoice_and_amount_verifier (InvoiceListAndAmountVerifier | None): Function to verify invoice numbers and amount pairs.
    multi_hop (bool): Forwarded to the invoice-number extraction only.

    Returns:
    ProcessedPDFResult: The outcome of verify_candidates for the extracted values.
    """
    # Both extractors are always told the document type is 'multi' here.
    candidate_invoices = extract_invoice_numbers_from_text(
        text=text,
        doc_type='multi',
        multi_hop=multi_hop,
    )
    candidate_amounts = extract_payment_amounts_from_text(text=text, doc_type='multi')

    return verify_candidates(
        invoice_numbers=candidate_invoices,
        payment_amounts=candidate_amounts,
        invoice_verifier=invoice_verifier,
        invoice_and_amount_verifier=invoice_and_amount_verifier,
    )

def process_image_based(
    pdf: pdfplumber.PDF,
    invoice_verifier: InvoiceVerifier | None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None,
    multi_hop: bool
) -> ProcessedPDFResult:
    """
    Extract invoice numbers and payment amounts from page images, then verify them.

    Args:
    pdf (pdfplumber.PDF): An open pdfplumber document.
    invoice_verifier (InvoiceVerifier | None): Function to verify invoice numbers.
    invoice_and_amount_verifier (InvoiceListAndAmountVerifier | None): Function to verify invoice numbers and amount pairs.
    multi_hop (bool): Forwarded to the invoice-number extraction only.

    Returns:
    ProcessedPDFResult: The outcome of verify_candidates for the extracted values.
    """
    candidate_invoices = extract_invoice_numbers_from_image(pdf=pdf, multi_hop=multi_hop)
    candidate_amounts = extract_payment_amounts_from_image(pdf=pdf)

    return verify_candidates(
        invoice_numbers=candidate_invoices,
        payment_amounts=candidate_amounts,
        invoice_verifier=invoice_verifier,
        invoice_and_amount_verifier=invoice_and_amount_verifier,
    )

def verify_candidates(
    invoice_numbers: list[InvoiceNumbers],
    payment_amounts: list[PaymentAmount],
    invoice_verifier: InvoiceVerifier | None,
    invoice_and_amount_verifier: InvoiceListAndAmountVerifier | None
) -> ProcessedPDFResult:
    """
    Split extracted candidates into verified and unverified results.

    Args:
    invoice_numbers (list[InvoiceNumbers]): Candidate invoice-number lists.
    payment_amounts (list[PaymentAmount]): Candidate payment amounts.
    invoice_verifier (InvoiceVerifier | None): Called once per candidate; truthy return values are kept as verified invoices.
    invoice_and_amount_verifier (InvoiceListAndAmountVerifier | None): Called with the single verified invoice list and each amount in turn, until one is accepted.

    Returns:
    ProcessedPDFResult: ((verified_invoices, verified_amounts), (invoice_numbers, payment_amounts)).
    verified_amounts holds at most one amount and is only filled when exactly
    one verified invoice list exists.
    """
    verified_invoices = []
    verified_amounts = []

    if invoice_verifier:
        for candidate in invoice_numbers:
            # Invoke the verifier exactly once per candidate and reuse the
            # result for both the filter and the stored value.
            verified = invoice_verifier(candidate)
            if verified:
                verified_invoices.append(verified)

    if invoice_and_amount_verifier and len(verified_invoices) == 1:
        for amount in payment_amounts:
            if invoice_and_amount_verifier(verified_invoices[0], amount):
                verified_amounts = [amount]
                break  # the first accepted amount wins

    return (verified_invoices, verified_amounts), (invoice_numbers, payment_amounts)

def has_single_verified_pair(result: ProcessedPDFResult) -> bool:
    """
    Tell whether a result holds exactly one verified invoice list and one verified amount.

    Args:
    result (ProcessedPDFResult): A (verified, unverified) pair; only the verified part is inspected.

    Returns:
    bool: True when both verified collections have length 1.
    """
    verified_candidate, _unverified_candidate = result
    if len(verified_candidate[0]) != 1:
        return False
    return len(verified_candidate[1]) == 1

def accumulate_candidates(
    result: ProcessedPDFResult,
    all_verified_invoices: list[InvoiceNumbers],
    all_verified_amounts: list[PaymentAmount],
    all_unverified_invoices: list[InvoiceNumbers],
    all_unverified_amounts: list[PaymentAmount]
) -> None:
    """
    Merge one stage's result into the running accumulator lists, in place.

    List-valued entries (invoice-number groups) count as duplicates when they
    hold the same set of elements, regardless of order; all other entries
    (payment amounts) are compared by equality.

    Args:
    result (ProcessedPDFResult): The (verified, unverified) pair to merge.
    all_verified_invoices (list[InvoiceNumbers]): Receives new verified invoice groups.
    all_verified_amounts (list[PaymentAmount]): Receives new verified amounts.
    all_unverified_invoices (list[InvoiceNumbers]): Receives new unverified invoice groups.
    all_unverified_amounts (list[PaymentAmount]): Receives new unverified amounts.

    Returns:
    None: The four accumulator lists are mutated.
    """
    def already_present(candidate, bucket: list) -> bool:
        if isinstance(candidate, list):
            # Invoice-number group: order-insensitive comparison.
            return any(set(candidate) == set(existing) for existing in bucket)
        # Payment amount: plain membership test.
        return candidate in bucket

    def merge_into(bucket: list, incoming: list) -> None:
        for candidate in incoming:
            if already_present(candidate, bucket):
                continue
            bucket.append(candidate)

    verified, unverified = result

    merge_into(all_verified_invoices, verified[0])
    merge_into(all_verified_amounts, verified[1])
    merge_into(all_unverified_invoices, unverified[0])
    merge_into(all_unverified_amounts, unverified[1])