File size: 4,910 Bytes
7850a69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File: remittance_pdf_processing_cli.py

import argparse
from remittance_pdf_processor import process_pdf, extract_text_from_pdf
from ccmt_verifier_aus import verify_invoice_numbers
from remittance_pdf_processing_types import Candidate, ProcessedPDFResult

def _settings_lines(multi_hop, verify):
    """Return the report header lines describing the active processing options."""
    return [
        "Extracted invoice numbers and payment amounts:",
        f"Multi-hop processing: {'Enabled' if multi_hop else 'Disabled'}",
        f"Verification: {'Enabled' if verify else 'Disabled'}",
    ]


def _verified_lines(verified_candidate):
    """Return the report lines for the verified candidate.

    ``verified_candidate`` is indexed as ``[0]`` (a sequence of invoice-number
    lists) and ``[1]`` (a sequence of amounts).
    """
    invoice_lists = verified_candidate[0]
    amounts = verified_candidate[1]

    if len(invoice_lists) == 1 and len(amounts) == 1:
        return [
            f"  Verified Invoice Numbers: {', '.join(invoice_lists[0])}",
            f"  Verified Amount: {amounts[0]}",
        ]
    if len(invoice_lists) == 1 and len(amounts) == 0:
        return [
            f"  Verified Invoice Numbers: {', '.join(invoice_lists[0])}",
            "  Amount doesn't match. This might be a partial payment.",
        ]
    if len(invoice_lists) >= 2:
        lines = ["  Verified Invoice Numbers:"]
        lines.extend(
            f"    List {i}: {', '.join(invoice_numbers)}"
            for i, invoice_numbers in enumerate(invoice_lists, 1)
        )
        return lines
    # NOTE(review): exactly one invoice list together with two or more amounts
    # also lands here and is reported as "no verified results" -- confirm that
    # this combination cannot occur or is intended to be reported this way.
    return ["  No verified results."]


def _candidate_lines(verified_candidate, unverified_candidate):
    """Return the report body: verified results, then unverified candidates.

    An empty string in the returned list stands for the blank separator line
    that precedes each "Unverified ..." heading.
    """
    lines = ["Verified Results:"]
    lines.extend(_verified_lines(verified_candidate))

    lines.append("")
    lines.append("Unverified Invoice Numbers:")
    lines.extend(
        f"  Candidate {i}: {', '.join(invoice_numbers)}"
        for i, invoice_numbers in enumerate(unverified_candidate[0], 1)
    )

    lines.append("")
    lines.append("Unverified Payment Amounts:")
    lines.extend(
        f"  Candidate {i}: {amount}"
        for i, amount in enumerate(unverified_candidate[1], 1)
    )
    return lines


def main():
    """Command-line entry point: process a remittance PDF and report the results.

    Parses the command-line arguments, extracts the raw text of the PDF, runs
    ``process_pdf`` on it (optionally with invoice verification and forced
    image / multi-hop processing), prints the verified and unverified
    candidates to stdout and, when ``--output`` is given, writes the extracted
    text followed by the same report to that file.
    """
    parser = argparse.ArgumentParser(description="Extract text and invoice numbers from a PDF file.")
    parser.add_argument("pdf_file", help="Path to the PDF file")
    parser.add_argument("-o", "--output", help="Output file path for extracted text (optional)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Print extracted text from PDF")
    parser.add_argument("-f", "--force-image", action="store_true", help="Force processing PDF as image")
    parser.add_argument("--verify", action="store_true", help="Enable invoice number verification")
    parser.add_argument("--multi-hop", action="store_true", help="Force multi-hop processing")

    args = parser.parse_args()

    extracted_text = extract_text_from_pdf(args.pdf_file, wrap_pages=True)

    # Verification is opt-in: without --verify no verifier is handed to process_pdf.
    invoice_verifier = verify_invoice_numbers if args.verify else None

    extracted_result: ProcessedPDFResult = process_pdf(
        args.pdf_file,
        force_image_processing=args.force_image,
        force_multi_hop=args.multi_hop,
        invoice_verifier=invoice_verifier,
    )

    verified_candidate, unverified_candidate = extracted_result

    # Build the report once so that stdout and the output file cannot drift apart.
    header_lines = _settings_lines(args.multi_hop, args.verify)
    body_lines = _candidate_lines(verified_candidate, unverified_candidate)
    separator = "-" * 40

    if args.verbose:
        print("Extracted text from PDF:")
        print(separator)
        print(extracted_text)
        print(separator)
        print()

    print("\n".join(header_lines))
    print(separator)
    print("\n".join(body_lines))
    print(separator)

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write("Extracted text:\n")
            f.write(extracted_text)
            f.write("\n\n")
            # The file report has no dashed separators, unlike the stdout report.
            f.write("\n".join(header_lines + body_lines) + "\n")
        print(f"Extracted text and invoice numbers written to {args.output}")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()