remittance-poc-with-verifier / remittance_pdf_processing_utils.py
eelang's picture
Upload 8 files
7850a69 verified
import logging
def setup_logger():
    """Return the 'remittance_processing' logger, configuring it on first use.

    The logger itself is set to DEBUG and gets two handlers that share one
    format (timestamp - logger name - level - message):

    * a console ``StreamHandler`` that emits INFO and above;
    * a ``FileHandler`` writing DEBUG and above to
      'remittance_processing.log' (a relative path).

    The function is idempotent: ``logging.getLogger`` returns the same
    logger object for the same name, so handlers are attached only when the
    logger has none yet. Without that guard every additional call (or a
    module reload) would add another pair of handlers, duplicating each log
    line and opening the log file again.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger('remittance_processing')
    logger.setLevel(logging.DEBUG)

    # Already configured by an earlier call: do not stack more handlers.
    if logger.handlers:
        return logger

    # Create handlers
    c_handler = logging.StreamHandler()
    f_handler = logging.FileHandler('remittance_processing.log')
    c_handler.setLevel(logging.INFO)
    f_handler.setLevel(logging.DEBUG)

    # Create formatters and add it to handlers
    log_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    c_handler.setFormatter(log_format)
    f_handler.setFormatter(log_format)

    # Add handlers to the logger
    logger.addHandler(c_handler)
    logger.addHandler(f_handler)
    return logger
# Module-level logger instance, created when this module is imported.
remittance_logger = setup_logger()
def remove_duplicate_lists(lists):
    """
    Drop lists that repeat an earlier list, ignoring element order.

    Two lists count as duplicates when they hold the same elements once
    sorted, so ["a", "b"] and ["b", "a"] are treated as the same entry.
    The first occurrence is kept (the original list object, unsorted) and
    the relative order of the survivors is preserved.

    Args:
        lists:
            a list of lists of strings

    Returns:
        a list of lists of strings, where each list is unique
    """
    # Dicts keep insertion order, and setdefault only stores the first list
    # seen for each order-insensitive key.
    first_by_key = {}
    for candidate in lists:
        first_by_key.setdefault(tuple(sorted(candidate)), candidate)
    return list(first_by_key.values())
import re
from decimal import Decimal, ROUND_HALF_UP
def format_amount_str_to_decimal(amount_str: str) -> str:
    """Normalise a free-form amount string to a two-decimal string.

    The input is stripped and lower-cased, its separators are standardised
    (the last ',' or '.' becomes the decimal point, every earlier one is
    dropped as a grouping separator), trailing non-digit text is removed and
    leading non-numeric text is stripped. The result is parsed as a
    ``Decimal`` and rounded half-up to two places.

    Examples: "$1,234.56" -> "1234.56", "1.234,56" -> "1234.56",
    "1,234,567" -> "1234567.00", "USD 500" -> "500.00".

    NOTE: a *single* separator is always read as the decimal point, so
    "1,234" yields "1.23". Only a repeated separator followed by exactly
    three digits is recognised as pure thousands grouping.

    Args:
        amount_str: Raw amount text, possibly with currency symbols/words.

    Returns:
        The amount formatted with exactly two decimals, or, when the cleaned
        text cannot be parsed as a number, the cleaned text itself.
    """
    # Local import: the file-level import only brings in Decimal/ROUND_HALF_UP.
    from decimal import InvalidOperation

    def standardize_number(s):
        # Find the last occurrence of a comma or period
        last_separator_index = max(s.rfind(','), s.rfind('.'))
        if last_separator_index == -1:
            # No separator at all: just drop whitespace
            return re.sub(r'[.,\s]', '', s)

        separator = s[last_separator_index]
        before_separator = s[:last_separator_index]
        after_separator = s[last_separator_index + 1:]
        # Everything before the last separator can only contain grouping
        # separators, so strip commas, periods and whitespace from it.
        before_separator_cleaned = re.sub(r'[.,\s]', '', before_separator)

        # A number cannot have two decimal points. If the same separator
        # also precedes the digit group right before the last one, and the
        # last one is followed by exactly three digits (e.g. "1,234,567"),
        # the last separator is a grouping separator too.
        is_grouping = (
            re.search(r'\d' + re.escape(separator) + r'\d+$', before_separator) is not None
            and re.match(r'\d{3}(?!\d)', after_separator) is not None
        )
        if is_grouping:
            return before_separator_cleaned + after_separator

        # Ensure the decimal part starts with a period, even if it was a comma
        return before_separator_cleaned + '.' + after_separator

    def remove_chars_after_last_digit(s):
        # Remove any non-digit characters following the last digit in the string
        return re.sub(r'(?<=\d)[^\d]*$', '', s)

    def clean_text(s):
        # Drop a run of characters that are neither digits nor '-' (plus one
        # optional whitespace) sitting in front of a token containing a
        # digit, keeping only that token (e.g. "$1234.56" -> "1234.56").
        return re.sub(r'[^\d-]*\s?(\S*\d\S*)', r'\1', s)

    # Run functions to format a text decimal
    standardized = standardize_number(amount_str.strip().lower())
    cleaned_amount = clean_text(remove_chars_after_last_digit(standardized))

    # Convert to Decimal and round to 2 decimal places
    try:
        decimal_amount = Decimal(cleaned_amount).quantize(Decimal('0.01'), rounding=ROUND_HALF_UP)
    except (InvalidOperation, ValueError):
        # Not a parseable number (or too large to quantize): return the
        # cleaned string as is, matching the best-effort contract.
        return cleaned_amount
    return f"{decimal_amount:.2f}"