|
import os
import subprocess
from urllib.parse import unquote_plus

import boto3
|
|
|
print("In lambda_entrypoint function")

try:
    # Create the client at import time so a misconfigured environment fails
    # fast, before any event is handled.
    s3_client = boto3.client("s3", region_name="eu-west-2")
    print("s3_client is initialized:", s3_client)
except Exception as e:
    print(f"Error initializing s3_client: {e}")
    # Bare raise keeps the original exception and traceback intact.
    raise

# Writable scratch space available inside the Lambda sandbox.
TMP_DIR = "/tmp/"

# "0" (the default) runs the Gradio app directly; any other value leaves the
# module in Lambda mode, where lambda_handler() drives processing.
run_direct_mode = os.getenv("RUN_DIRECT_MODE", "0")
|
|
|
if run_direct_mode == "0":
    # Direct mode: import the Gradio app and serve it from this process.
    from app import app, max_queue_size, max_file_size
    from tools.auth import authenticate_user

    # Assemble the launch options once, then branch only on the auth choice:
    # Cognito auth when COGNITO_AUTH=1, otherwise open a local browser tab.
    launch_options = {"show_error": True, "max_file_size": max_file_size}
    if os.getenv("COGNITO_AUTH", "0") == "1":
        launch_options["auth"] = authenticate_user
    else:
        launch_options["inbrowser"] = True

    app.queue(max_size=max_queue_size).launch(**launch_options)
|
|
|
def download_file_from_s3(bucket_name, key, download_path):
    """Copy one object out of S3 onto the local filesystem.

    Args:
        bucket_name: Source S3 bucket.
        key: Object key to fetch.
        download_path: Local destination path.
    """
    confirmation = f"Downloaded {key} to {download_path}"
    s3_client.download_file(bucket_name, key, download_path)
    print(confirmation)
|
|
|
def upload_file_to_s3(file_path, bucket_name, key):
    """Copy one local file into S3.

    Args:
        file_path: Local file to upload.
        bucket_name: Destination S3 bucket.
        key: Object key to write.
    """
    confirmation = f"Uploaded {file_path} to {key}"
    s3_client.upload_file(file_path, bucket_name, key)
    print(confirmation)
|
|
|
def lambda_handler(event, context):
    """Entry point for S3-triggered invocations.

    For each S3 record in *event* (or an ``arguments.input_file`` override
    when a record carries no key), downloads the object into /tmp, runs
    ``app.py`` as a subprocess to process it, and uploads everything found in
    the output directory back to the same bucket under ``output/``.

    Args:
        event: Lambda event dict; S3 notification records live under
            ``Records``, optional overrides under ``arguments``.
        context: Lambda context object (unused).

    Returns:
        ``{"statusCode": 200, "body": "Processing complete."}`` on success.

    Raises:
        Re-raises any download or subprocess failure so Lambda records the
        invocation as failed; ValueError when no input file / bucket can be
        determined.
    """
    print("In lambda_handler function")

    os.makedirs(os.path.join(TMP_DIR, "input"), exist_ok=True)
    os.makedirs(os.path.join(TMP_DIR, "output"), exist_ok=True)

    print("Got to record loop")
    # .get() (not event["Records"]) so direct invocations without S3 records
    # don't crash here; the [{}] fallback still runs the arguments-based path.
    print("Event records is:", event.get("Records", []))

    arguments = event.get("arguments", {})

    # Process every record — previously only the values from the last record
    # survived past the loop.
    for record in event.get("Records", [{}]):
        _process_record(record, arguments)

    return {"statusCode": 200, "body": "Processing complete."}


def _process_record(record, arguments):
    """Download, process, and re-upload the object described by one record."""
    bucket_name = record.get("s3", {}).get("bucket", {}).get("name")
    input_key = record.get("s3", {}).get("object", {}).get("key")

    if input_key:
        # S3 event notifications URL-encode object keys (e.g. spaces arrive
        # as '+'); decode before using the key against the S3 API.
        input_key = unquote_plus(input_key)
    else:
        input_key = arguments.get("input_file", "")

    print(f"Processing file {input_key} from bucket {bucket_name}")

    if not input_key:
        raise ValueError("No input file found in event record or arguments")
    if not bucket_name:
        raise ValueError("No S3 bucket name found in event record")

    # Optional overrides, with the same defaults the UI uses.
    ocr_method = arguments.get("ocr_method", "Complex image analysis - docs with handwriting/signatures (AWS Textract)")
    pii_detector = arguments.get("pii_detector", "AWS Comprehend")
    page_min = str(arguments.get("page_min", 0))
    page_max = str(arguments.get("page_max", 0))
    allow_list = arguments.get("allow_list", None)
    output_dir = arguments.get("output_dir", os.path.join(TMP_DIR, "output"))

    print(f"OCR Method: {ocr_method}")
    print(f"PII Detector: {pii_detector}")
    print(f"Page Range: {page_min} - {page_max}")
    print(f"Allow List: {allow_list}")
    print(f"Output Directory: {output_dir}")

    input_file_path = os.path.join(TMP_DIR, "input", os.path.basename(input_key))
    download_file_from_s3(bucket_name, input_key, input_file_path)

    command = _build_command(input_file_path, ocr_method, pii_detector,
                             page_min, page_max, output_dir)

    if allow_list:
        # The allow list is itself an S3 key; fetch it next to the input.
        allow_list_path = os.path.join(TMP_DIR, "allow_list.csv")
        download_file_from_s3(bucket_name, allow_list, allow_list_path)
        command.extend(["--allow_list", allow_list_path])

    print(f"Running command: {command}")
    _run_processing(command)
    _upload_outputs(output_dir, bucket_name)


def _build_command(input_file_path, ocr_method, pii_detector,
                   page_min, page_max, output_dir):
    """Build the argv list (shell=False) for the app.py subprocess."""
    return [
        "python",
        "app.py",
        "--input_file", input_file_path,
        "--ocr_method", ocr_method,
        "--pii_detector", pii_detector,
        "--page_min", page_min,
        "--page_max", page_max,
        "--output_dir", output_dir,
    ]


def _run_processing(command):
    """Run the subprocess, logging its output; re-raise on any failure."""
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        print("Processing succeeded.")
        print(result.stdout)
    except subprocess.CalledProcessError as e:
        # Surface the child's stderr so the Lambda log shows why it failed.
        print("Error during processing:", e.stderr)
        raise
    except Exception as e:
        print(f"Unexpected error: {str(e)}")
        raise


def _upload_outputs(output_dir, bucket_name):
    """Upload every file under *output_dir* to the bucket's output/ prefix."""
    print("Now uploading files from:", output_dir)
    for root, _, files in os.walk(output_dir):
        for file_name in files:
            print("file_name:", file_name)
            local_file_path = os.path.join(root, file_name)
            # NOTE: keys are flattened — files in sub-directories collapse
            # into a single output/ prefix and may overwrite on name clash.
            output_key = f"output/{file_name}"
            print("Output location is:", output_key)
            upload_file_to_s3(local_file_path, bucket_name, output_key)