Commit 3bbf593
Parent(s): 3dbd1f7

Added config options for compressing output PDFs, for choosing whether to return an output redacted PDF at all, and for changing how long previous Textract jobs are displayed.

Files changed:
- tools/config.py (+17 -4)
- tools/file_conversion.py (+11 -1)
- tools/file_redaction.py (+20 -17)
- tools/redaction_review.py (+5 -4)
- tools/textract_batch_call.py (+10 -6)
tools/config.py
CHANGED
@@ -146,6 +146,10 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
 if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
 if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
 
+###
+# LOGS
+###
+
 # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid at all possible the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
 # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
 
@@ -211,8 +215,12 @@ CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
 
 REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Currently only English is supported by the app
 
+RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
+
+COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF", "True") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
+
 ###
-# APP RUN
+# APP RUN OPTIONS
 ###
 
 TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
@@ -245,7 +253,9 @@ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_a
 if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
 else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
 
-###
+###
+# COST CODE OPTIONS
+###
 
 SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
 
@@ -265,7 +275,9 @@ ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If y
 
 if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
 
-###
+###
+# WHOLE DOCUMENT API OPTIONS
+###
 
 SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
 
@@ -281,5 +293,6 @@ TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') #
 
 TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_INPUT_LOC', 'input') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
 
-
+TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
 
+DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '30') # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
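All three new settings arrive as strings via get_or_create_env_var and are only cast to their working types (bool, bool, int) in the modules that use them, so they can be pinned through environment variables before the app starts. A minimal sketch, not from the repo, with illustrative values:

import os

# Hypothetical overrides; these must be set before tools.config is first imported.
os.environ["RETURN_PDF_END_OF_REDACTION"] = "False"      # send users via 'Review Redactions' before any final PDF
os.environ["COMPRESS_REDACTED_PDF"] = "False"            # basic cleaning only; safer on low-memory hosts
os.environ["DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS"] = "7"  # list only the last week of whole document Textract jobs

import tools.config  # get_or_create_env_var picks up the values above

Note that the boolean options are compared with .lower() == "true" downstream, so "True", "true" and "TRUE" all enable them; any other value disables them.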
tools/file_conversion.py
CHANGED
@@ -27,7 +27,7 @@ IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
 
 pd.set_option('future.no_silent_downcasting', True)
 
-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF
 from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
 # from tools.aws_textract import load_and_convert_textract_json
 
@@ -35,6 +35,7 @@ image_dpi = float(IMAGES_DPI)
 if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
 else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
 ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
+COMPRESS_REDACTED_PDF = COMPRESS_REDACTED_PDF.lower() == "true"
 
 def is_pdf_or_image(filename):
     """
@@ -841,6 +842,15 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
 
     return out_message, out_file_paths
 
+def save_pdf_with_or_without_compression(pymupdf_doc:object, out_redacted_pdf_file_path, COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF):
+    '''
+    Save a pymupdf document with basic cleaning or with full compression options. Can be useful for low memory systems to do minimal cleaning to avoid crashing with large PDFs.
+    '''
+    if COMPRESS_REDACTED_PDF == True:
+        pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
+    else:
+        pymupdf_doc.save(out_redacted_pdf_file_path, garbage=1, clean=True)
+
 def join_values_within_threshold(df1:pd.DataFrame, df2:pd.DataFrame):
     # Threshold for matching
     threshold = 5
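The helper's two branches map onto PyMuPDF's standard Document.save options: garbage=4 is the most aggressive garbage collection (it also deduplicates objects) and deflate=True compresses streams, both of which cost memory on very long documents, while garbage=1 only drops unreferenced objects. A usage sketch, assuming PyMuPDF is installed; the file paths are placeholders:

import pymupdf  # PyMuPDF
from tools.file_conversion import save_pdf_with_or_without_compression

doc = pymupdf.open("example.pdf")  # placeholder input
page = doc[0]
page.add_redact_annot(pymupdf.Rect(72, 72, 300, 100), fill=(0, 0, 0))  # black box over a region
page.apply_redactions()  # burn the redaction into the page content

# Pass the flag explicitly to force the low-memory path regardless of the env default
save_pdf_with_or_without_compression(doc, "example_redacted.pdf", COMPRESS_REDACTED_PDF=False)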
tools/file_redaction.py
CHANGED
@@ -19,9 +19,9 @@ import gradio as gr
 from gradio import Progress
 from collections import defaultdict # For efficient grouping
 
-from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER
+from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes, recreate_page_line_level_ocr_results_with_page
-from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json
+from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
 from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
 from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
@@ -31,6 +31,8 @@ if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
 else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
 image_dpi = float(IMAGES_DPI)
 
+RETURN_PDF_END_OF_REDACTION = RETURN_PDF_END_OF_REDACTION.lower() == "true"
+
 def bounding_boxes_overlap(box1, box2):
     """Check if two bounding boxes overlap."""
     return (box1[0] < box2[2] and box2[0] < box1[2] and
@@ -104,6 +106,7 @@ def choose_and_run_redactor(file_paths:List[str],
                             all_page_line_level_ocr_results = [],
                             all_page_line_level_ocr_results_with_words = [],
                             prepare_images:bool=True,
+                            RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
                             progress=gr.Progress(track_tqdm=True)):
     '''
     This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
@@ -155,6 +158,7 @@ def choose_and_run_redactor(file_paths:List[str],
    - all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
    - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
    - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
+   - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
    - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
 
    The function returns a redacted document along with processing logs.
@@ -533,22 +537,21 @@ def choose_and_run_redactor(file_paths:List[str],
 
         # Save redacted file
         if pii_identification_method != no_redaction_option:
-            if
-
-
-
-
-
+            if RETURN_PDF_END_OF_REDACTION == True:
+                if is_pdf(file_path) == False:
+                    out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
+                    # pymupdf_doc is an image list in this case
+                    if isinstance(pymupdf_doc[-1], str):
+                        img = Image.open(pymupdf_doc[-1])
+                    # Otherwise could be an image object
+                    else:
+                        img = pymupdf_doc[-1]
+                    img.save(out_redacted_pdf_file_path, "PNG" ,resolution=image_dpi)
                 else:
-
-
-
-
-            out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
-            print("Saving redacted PDF file:", out_redacted_pdf_file_path)
-            pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
-
-            out_file_paths.append(out_redacted_pdf_file_path)
+                    out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
+                    print("Saving redacted PDF file:", out_redacted_pdf_file_path)
+                    save_pdf_with_or_without_compression(pymupdf_doc, out_redacted_pdf_file_path)
+                    out_file_paths.append(out_redacted_pdf_file_path)
 
         if not all_line_level_ocr_results_df.empty:
             all_line_level_ocr_results_df = all_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height"]]
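The net effect of the rewritten save block: nothing is written unless RETURN_PDF_END_OF_REDACTION is true; image inputs (where, per the inline comments, pymupdf_doc is a list whose last entry is a file path or a PIL image) are saved as a PNG, and real PDFs go through the shared compression helper. A condensed standalone sketch of that control flow, assuming the repo functions are importable:

from PIL import Image
from tools.file_conversion import is_pdf, save_pdf_with_or_without_compression

def save_redacted_output(pymupdf_doc, file_path, out_base, image_dpi=300.0):
    # Hypothetical helper mirroring the branch added to choose_and_run_redactor
    if not is_pdf(file_path):
        last = pymupdf_doc[-1]  # file path string or PIL image object
        img = Image.open(last) if isinstance(last, str) else last
        out_path = out_base + "_redacted.png"
        img.save(out_path, "PNG", resolution=image_dpi)
    else:
        out_path = out_base + "_redacted.pdf"
        save_pdf_with_or_without_compression(pymupdf_doc, out_path)
    return out_path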
tools/redaction_review.py
CHANGED
@@ -13,8 +13,8 @@ from pymupdf import Document, Rect
 import pymupdf
 from PIL import ImageDraw, Image
 
-from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER
-from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes
+from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
+from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression
 from tools.helper_functions import get_file_name_without_type, detect_file_type
 from tools.file_redaction import redact_page_with_pymupdf
 
@@ -731,9 +731,10 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
                                             output_folder:str = OUTPUT_FOLDER,
                                             save_pdf:bool=True,
                                             page_sizes:List[dict]=[],
+                                            COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF,
                                             progress=gr.Progress(track_tqdm=True)):
     '''
-    Apply modified redactions to a pymupdf and export review files
+    Apply modified redactions to a pymupdf and export review files.
     '''
 
     output_files = []
@@ -849,7 +850,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
     #try:
     if pdf_doc:
         out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
-        pdf_doc
+        save_pdf_with_or_without_compression(pdf_doc, out_pdf_file_path, COMPRESS_REDACTED_PDF)
         output_files.append(out_pdf_file_path)
 
     else:
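With this change the review-tab export goes through the same helper as the main redaction path, and the new COMPRESS_REDACTED_PDF parameter means a caller (or a test) can override the config default per call rather than editing the environment. A small sketch with placeholder paths:

import pymupdf
from tools.file_conversion import save_pdf_with_or_without_compression

pdf_doc = pymupdf.open("output/example.pdf")  # placeholder
# The third argument overrides the COMPRESS_REDACTED_PDF default from tools.config
save_pdf_with_or_without_compression(pdf_doc, "output/example_redacted.pdf", False)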
tools/textract_batch_call.py
CHANGED
@@ -11,11 +11,13 @@ from typing import List
 from io import StringIO
 from urllib.parse import urlparse
 from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
-from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, TEXTRACT_JOBS_S3_INPUT_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER
+from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, TEXTRACT_JOBS_S3_INPUT_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
 from tools.aws_functions import download_file_from_s3
 from tools.file_conversion import get_input_file_names
 from tools.helper_functions import get_file_name_without_type
 
+DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
+
 def analyse_document_with_textract_api(
     local_pdf_path: str,
     s3_input_prefix: str,
@@ -402,6 +404,7 @@ def poll_whole_document_textract_analysis_progress_and_download(
     load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
     poll_interval_seconds: int = 1,
     max_polling_attempts: int = 1, # ~10 minutes total wait time):
+    DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS: int = DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
     progress = gr.Progress(track_tqdm=True)
 ):
     '''
@@ -446,11 +449,11 @@
         else:
             error = f"Unknown job type, cannot poll job"
             print(error)
-            #logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than
+            #logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed.")
             raise
 
     except textract_client.exceptions.InvalidJobIdException:
-        error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than
+        error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed."
         print(error_message)
         logging.error(error_message)
         raise
@@ -514,7 +517,8 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
                                  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
                                  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
                                  document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
-                                 aws_region:str=AWS_REGION
+                                 aws_region:str=AWS_REGION,
+                                 DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS:int=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS):
     '''
     Load in a dataframe of jobs previous submitted to the Textract API service.
     '''
@@ -550,8 +554,8 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
 
     if "job_date_time" in job_df.columns:
         job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors='coerce')
-        # Keep only jobs that have been completed in the last
-        cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=
+        # Keep only jobs that have been completed in the last 'DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS' days
+        cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
         job_df = job_df.loc[job_df["job_date_time"] >= cutoff_time,:]
 
     return job_df
|