seanpedrickcase committed on
Commit
3bbf593
·
1 Parent(s): 3dbd1f7

Added config options for compressing output PDFs, for whether to return redacted output PDFs at all, and for changing how long previous Textract jobs are shown

Browse files
tools/config.py CHANGED
@@ -146,6 +146,10 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
146
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
147
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
148
 
 
 
 
 
149
  # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid, as far as possible, the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
150
  # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
151
 
@@ -211,8 +215,12 @@ CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
211
 
212
  REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Currently only English is supported by the app
213
 
 
 
 
 
214
  ###
215
- # APP RUN CONFIG
216
  ###
217
 
218
  TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
@@ -245,7 +253,9 @@ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_a
245
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
246
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
247
 
248
- ### COST CODE OPTIONS
 
 
249
 
250
  SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
251
 
@@ -265,7 +275,9 @@ ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If y
265
 
266
  if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
267
 
268
- ### WHOLE DOCUMENT API OPTIONS
 
 
269
 
270
  SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
271
 
@@ -281,5 +293,6 @@ TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') #
281
 
282
  TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_INPUT_LOC', 'input') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
283
 
 
284
 
285
- TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
 
146
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
147
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
148
 
149
+ ###
150
+ # LOGS
151
+ ###
152
+
153
  # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid, as far as possible, the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
154
  # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
155
 
 
215
 
216
  REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Currently only English is supported by the app
217
 
218
+ RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
219
+
220
+ COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF", "True") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
221
+
222
  ###
223
+ # APP RUN OPTIONS
224
  ###
225
 
226
  TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
 
253
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
254
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
255
 
256
+ ###
257
+ # COST CODE OPTIONS
258
+ ###
259
 
260
  SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
261
 
 
275
 
276
  if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
277
 
278
+ ###
279
+ # WHOLE DOCUMENT API OPTIONS
280
+ ###
281
 
282
  SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
283
 
 
293
 
294
  TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_INPUT_LOC', 'input') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
295
 
296
+ TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
297
 
298
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '30') # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
tools/file_conversion.py CHANGED
@@ -27,7 +27,7 @@ IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
27
 
28
  pd.set_option('future.no_silent_downcasting', True)
29
 
30
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR
31
  from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
32
  # from tools.aws_textract import load_and_convert_textract_json
33
 
@@ -35,6 +35,7 @@ image_dpi = float(IMAGES_DPI)
35
  if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
36
  else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
37
  ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
 
38
 
39
  def is_pdf_or_image(filename):
40
  """
@@ -841,6 +842,15 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
841
 
842
  return out_message, out_file_paths
843
 
 
 
 
 
 
 
 
 
 
844
  def join_values_within_threshold(df1:pd.DataFrame, df2:pd.DataFrame):
845
  # Threshold for matching
846
  threshold = 5
 
27
 
28
  pd.set_option('future.no_silent_downcasting', True)
29
 
30
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF
31
  from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
32
  # from tools.aws_textract import load_and_convert_textract_json
33
 
 
35
  if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
36
  else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
37
  ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
38
+ COMPRESS_REDACTED_PDF = COMPRESS_REDACTED_PDF.lower() == "true"
39
 
40
  def is_pdf_or_image(filename):
41
  """
 
842
 
843
  return out_message, out_file_paths
844
 
845
+ def save_pdf_with_or_without_compression(pymupdf_doc:object, out_redacted_pdf_file_path, COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF):
846
+ '''
847
+ Save a pymupdf document either with basic cleaning or with full compression options. Useful on low-memory systems, where minimal cleaning avoids crashes with large PDFs.
848
+ '''
849
+ if COMPRESS_REDACTED_PDF == True:
850
+ pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
851
+ else:
852
+ pymupdf_doc.save(out_redacted_pdf_file_path, garbage=1, clean=True)
853
+
854
  def join_values_within_threshold(df1:pd.DataFrame, df2:pd.DataFrame):
855
  # Threshold for matching
856
  threshold = 5
tools/file_redaction.py CHANGED
@@ -19,9 +19,9 @@ import gradio as gr
19
  from gradio import Progress
20
  from collections import defaultdict # For efficient grouping
21
 
22
- from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER
23
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes, recreate_page_line_level_ocr_results_with_page
24
- from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json
25
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
26
  from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
27
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
@@ -31,6 +31,8 @@ if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
31
  else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
32
  image_dpi = float(IMAGES_DPI)
33
 
 
 
34
  def bounding_boxes_overlap(box1, box2):
35
  """Check if two bounding boxes overlap."""
36
  return (box1[0] < box2[2] and box2[0] < box1[2] and
@@ -104,6 +106,7 @@ def choose_and_run_redactor(file_paths:List[str],
104
  all_page_line_level_ocr_results = [],
105
  all_page_line_level_ocr_results_with_words = [],
106
  prepare_images:bool=True,
 
107
  progress=gr.Progress(track_tqdm=True)):
108
  '''
109
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
@@ -155,6 +158,7 @@ def choose_and_run_redactor(file_paths:List[str],
155
  - all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
156
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
157
  - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
 
158
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
159
 
160
  The function returns a redacted document along with processing logs.
@@ -533,22 +537,21 @@ def choose_and_run_redactor(file_paths:List[str],
533
 
534
  # Save redacted file
535
  if pii_identification_method != no_redaction_option:
536
- if is_pdf(file_path) == False:
537
- out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
538
- # pymupdf_doc is an image list in this case
539
- if isinstance(pymupdf_doc[-1], str):
540
- img = Image.open(pymupdf_doc[-1])
541
- # Otherwise could be an image object
 
 
 
 
542
  else:
543
- img = pymupdf_doc[-1]
544
- img.save(out_redacted_pdf_file_path, "PNG" ,resolution=image_dpi)
545
- #
546
- else:
547
- out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
548
- print("Saving redacted PDF file:", out_redacted_pdf_file_path)
549
- pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
550
-
551
- out_file_paths.append(out_redacted_pdf_file_path)
552
 
553
  if not all_line_level_ocr_results_df.empty:
554
  all_line_level_ocr_results_df = all_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height"]]
 
19
  from gradio import Progress
20
  from collections import defaultdict # For efficient grouping
21
 
22
+ from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION
23
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes, recreate_page_line_level_ocr_results_with_page
24
+ from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression
25
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
26
  from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
27
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
 
31
  else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
32
  image_dpi = float(IMAGES_DPI)
33
 
34
+ RETURN_PDF_END_OF_REDACTION = RETURN_PDF_END_OF_REDACTION.lower() == "true"
35
+
36
  def bounding_boxes_overlap(box1, box2):
37
  """Check if two bounding boxes overlap."""
38
  return (box1[0] < box2[2] and box2[0] < box1[2] and
 
106
  all_page_line_level_ocr_results = [],
107
  all_page_line_level_ocr_results_with_words = [],
108
  prepare_images:bool=True,
109
+ RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
110
  progress=gr.Progress(track_tqdm=True)):
111
  '''
112
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
 
158
  - all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
159
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
160
  - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
161
+ - RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
162
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
163
 
164
  The function returns a redacted document along with processing logs.
 
537
 
538
  # Save redacted file
539
  if pii_identification_method != no_redaction_option:
540
+ if RETURN_PDF_END_OF_REDACTION == True:
541
+ if is_pdf(file_path) == False:
542
+ out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
543
+ # pymupdf_doc is an image list in this case
544
+ if isinstance(pymupdf_doc[-1], str):
545
+ img = Image.open(pymupdf_doc[-1])
546
+ # Otherwise could be an image object
547
+ else:
548
+ img = pymupdf_doc[-1]
549
+ img.save(out_redacted_pdf_file_path, "PNG" ,resolution=image_dpi)
550
  else:
551
+ out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
552
+ print("Saving redacted PDF file:", out_redacted_pdf_file_path)
553
+ save_pdf_with_or_without_compression(pymupdf_doc, out_redacted_pdf_file_path)
554
+ out_file_paths.append(out_redacted_pdf_file_path)
 
 
 
 
 
555
 
556
  if not all_line_level_ocr_results_df.empty:
557
  all_line_level_ocr_results_df = all_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height"]]
tools/redaction_review.py CHANGED
@@ -13,8 +13,8 @@ from pymupdf import Document, Rect
13
  import pymupdf
14
  from PIL import ImageDraw, Image
15
 
16
- from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER
17
- from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes
18
  from tools.helper_functions import get_file_name_without_type, detect_file_type
19
  from tools.file_redaction import redact_page_with_pymupdf
20
 
@@ -731,9 +731,10 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
731
  output_folder:str = OUTPUT_FOLDER,
732
  save_pdf:bool=True,
733
  page_sizes:List[dict]=[],
 
734
  progress=gr.Progress(track_tqdm=True)):
735
  '''
736
- Apply modified redactions to a pymupdf and export review files
737
  '''
738
 
739
  output_files = []
@@ -849,7 +850,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
849
  #try:
850
  if pdf_doc:
851
  out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
852
- pdf_doc.save(out_pdf_file_path, garbage=4, deflate=True, clean=True)
853
  output_files.append(out_pdf_file_path)
854
 
855
  else:
 
13
  import pymupdf
14
  from PIL import ImageDraw, Image
15
 
16
+ from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
17
+ from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression
18
  from tools.helper_functions import get_file_name_without_type, detect_file_type
19
  from tools.file_redaction import redact_page_with_pymupdf
20
 
 
731
  output_folder:str = OUTPUT_FOLDER,
732
  save_pdf:bool=True,
733
  page_sizes:List[dict]=[],
734
+ COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF,
735
  progress=gr.Progress(track_tqdm=True)):
736
  '''
737
+ Apply modified redactions to a pymupdf document and export review files.
738
  '''
739
 
740
  output_files = []
 
850
  #try:
851
  if pdf_doc:
852
  out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
853
+ save_pdf_with_or_without_compression(pdf_doc, out_pdf_file_path, COMPRESS_REDACTED_PDF)
854
  output_files.append(out_pdf_file_path)
855
 
856
  else:
tools/textract_batch_call.py CHANGED
@@ -11,11 +11,13 @@ from typing import List
11
  from io import StringIO
12
  from urllib.parse import urlparse
13
  from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
14
- from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, TEXTRACT_JOBS_S3_INPUT_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER
15
  from tools.aws_functions import download_file_from_s3
16
  from tools.file_conversion import get_input_file_names
17
  from tools.helper_functions import get_file_name_without_type
18
 
 
 
19
  def analyse_document_with_textract_api(
20
  local_pdf_path: str,
21
  s3_input_prefix: str,
@@ -402,6 +404,7 @@ def poll_whole_document_textract_analysis_progress_and_download(
402
  load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
403
  poll_interval_seconds: int = 1,
404
  max_polling_attempts: int = 1, # ~10 minutes total wait time):
 
405
  progress = gr.Progress(track_tqdm=True)
406
  ):
407
  '''
@@ -446,11 +449,11 @@ def poll_whole_document_textract_analysis_progress_and_download(
446
  else:
447
  error = f"Unknown job type, cannot poll job"
448
  print(error)
449
- #logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed.")
450
  raise
451
 
452
  except textract_client.exceptions.InvalidJobIdException:
453
- error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed."
454
  print(error_message)
455
  logging.error(error_message)
456
  raise
@@ -514,7 +517,8 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
514
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
515
  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
516
  document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
517
- aws_region:str=AWS_REGION):
 
518
  '''
519
  Load in a dataframe of jobs previously submitted to the Textract API service.
520
  '''
@@ -550,8 +554,8 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
550
 
551
  if "job_date_time" in job_df.columns:
552
  job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors='coerce')
553
- # Keep only jobs that have been completed in the last 7 days
554
- cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=7)
555
  job_df = job_df.loc[job_df["job_date_time"] >= cutoff_time,:]
556
 
557
  return job_df
 
11
  from io import StringIO
12
  from urllib.parse import urlparse
13
  from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
14
+ from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, TEXTRACT_JOBS_S3_INPUT_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
15
  from tools.aws_functions import download_file_from_s3
16
  from tools.file_conversion import get_input_file_names
17
  from tools.helper_functions import get_file_name_without_type
18
 
19
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
20
+
21
  def analyse_document_with_textract_api(
22
  local_pdf_path: str,
23
  s3_input_prefix: str,
 
404
  load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
405
  poll_interval_seconds: int = 1,
406
  max_polling_attempts: int = 1, # ~10 minutes total wait time):
407
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS: int = DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
408
  progress = gr.Progress(track_tqdm=True)
409
  ):
410
  '''
 
449
  else:
450
  error = f"Unknown job type, cannot poll job"
451
  print(error)
452
+ #logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed.")
453
  raise
454
 
455
  except textract_client.exceptions.InvalidJobIdException:
456
+ error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed."
457
  print(error_message)
458
  logging.error(error_message)
459
  raise
 
517
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
518
  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
519
  document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
520
+ aws_region:str=AWS_REGION,
521
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS:int=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS):
522
  '''
523
  Load in a dataframe of jobs previous submitted to the Textract API service.
524
  '''
 
554
 
555
  if "job_date_time" in job_df.columns:
556
  job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors='coerce')
557
+ # Keep only jobs that have been completed in the last 'DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS' days
558
+ cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
559
  job_df = job_df.loc[job_df["job_date_time"] >= cutoff_time,:]
560
 
561
  return job_df