seanpedrickcase committed on
Commit
5a21738
·
1 Parent(s): 3bbf593

Updated gradio version. Minor changes to redactor function sequence. Minor formatting and wording changes.

Browse files
README.md CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
10
  ---
11
  # Document redaction
12
 
13
- version: 0.6.4
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
16
 
 
10
  ---
11
  # Document redaction
12
 
13
+ version: 0.6.5
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
16
 
app.py CHANGED
@@ -219,8 +219,8 @@ with app:
219
  check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
220
  job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
221
  job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
222
- textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
223
- selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
224
  is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
225
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
226
  job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
@@ -287,15 +287,16 @@ with app:
287
  send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract API call", variant="primary", visible=True)
288
  with gr.Row(equal_height=False):
289
  with gr.Column(scale=2):
290
- textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(6,'fixed'), static_columns=[0,1,2,3,4,5])
291
  with gr.Column(scale=1):
292
  job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
293
  check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
294
- with gr.Row():
295
- job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
296
- textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
297
-
298
- convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=True)
 
299
 
300
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
301
  document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
@@ -534,13 +535,11 @@ with app:
534
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
535
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
536
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
537
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children], api_name="redact_doc").\
538
- success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
539
 
540
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
541
  current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
542
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
543
- success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
544
 
545
  # If a file has been completed, the function will continue onto the next document
546
  latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
@@ -574,7 +573,8 @@ with app:
574
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
575
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
576
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
577
- outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
 
578
 
579
  ###
580
  # REVIEW PDF REDACTIONS
@@ -640,7 +640,7 @@ with app:
640
 
641
 
642
  # Review OCR text button
643
- all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
644
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
645
 
646
  # Convert review file to xfdf Adobe format
 
219
  check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
220
  job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
221
  job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
222
+ textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
223
+ selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
224
  is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
225
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
226
  job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
 
287
  send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract API call", variant="primary", visible=True)
288
  with gr.Row(equal_height=False):
289
  with gr.Column(scale=2):
290
+ textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(5,'fixed'), static_columns=[0,1,2,3,4], max_height=400)
291
  with gr.Column(scale=1):
292
  job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
293
  check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
294
+ with gr.Row():
295
+ with gr.Column():
296
+ textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
297
+ with gr.Column():
298
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
299
+ convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results", variant="secondary", visible=True)
300
 
301
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
302
  document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
 
535
  document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
536
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
537
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
538
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children], api_name="redact_doc")
 
539
 
540
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
541
  current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
542
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
 
543
 
544
  # If a file has been completed, the function will continue onto the next document
545
  latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
 
573
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
574
  success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
575
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
576
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
577
+ success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
578
 
579
  ###
580
  # REVIEW PDF REDACTIONS
 
640
 
641
 
642
  # Review OCR text button
643
+ all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row])
644
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
645
 
646
  # Convert review file to xfdf Adobe format
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "doc_redaction"
7
- version = "0.6.4"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
@@ -23,7 +23,7 @@ dependencies = [
23
  "spacy==3.8.4",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
- "gradio==5.27.1",
27
  "boto3==1.38.4",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
 
4
 
5
  [project]
6
  name = "doc_redaction"
7
+ version = "0.6.5"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
 
23
  "spacy==3.8.4",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
+ "gradio==5.29.0",
27
  "boto3==1.38.4",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
requirements.txt CHANGED
@@ -10,7 +10,7 @@ pandas==2.2.3
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
- gradio==5.27.1
14
  boto3==1.38.4
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
 
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
+ gradio==5.29.0
14
  boto3==1.38.4
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
tools/config.py CHANGED
@@ -59,6 +59,11 @@ def add_folder_to_path(folder_path: str):
59
  else:
60
  print(f"Folder not found at {folder_path} - not added to PATH")
61
 
 
 
 
 
 
62
  ensure_folder_exists("config/")
63
 
64
  # If you have an app_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
@@ -70,15 +75,12 @@ if APP_CONFIG_PATH:
70
  load_dotenv(APP_CONFIG_PATH)
71
  else: print("App config file not found at location:", APP_CONFIG_PATH)
72
 
73
- # Report logging to console?
74
- LOGGING = get_or_create_env_var('LOGGING', 'False')
75
 
76
- if LOGGING == 'True':
77
- # Configure logging
78
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
79
 
80
  ###
81
- # AWS CONFIG
82
  ###
83
 
84
  # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'env/aws_config.env'
@@ -108,27 +110,27 @@ if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
108
 
109
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
110
 
111
-
112
-
113
  # Custom headers e.g. if routing traffic through Cloudfront
114
  # Retrieving or setting CUSTOM_HEADER
115
  CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
116
- #if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
117
 
118
  # Retrieving or setting CUSTOM_HEADER_VALUE
119
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
120
- #if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
 
 
121
 
122
  ###
123
- # Images config
124
  ###
125
  IMAGES_DPI = get_or_create_env_var('IMAGES_DPI', '300.0')
126
  LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
127
  MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py
128
 
129
  ###
130
- # File I/O config
131
  ###
 
132
  SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders
133
 
134
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
@@ -146,8 +148,9 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
146
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
147
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
148
 
 
149
  ###
150
- # LOGS
151
  ###
152
 
153
  # By default, logs are put into a subfolder named after today's date and the host name of the instance running the app. This is to avoid, as far as possible, log files from one instance overwriting the logs of another instance on S3. If you always run the app on a single system, or only locally, it is not necessary to make the log folders so specific.
@@ -194,8 +197,18 @@ DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var('DYNAMODB_FEEDBACK_LOG_HEA
194
  USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', 'redaction_usage')
195
  DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var('DYNAMODB_USAGE_LOG_HEADERS', '')
196
 
 
 
 
 
 
 
 
 
 
 
197
  ###
198
- # REDACTION
199
  ###
200
 
201
  # Create Tesseract and Poppler folders if you have installed them locally
@@ -211,13 +224,16 @@ PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
211
 
212
  MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')
213
 
214
- CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
215
 
216
  REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Currently only English is supported by the app
217
 
218
  RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
219
 
220
- COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF", "True") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
 
 
 
221
 
222
  ###
223
  # APP RUN OPTIONS
@@ -253,6 +269,9 @@ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_a
253
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
254
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
255
 
 
 
 
256
  ###
257
  # COST CODE OPTIONS
258
  ###
@@ -275,6 +294,9 @@ ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If y
275
 
276
  if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
277
 
 
 
 
278
  ###
279
  # WHOLE DOCUMENT API OPTIONS
280
  ###
@@ -295,4 +317,4 @@ TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_INPUT_LOC',
295
 
296
  TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
297
 
298
- DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '30') # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
 
59
  else:
60
  print(f"Folder not found at {folder_path} - not added to PATH")
61
 
62
+
63
+ ###
64
+ # LOAD CONFIG FROM ENV FILE
65
+ ###
66
+
67
  ensure_folder_exists("config/")
68
 
69
  # If you have an app_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
 
75
  load_dotenv(APP_CONFIG_PATH)
76
  else: print("App config file not found at location:", APP_CONFIG_PATH)
77
 
 
 
78
 
79
+
80
+
 
81
 
82
  ###
83
+ # AWS OPTIONS
84
  ###
85
 
86
  # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'env/aws_config.env'
 
110
 
111
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
112
 
 
 
113
  # Custom headers e.g. if routing traffic through Cloudfront
114
  # Retrieving or setting CUSTOM_HEADER
115
  CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
 
116
 
117
  # Retrieving or setting CUSTOM_HEADER_VALUE
118
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
119
+
120
+
121
+
122
 
123
  ###
124
+ # Image options
125
  ###
126
  IMAGES_DPI = get_or_create_env_var('IMAGES_DPI', '300.0')
127
  LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
128
  MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py
129
 
130
  ###
131
+ # File I/O options
132
  ###
133
+
134
  SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders
135
 
136
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
 
148
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
149
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
150
 
151
+
152
  ###
153
+ # LOGGING OPTIONS
154
  ###
155
 
156
  # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid, as far as possible, log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or just locally, it is not necessary to make the log folders so specific.
 
197
  USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', 'redaction_usage')
198
  DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var('DYNAMODB_USAGE_LOG_HEADERS', '')
199
 
200
+ # Report logging to console?
201
+ LOGGING = get_or_create_env_var('LOGGING', 'False')
202
+
203
+ if LOGGING == 'True':
204
+ # Configure logging
205
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
206
+
207
+
208
+
209
+
210
  ###
211
+ # REDACTION OPTIONS
212
  ###
213
 
214
  # Create Tesseract and Poppler folders if you have installed them locally
 
224
 
225
  MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')
226
 
227
+ CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour
228
 
229
  REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Currently only English is supported by the app
230
 
231
  RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
232
 
233
+ COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
234
+
235
+
236
+
237
 
238
  ###
239
  # APP RUN OPTIONS
 
269
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
270
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
271
 
272
+
273
+
274
+
275
  ###
276
  # COST CODE OPTIONS
277
  ###
 
294
 
295
  if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
296
 
297
+
298
+
299
+
300
  ###
301
  # WHOLE DOCUMENT API OPTIONS
302
  ###
 
317
 
318
  TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
319
 
320
+ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '7') # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
tools/file_redaction.py CHANGED
@@ -236,6 +236,11 @@ def choose_and_run_redactor(file_paths:List[str],
236
 
237
  combined_out_message = re.sub(r'^\n+', '', combined_out_message).strip()
238
 
 
 
 
 
 
239
  # Only send across review file if redaction has been done
240
  if pii_identification_method != no_redaction_option:
241
 
@@ -533,11 +538,13 @@ def choose_and_run_redactor(file_paths:List[str],
533
  if latest_file_completed != len(file_paths_list):
534
  print("Completed file number:", str(latest_file_completed), "there are more files to do")
535
 
536
- progress(0.9, "Saving redacted PDF file")
537
 
538
  # Save redacted file
539
  if pii_identification_method != no_redaction_option:
540
  if RETURN_PDF_END_OF_REDACTION == True:
 
 
541
  if is_pdf(file_path) == False:
542
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
543
  # pymupdf_doc is an image list in this case
@@ -567,11 +574,10 @@ def choose_and_run_redactor(file_paths:List[str],
567
 
568
  # Convert the gradio annotation boxes to relative coordinates
569
  # Convert annotations_all_pages to a consistent relative coordinate format output
 
570
  page_sizes = page_sizes_df.to_dict(orient="records")
571
  all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
572
-
573
  all_image_annotations_df = divide_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
574
-
575
  annotations_all_pages_divide = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
576
  annotations_all_pages_divide = remove_duplicate_images_with_blank_boxes(annotations_all_pages_divide)
577
 
@@ -937,7 +943,7 @@ def set_cropbox_safely(page, original_cropbox):
937
  """
938
  mediabox = page.mediabox
939
  if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
940
- print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
941
  page.set_cropbox(mediabox)
942
  else:
943
  page.set_cropbox(original_cropbox)
 
236
 
237
  combined_out_message = re.sub(r'^\n+', '', combined_out_message).strip()
238
 
239
+ end_message = "\n\nPlease review and modify the suggested redaction outputs on the 'Review redactions' tab of the app (you can find this under the introduction text at the top of the page)."
240
+
241
+ if not end_message in combined_out_message:
242
+ combined_out_message = combined_out_message + end_message
243
+
244
  # Only send across review file if redaction has been done
245
  if pii_identification_method != no_redaction_option:
246
 
 
538
  if latest_file_completed != len(file_paths_list):
539
  print("Completed file number:", str(latest_file_completed), "there are more files to do")
540
 
541
+
542
 
543
  # Save redacted file
544
  if pii_identification_method != no_redaction_option:
545
  if RETURN_PDF_END_OF_REDACTION == True:
546
+ progress(0.9, "Saving redacted file")
547
+
548
  if is_pdf(file_path) == False:
549
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
550
  # pymupdf_doc is an image list in this case
 
574
 
575
  # Convert the gradio annotation boxes to relative coordinates
576
  # Convert annotations_all_pages to a consistent relative coordinate format output
577
+ progress(0.93, "Creating review file output")
578
  page_sizes = page_sizes_df.to_dict(orient="records")
579
  all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
 
580
  all_image_annotations_df = divide_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
 
581
  annotations_all_pages_divide = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
582
  annotations_all_pages_divide = remove_duplicate_images_with_blank_boxes(annotations_all_pages_divide)
583
 
 
943
  """
944
  mediabox = page.mediabox
945
  if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
946
+ #print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
947
  page.set_cropbox(mediabox)
948
  else:
949
  page.set_cropbox(original_cropbox)
tools/textract_batch_call.py CHANGED
@@ -166,7 +166,7 @@ def analyse_document_with_textract_api(
166
  'file_name': pdf_filename,
167
  'job_type': job_type,
168
  'signature_extraction':analyse_signatures,
169
- 's3_location': job_location_full,
170
  'job_date_time': datetime.datetime.now()
171
  }])
172
 
@@ -449,19 +449,19 @@ def poll_whole_document_textract_analysis_progress_and_download(
449
  else:
450
  error = f"Unknown job type, cannot poll job"
451
  print(error)
452
- #logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed.")
453
- raise
454
 
455
  except textract_client.exceptions.InvalidJobIdException:
456
  error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed."
457
  print(error_message)
458
  logging.error(error_message)
459
- raise
460
  except Exception as e:
461
  error_message = f"Error while polling Textract status for job {job_id}: {e}"
462
  print(error_message)
463
  logging.error(error_message)
464
- raise
465
 
466
  downloaded_file_path = None
467
  if job_status == 'SUCCEEDED':
@@ -522,7 +522,7 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
522
  '''
523
  Load in a dataframe of jobs previously submitted to the Textract API service.
524
  '''
525
- job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
526
 
527
  # Initialize boto3 clients
528
  session = boto3.Session(region_name=aws_region)
@@ -556,7 +556,12 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
556
  job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors='coerce')
557
  # Keep only jobs that have been completed in the last 'DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS' days
558
  cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
559
- job_df = job_df.loc[job_df["job_date_time"] >= cutoff_time,:]
 
 
 
 
 
560
 
561
  return job_df
562
 
 
166
  'file_name': pdf_filename,
167
  'job_type': job_type,
168
  'signature_extraction':analyse_signatures,
169
+ #'s3_location': job_location_full,
170
  'job_date_time': datetime.datetime.now()
171
  }])
172
 
 
449
  else:
450
  error = f"Unknown job type, cannot poll job"
451
  print(error)
452
+ logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed.")
453
+ raise Exception(error_message)
454
 
455
  except textract_client.exceptions.InvalidJobIdException:
456
  error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed."
457
  print(error_message)
458
  logging.error(error_message)
459
+ raise Exception(error_message)
460
  except Exception as e:
461
  error_message = f"Error while polling Textract status for job {job_id}: {e}"
462
  print(error_message)
463
  logging.error(error_message)
464
+ raise Exception(error_message)
465
 
466
  downloaded_file_path = None
467
  if job_status == 'SUCCEEDED':
 
522
  '''
523
  Load in a dataframe of jobs previously submitted to the Textract API service.
524
  '''
525
+ job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time'])
526
 
527
  # Initialize boto3 clients
528
  session = boto3.Session(region_name=aws_region)
 
556
  job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors='coerce')
557
  # Keep only jobs that have been completed in the last 'DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS' days
558
  cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
559
+ job_df = job_df.loc[job_df["job_date_time"] > cutoff_time,:]
560
+
561
+ try:
562
+ job_df = job_df[['job_id','file_name','job_type','signature_extraction','job_date_time']]
563
+ except Exception as e:
564
+ print("Could not find one or more columns in Textract whole document list dataframe:", e)
565
 
566
  return job_df
567