seanpedrickcase commited on
Commit
bde6e5b
·
1 Parent(s): 6b28cfa

Fuzzy match implementation for deny list. Added option to merge multiple review files. Review files from redaction step should now include text.

Browse files
app.py CHANGED
@@ -10,7 +10,7 @@ from datetime import datetime
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
@@ -30,15 +30,16 @@ ensure_output_folder_exists()
30
 
31
  chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
32
 
33
- full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
34
 
35
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
36
  chosen_comprehend_entities.extend(custom_entities)
37
  full_comprehend_entity_list.extend(custom_entities)
38
 
 
39
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
40
 
41
- full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
42
 
43
  language = 'en'
44
 
@@ -68,7 +69,6 @@ with app:
68
  pdf_doc_state = gr.State([])
69
  all_image_annotations_state = gr.State([])
70
 
71
-
72
  all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
73
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
74
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
@@ -261,7 +261,7 @@ with app:
261
 
262
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
263
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
264
- adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple')
265
  convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
266
 
267
  ###
@@ -325,9 +325,12 @@ with app:
325
 
326
  with gr.Accordion("Select entity types to redact", open = True):
327
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
328
-
329
  in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
330
 
 
 
 
 
331
  with gr.Accordion("Redact only selected pages", open = False):
332
  with gr.Row():
333
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
@@ -341,7 +344,16 @@ with app:
341
  with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
342
  anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
343
 
344
- log_files_output = gr.File(label="Log file output", interactive=False)
 
 
 
 
 
 
 
 
 
345
 
346
  ###
347
  # PDF/IMAGE REDACTION
@@ -350,12 +362,12 @@ with app:
350
 
351
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
352
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
353
- then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
354
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
355
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
356
 
357
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
358
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
359
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
360
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
361
 
@@ -461,6 +473,10 @@ with app:
461
  in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
462
  in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
463
  in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
 
 
 
464
 
465
 
466
  ###
 
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
 
30
 
31
  chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
32
 
33
+ full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
34
 
35
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
36
  chosen_comprehend_entities.extend(custom_entities)
37
  full_comprehend_entity_list.extend(custom_entities)
38
 
39
+ # Entities for local PII redaction option
40
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
41
 
42
+ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']
43
 
44
  language = 'en'
45
 
 
69
  pdf_doc_state = gr.State([])
70
  all_image_annotations_state = gr.State([])
71
 
 
72
  all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
73
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
74
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
 
261
 
262
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
263
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
264
+ adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
265
  convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
266
 
267
  ###
 
325
 
326
  with gr.Accordion("Select entity types to redact", open = True):
327
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
 
328
  in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
329
 
330
+ with gr.Row():
331
+ max_fuzzy_spelling_mistakes_num = gr.Number(label="Maximum number of spelling mistakes allowed for fuzzy matching (CUSTOM_FUZZY entity).", value=1, minimum=0, maximum=9, precision=0)
332
+ match_fuzzy_whole_phrase_bool = gr.Checkbox(label="Should fuzzy match on entire phrases in deny list (as opposed to each word individually)?", value=True)
333
+
334
  with gr.Accordion("Redact only selected pages", open = False):
335
  with gr.Row():
336
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
 
344
  with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
345
  anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
346
 
347
+ log_files_output = gr.File(label="Log file output", interactive=False)
348
+
349
+ with gr.Accordion("Combine multiple review files", open = False):
350
+ multiple_review_files_in_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv'])
351
+ merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
352
+
353
+
354
+
355
+
356
+ ### UI INTERACTION ###
357
 
358
  ###
359
  # PDF/IMAGE REDACTION
 
362
 
363
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
364
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
365
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
366
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
367
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
368
 
369
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
370
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
371
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
372
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
373
 
 
473
  in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
474
  in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
475
  in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
476
+
477
+
478
+ # Merge multiple review csv files together
479
+ merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
480
 
481
 
482
  ###
requirements.txt CHANGED
@@ -16,6 +16,8 @@ boto3==1.35.83
16
  pyarrow==18.1.0
17
  openpyxl==3.1.2
18
  Faker==22.2.0
 
 
19
  gradio_image_annotation==0.2.5
20
  numpy==1.26.4
21
  awslambdaric==3.0.0
 
16
  pyarrow==18.1.0
17
  openpyxl==3.1.2
18
  Faker==22.2.0
19
+ python-levenshtein==0.26.1
20
+ spaczz==0.6.1
21
  gradio_image_annotation==0.2.5
22
  numpy==1.26.4
23
  awslambdaric==3.0.0
tools/custom_image_analyser_engine.py CHANGED
@@ -560,7 +560,7 @@ def run_page_text_redaction(
560
  if not nlp_analyser:
561
  raise ValueError("nlp_analyser is required for Local identification method")
562
 
563
- print("page text:", page_text)
564
 
565
  page_analyser_result = nlp_analyser.analyze(
566
  text=page_text,
@@ -1077,15 +1077,15 @@ class CustomImageAnalyzerEngine:
1077
  line_length = len(line_text)
1078
  redaction_text = redaction_relevant_ocr_result.text
1079
 
1080
- # print(f"Processing line: '{line_text}'")
1081
 
1082
  for redaction_result in text_analyzer_results:
1083
- # print(f"Checking redaction result: {redaction_result}")
1084
- # print("redaction_text:", redaction_text)
1085
- # print("line_length:", line_length)
1086
- # print("line_text:", line_text)
1087
 
1088
- # Check if the redaction text is no in the allow list
1089
 
1090
  if redaction_text not in allow_list:
1091
 
@@ -1098,14 +1098,45 @@ class CustomImageAnalyzerEngine:
1098
  matched_words = matched_text.split()
1099
 
1100
  # print(f"Found match: '{matched_text}' in line")
 
 
 
 
 
 
1101
 
1102
  # Find the corresponding words in the OCR results
1103
  matching_word_boxes = []
 
 
 
 
 
1104
  for word_info in ocr_results_with_children_child_info.get('words', []):
1105
- # Check if this word is part of our match
1106
- if any(word.lower() in word_info['text'].lower() for word in matched_words):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1107
  matching_word_boxes.append(word_info['bounding_box'])
1108
- # print(f"Matched word: {word_info['text']}")
1109
 
1110
  if matching_word_boxes:
1111
  # Calculate the combined bounding box for all matching words
@@ -1127,7 +1158,7 @@ class CustomImageAnalyzerEngine:
1127
  text=matched_text
1128
  )
1129
  )
1130
- # print(f"Added bounding box for: '{matched_text}'")
1131
 
1132
  return redaction_bboxes
1133
 
 
560
  if not nlp_analyser:
561
  raise ValueError("nlp_analyser is required for Local identification method")
562
 
563
+ #print("page text:", page_text)
564
 
565
  page_analyser_result = nlp_analyser.analyze(
566
  text=page_text,
 
1077
  line_length = len(line_text)
1078
  redaction_text = redaction_relevant_ocr_result.text
1079
 
1080
+ #print(f"Processing line: '{line_text}'")
1081
 
1082
  for redaction_result in text_analyzer_results:
1083
+ #print(f"Checking redaction result: {redaction_result}")
1084
+ #print("redaction_text:", redaction_text)
1085
+ #print("line_length:", line_length)
1086
+ #print("line_text:", line_text)
1087
 
1088
+ # Check if the redaction text is not in the allow list
1089
 
1090
  if redaction_text not in allow_list:
1091
 
 
1098
  matched_words = matched_text.split()
1099
 
1100
  # print(f"Found match: '{matched_text}' in line")
1101
+
1102
+ # for word_info in ocr_results_with_children_child_info.get('words', []):
1103
+ # # Check if this word is part of our match
1104
+ # if any(word.lower() in word_info['text'].lower() for word in matched_words):
1105
+ # matching_word_boxes.append(word_info['bounding_box'])
1106
+ # print(f"Matched word: {word_info['text']}")
1107
 
1108
  # Find the corresponding words in the OCR results
1109
  matching_word_boxes = []
1110
+
1111
+ #print("ocr_results_with_children_child_info:", ocr_results_with_children_child_info)
1112
+
1113
+ current_position = 0
1114
+
1115
  for word_info in ocr_results_with_children_child_info.get('words', []):
1116
+ word_text = word_info['text']
1117
+ word_length = len(word_text)
1118
+
1119
+ # Assign start and end character positions
1120
+ #word_info['start_position'] = current_position
1121
+ #word_info['end_position'] = current_position + word_length
1122
+
1123
+ word_start = current_position
1124
+ word_end = current_position + word_length
1125
+
1126
+ # Update current position for the next word
1127
+ current_position += word_length + 1 # +1 for the space after the word
1128
+
1129
+ #print("word_info['bounding_box']:", word_info['bounding_box'])
1130
+ #print("word_start:", word_start)
1131
+ #print("start_in_line:", start_in_line)
1132
+
1133
+ #print("word_end:", word_end)
1134
+ #print("end_in_line:", end_in_line)
1135
+
1136
+ # Check if the word's bounding box is within the start and end bounds
1137
+ if word_start >= start_in_line and word_end <= (end_in_line + 1):
1138
  matching_word_boxes.append(word_info['bounding_box'])
1139
+ #print(f"Matched word: {word_info['text']}")
1140
 
1141
  if matching_word_boxes:
1142
  # Calculate the combined bounding box for all matching words
 
1158
  text=matched_text
1159
  )
1160
  )
1161
+ #print(f"Added bounding box for: '{matched_text}'")
1162
 
1163
  return redaction_bboxes
1164
 
tools/data_anonymise.py CHANGED
@@ -12,7 +12,7 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerR
12
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
13
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
 
15
- from tools.helper_functions import output_folder, get_file_path_end, read_file, detect_file_type
16
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
17
 
18
  # Use custom version of analyze_dict to be able to track progress
@@ -434,7 +434,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
434
  file_type = detect_file_type(anon_file)
435
  print("File type is:", file_type)
436
 
437
- out_file_part = get_file_path_end(anon_file.name)
438
 
439
  if file_type == 'xlsx':
440
  print("Running through all xlsx sheets")
@@ -472,7 +472,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
472
  else:
473
  sheet_name = ""
474
  anon_df = read_file(anon_file)
475
- out_file_part = get_file_path_end(anon_file.name)
476
  out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
477
 
478
  # Increase latest file completed count unless we are at the last file
 
12
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
13
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
 
15
+ from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
16
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
17
 
18
  # Use custom version of analyze_dict to be able to track progress
 
434
  file_type = detect_file_type(anon_file)
435
  print("File type is:", file_type)
436
 
437
+ out_file_part = get_file_name_without_type(anon_file.name)
438
 
439
  if file_type == 'xlsx':
440
  print("Running through all xlsx sheets")
 
472
  else:
473
  sheet_name = ""
474
  anon_df = read_file(anon_file)
475
+ out_file_part = get_file_name_without_type(anon_file.name)
476
  out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
477
 
478
  # Increase latest file completed count unless we are at the last file
tools/file_conversion.py CHANGED
@@ -1,5 +1,5 @@
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
- from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
@@ -7,6 +7,7 @@ import time
7
  import json
8
  import pymupdf
9
  import pandas as pd
 
10
  from pymupdf import Rect
11
  from fitz import Page
12
  from tqdm import tqdm
@@ -240,7 +241,7 @@ def get_input_file_names(file_input:List[str]):
240
  else:
241
  file_path = file.name
242
 
243
- file_path_without_ext = get_file_path_end(file_path)
244
 
245
  file_extension = os.path.splitext(file_path)[1].lower()
246
 
@@ -489,7 +490,7 @@ def prepare_image_or_pdf(
489
  file_path = file
490
  else:
491
  file_path = file.name
492
- file_path_without_ext = get_file_path_end(file_path)
493
  file_name_with_ext = os.path.basename(file_path)
494
 
495
  if not file_path:
@@ -668,7 +669,7 @@ def prepare_image_or_pdf(
668
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
669
 
670
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
671
- file_path_without_ext = get_file_path_end(in_file_path)
672
 
673
  out_file_paths = out_text_file_path
674
 
@@ -754,7 +755,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
754
  if 'text' not in box:
755
  data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
756
  else:
757
- data_to_add = {"image": image_path, "page": reported_number, "text": annotation['text'], **box}
758
  #print("data_to_add:", data_to_add)
759
  flattened_annotation_data.append(data_to_add)
760
 
@@ -764,7 +765,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
764
  #print("redaction_decision_output:", redaction_decision_output)
765
  #print("annotation_data_as_df:", annotation_data_as_df)
766
 
767
- # Join on additional text data from decision output results if included
768
  if not redaction_decision_output.empty:
769
  #print("redaction_decision_output is not empty")
770
  #print("redaction_decision_output:", redaction_decision_output)
@@ -793,6 +794,9 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
793
  if col not in annotation_data_as_df.columns:
794
  annotation_data_as_df[col] = ''
795
 
 
 
 
796
  annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
797
 
798
  return annotation_data_as_df
 
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
+ from tools.helper_functions import get_file_name_without_type, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
 
7
  import json
8
  import pymupdf
9
  import pandas as pd
10
+ import numpy as np
11
  from pymupdf import Rect
12
  from fitz import Page
13
  from tqdm import tqdm
 
241
  else:
242
  file_path = file.name
243
 
244
+ file_path_without_ext = get_file_name_without_type(file_path)
245
 
246
  file_extension = os.path.splitext(file_path)[1].lower()
247
 
 
490
  file_path = file
491
  else:
492
  file_path = file.name
493
+ file_path_without_ext = get_file_name_without_type(file_path)
494
  file_name_with_ext = os.path.basename(file_path)
495
 
496
  if not file_path:
 
669
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
670
 
671
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
672
+ file_path_without_ext = get_file_name_without_type(in_file_path)
673
 
674
  out_file_paths = out_text_file_path
675
 
 
755
  if 'text' not in box:
756
  data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
757
  else:
758
+ data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
759
  #print("data_to_add:", data_to_add)
760
  flattened_annotation_data.append(data_to_add)
761
 
 
765
  #print("redaction_decision_output:", redaction_decision_output)
766
  #print("annotation_data_as_df:", annotation_data_as_df)
767
 
768
+ # Join on additional text data from decision output results if included, if text not already there
769
  if not redaction_decision_output.empty:
770
  #print("redaction_decision_output is not empty")
771
  #print("redaction_decision_output:", redaction_decision_output)
 
794
  if col not in annotation_data_as_df.columns:
795
  annotation_data_as_df[col] = ''
796
 
797
+ for col in ['xmin', 'xmax', 'ymin', 'ymax']:
798
+ annotation_data_as_df[col] = np.floor(annotation_data_as_df[col])
799
+
800
  annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
801
 
802
  return annotation_data_as_df
tools/file_redaction.py CHANGED
@@ -27,8 +27,8 @@ from presidio_analyzer import RecognizerResult
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
31
- from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
33
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
34
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
@@ -94,6 +94,8 @@ def choose_and_run_redactor(file_paths:List[str],
94
  page_break_return:bool=False,
95
  pii_identification_method:str="Local",
96
  comprehend_query_number:int=0,
 
 
97
  output_folder:str=output_folder,
98
  progress=gr.Progress(track_tqdm=True)):
99
  '''
@@ -127,6 +129,8 @@ def choose_and_run_redactor(file_paths:List[str],
127
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
128
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
129
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
 
 
130
  - output_folder (str, optional): Output folder for results.
131
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
132
 
@@ -279,9 +283,9 @@ def choose_and_run_redactor(file_paths:List[str],
279
  file_path = file.name
280
 
281
  if file_path:
282
- pdf_file_name_without_ext = get_file_path_end(file_path)
283
  pdf_file_name_with_ext = os.path.basename(file_path)
284
- print("Redacting file:", pdf_file_name_with_ext)
285
 
286
  is_a_pdf = is_pdf(file_path) == True
287
  if is_a_pdf == False and in_redact_method == text_ocr_option:
@@ -327,7 +331,9 @@ def choose_and_run_redactor(file_paths:List[str],
327
  comprehend_client,
328
  textract_client,
329
  custom_recogniser_word_list,
330
- redact_whole_page_list)
 
 
331
 
332
 
333
  #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
@@ -366,7 +372,9 @@ def choose_and_run_redactor(file_paths:List[str],
366
  comprehend_query_number,
367
  comprehend_client,
368
  custom_recogniser_word_list,
369
- redact_whole_page_list)
 
 
370
 
371
  else:
372
  out_message = "No redaction method selected"
@@ -414,13 +422,7 @@ def choose_and_run_redactor(file_paths:List[str],
414
 
415
  # Save the gradio_annotation_boxes to a JSON file
416
  try:
417
- #print("Saving annotations to JSON")
418
-
419
- out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
420
- with open(out_annotation_file_path, 'w') as f:
421
- json.dump(annotations_all_pages, f)
422
- log_files_output_paths.append(out_annotation_file_path)
423
-
424
  #print("Saving annotations to CSV")
425
 
426
  # Convert json to csv and also save this
@@ -435,6 +437,13 @@ def choose_and_run_redactor(file_paths:List[str],
435
 
436
  print("Saved review file to csv")
437
 
 
 
 
 
 
 
 
438
  except Exception as e:
439
  print("Could not save annotations to json or csv file:", e)
440
 
@@ -694,10 +703,10 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
694
  x1 = pymupdf_x1
695
  x2 = pymupdf_x2
696
 
697
- # if hasattr(annot, 'text') and annot.text:
698
- # img_annotation_box["text"] = annot.text
699
- # else:
700
- # img_annotation_box["text"] = ""
701
 
702
  # Else should be CustomImageRecognizerResult
703
  else:
@@ -715,10 +724,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
715
  img_annotation_box["label"] = annot.entity_type
716
  except:
717
  img_annotation_box["label"] = "Redaction"
718
- # if hasattr(annot, 'text') and annot.text:
719
- # img_annotation_box["text"] = annot.text
720
- # else:
721
- # img_annotation_box["text"] = ""
 
722
 
723
  rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
724
 
@@ -749,12 +759,14 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
749
 
750
  if isinstance(annot, Dictionary):
751
  img_annotation_box["label"] = str(annot["/T"])
 
 
 
 
 
752
  else:
753
  img_annotation_box["label"] = "REDACTION"
754
- # if hasattr(annot, 'text') and annot.text:
755
- # img_annotation_box["text"] = annot.text
756
- # else:
757
- # img_annotation_box["text"] = ""
758
 
759
  # Convert to a PyMuPDF Rect object
760
  #rect = Rect(rect_coordinates)
@@ -913,6 +925,8 @@ def redact_image_pdf(file_path:str,
913
  textract_client:str="",
914
  custom_recogniser_word_list:List[str]=[],
915
  redact_whole_page_list:List[str]=[],
 
 
916
  page_break_val:int=int(page_break_value),
917
  log_files_output_paths:List=[],
918
  max_time:int=int(max_time_value),
@@ -945,14 +959,16 @@ def redact_image_pdf(file_path:str,
945
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
946
  - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
947
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
 
 
948
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
949
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
950
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
951
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
952
 
953
- The function returns a fully or partially-redacted PDF document.
954
  '''
955
- file_name = get_file_path_end(file_path)
956
  fill = (0, 0, 0) # Fill colour for redactions
957
  comprehend_query_number_new = 0
958
 
@@ -962,11 +978,14 @@ def redact_image_pdf(file_path:str,
962
  nlp_analyser.registry.remove_recognizer("CUSTOM")
963
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
964
  #print("new_custom_recogniser:", new_custom_recogniser)
965
- nlp_analyser.registry.add_recognizer(new_custom_recogniser)
966
 
 
 
 
 
967
 
968
- image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
969
-
970
 
971
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
972
  print("Connection to AWS Comprehend service unsuccessful.")
@@ -1190,6 +1209,7 @@ def redact_image_pdf(file_path:str,
1190
 
1191
  ## Apply annotations with pymupdf
1192
  else:
 
1193
  #print("redact_whole_page_list:", redact_whole_page_list)
1194
  if redact_whole_page_list:
1195
  int_reported_page_number = int(reported_page_number)
@@ -1471,6 +1491,8 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
1471
  def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1472
  pikepdf_annotations_on_page = []
1473
  for analysed_bounding_box in analysed_bounding_boxes:
 
 
1474
  bounding_box = analysed_bounding_box["boundingBox"]
1475
  annotation = Dictionary(
1476
  Type=Name.Annot,
@@ -1482,6 +1504,7 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1482
  IC=[0, 0, 0],
1483
  CA=1, # Transparency
1484
  T=analysed_bounding_box["result"].entity_type,
 
1485
  BS=Dictionary(
1486
  W=0, # Border width: 1 point
1487
  S=Name.S # Border style: solid
@@ -1511,6 +1534,8 @@ def redact_text_pdf(
1511
  comprehend_client="",
1512
  custom_recogniser_word_list:List[str]=[],
1513
  redact_whole_page_list:List[str]=[],
 
 
1514
  page_break_val: int = int(page_break_value), # Value for page break
1515
  max_time: int = int(max_time_value),
1516
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
@@ -1540,6 +1565,8 @@ def redact_text_pdf(
1540
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1541
  - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
1542
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
 
 
1543
  - page_break_val: Value for page break
1544
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1545
  - progress: Progress tracking object
@@ -1555,9 +1582,12 @@ def redact_text_pdf(
1555
  if custom_recogniser_word_list:
1556
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1557
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
1558
- #print("new_custom_recogniser:", new_custom_recogniser)
1559
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1560
 
 
 
 
 
1561
  # List all elements currently in the nlp_analyser registry
1562
  #print("Current recognizers in nlp_analyser registry:")
1563
  #for recognizer_name in nlp_analyser.registry.recognizers:
@@ -1660,7 +1690,7 @@ def redact_text_pdf(
1660
  language,
1661
  chosen_redact_entities,
1662
  chosen_redact_comprehend_entities,
1663
- all_line_level_text_results_list, #line_level_text_results_list,
1664
  all_line_characters,
1665
  page_analyser_results,
1666
  page_analysed_bounding_boxes,
@@ -1673,7 +1703,6 @@ def redact_text_pdf(
1673
  comprehend_query_number
1674
  )
1675
 
1676
-
1677
  #print("page_analyser_results:", page_analyser_results)
1678
  #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
1679
  #print("image:", image)
@@ -1688,7 +1717,7 @@ def redact_text_pdf(
1688
  # Annotate redactions on page
1689
  pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1690
 
1691
- #print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
1692
 
1693
  # Make pymupdf page redactions
1694
  #print("redact_whole_page_list:", redact_whole_page_list)
 
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
31
+ from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
33
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
34
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
 
94
  page_break_return:bool=False,
95
  pii_identification_method:str="Local",
96
  comprehend_query_number:int=0,
97
+ max_fuzzy_spelling_mistakes_num:int=1,
98
+ match_fuzzy_whole_phrase_bool:bool=True,
99
  output_folder:str=output_folder,
100
  progress=gr.Progress(track_tqdm=True)):
101
  '''
 
129
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
130
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
131
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
132
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
133
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
134
  - output_folder (str, optional): Output folder for results.
135
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
136
 
 
283
  file_path = file.name
284
 
285
  if file_path:
286
+ pdf_file_name_without_ext = get_file_name_without_type(file_path)
287
  pdf_file_name_with_ext = os.path.basename(file_path)
288
+ # print("Redacting file:", pdf_file_name_with_ext)
289
 
290
  is_a_pdf = is_pdf(file_path) == True
291
  if is_a_pdf == False and in_redact_method == text_ocr_option:
 
331
  comprehend_client,
332
  textract_client,
333
  custom_recogniser_word_list,
334
+ redact_whole_page_list,
335
+ max_fuzzy_spelling_mistakes_num,
336
+ match_fuzzy_whole_phrase_bool)
337
 
338
 
339
  #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
 
372
  comprehend_query_number,
373
  comprehend_client,
374
  custom_recogniser_word_list,
375
+ redact_whole_page_list,
376
+ max_fuzzy_spelling_mistakes_num,
377
+ match_fuzzy_whole_phrase_bool)
378
 
379
  else:
380
  out_message = "No redaction method selected"
 
422
 
423
  # Save the gradio_annotation_boxes to a JSON file
424
  try:
425
+
 
 
 
 
 
 
426
  #print("Saving annotations to CSV")
427
 
428
  # Convert json to csv and also save this
 
437
 
438
  print("Saved review file to csv")
439
 
440
+ out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
441
+ with open(out_annotation_file_path, 'w') as f:
442
+ json.dump(annotations_all_pages, f)
443
+ log_files_output_paths.append(out_annotation_file_path)
444
+
445
+ print("Saving annotations to JSON")
446
+
447
  except Exception as e:
448
  print("Could not save annotations to json or csv file:", e)
449
 
 
703
  x1 = pymupdf_x1
704
  x2 = pymupdf_x2
705
 
706
+ if hasattr(annot, 'text') and annot.text:
707
+ img_annotation_box["text"] = annot.text
708
+ else:
709
+ img_annotation_box["text"] = ""
710
 
711
  # Else should be CustomImageRecognizerResult
712
  else:
 
724
  img_annotation_box["label"] = annot.entity_type
725
  except:
726
  img_annotation_box["label"] = "Redaction"
727
+
728
+ if hasattr(annot, 'text') and annot.text:
729
+ img_annotation_box["text"] = annot.text
730
+ else:
731
+ img_annotation_box["text"] = ""
732
 
733
  rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
734
 
 
759
 
760
  if isinstance(annot, Dictionary):
761
  img_annotation_box["label"] = str(annot["/T"])
762
+
763
+ if hasattr(annot, 'Contents'):
764
+ img_annotation_box["text"] = annot.Contents
765
+ else:
766
+ img_annotation_box["text"] = ""
767
  else:
768
  img_annotation_box["label"] = "REDACTION"
769
+ img_annotation_box["text"] = ""
 
 
 
770
 
771
  # Convert to a PyMuPDF Rect object
772
  #rect = Rect(rect_coordinates)
 
925
  textract_client:str="",
926
  custom_recogniser_word_list:List[str]=[],
927
  redact_whole_page_list:List[str]=[],
928
+ max_fuzzy_spelling_mistakes_num:int=1,
929
+ match_fuzzy_whole_phrase_bool:bool=True,
930
  page_break_val:int=int(page_break_value),
931
  log_files_output_paths:List=[],
932
  max_time:int=int(max_time_value),
 
959
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
960
  - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
961
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
962
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
963
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
964
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
965
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
966
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
967
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
968
 
969
+ The function returns a redacted PDF document along with processing output objects.
970
  '''
971
+ file_name = get_file_name_without_type(file_path)
972
  fill = (0, 0, 0) # Fill colour for redactions
973
  comprehend_query_number_new = 0
974
 
 
978
  nlp_analyser.registry.remove_recognizer("CUSTOM")
979
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
980
  #print("new_custom_recogniser:", new_custom_recogniser)
981
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
982
 
983
+ nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
984
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
985
+ #print("new_custom_recogniser:", new_custom_recogniser)
986
+ nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
987
 
988
+ image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
 
989
 
990
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
991
  print("Connection to AWS Comprehend service unsuccessful.")
 
1209
 
1210
  ## Apply annotations with pymupdf
1211
  else:
1212
+ print("merged_redaction_boxes:", merged_redaction_bboxes)
1213
  #print("redact_whole_page_list:", redact_whole_page_list)
1214
  if redact_whole_page_list:
1215
  int_reported_page_number = int(reported_page_number)
 
1491
  def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1492
  pikepdf_annotations_on_page = []
1493
  for analysed_bounding_box in analysed_bounding_boxes:
1494
+ #print("analysed_bounding_box:", analysed_bounding_boxes)
1495
+
1496
  bounding_box = analysed_bounding_box["boundingBox"]
1497
  annotation = Dictionary(
1498
  Type=Name.Annot,
 
1504
  IC=[0, 0, 0],
1505
  CA=1, # Transparency
1506
  T=analysed_bounding_box["result"].entity_type,
1507
+ Contents=analysed_bounding_box["text"],
1508
  BS=Dictionary(
1509
  W=0, # Border width: 1 point
1510
  S=Name.S # Border style: solid
 
1534
  comprehend_client="",
1535
  custom_recogniser_word_list:List[str]=[],
1536
  redact_whole_page_list:List[str]=[],
1537
+ max_fuzzy_spelling_mistakes_num:int=1,
1538
+ match_fuzzy_whole_phrase_bool:bool=True,
1539
  page_break_val: int = int(page_break_value), # Value for page break
1540
  max_time: int = int(max_time_value),
1541
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
 
1565
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1566
  - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
1567
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
1568
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
1569
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
1570
  - page_break_val: Value for page break
1571
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1572
  - progress: Progress tracking object
 
1582
  if custom_recogniser_word_list:
1583
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1584
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
 
1585
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1586
 
1587
+ nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
1588
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1589
+ nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1590
+
1591
  # List all elements currently in the nlp_analyser registry
1592
  #print("Current recognizers in nlp_analyser registry:")
1593
  #for recognizer_name in nlp_analyser.registry.recognizers:
 
1690
  language,
1691
  chosen_redact_entities,
1692
  chosen_redact_comprehend_entities,
1693
+ all_line_level_text_results_list,
1694
  all_line_characters,
1695
  page_analyser_results,
1696
  page_analysed_bounding_boxes,
 
1703
  comprehend_query_number
1704
  )
1705
 
 
1706
  #print("page_analyser_results:", page_analyser_results)
1707
  #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
1708
  #print("image:", image)
 
1717
  # Annotate redactions on page
1718
  pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1719
 
1720
+ # print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
1721
 
1722
  # Make pymupdf page redactions
1723
  #print("redact_whole_page_list:", redact_whole_page_list)
tools/helper_functions.py CHANGED
@@ -4,26 +4,12 @@ import boto3
4
  from botocore.exceptions import ClientError
5
  import gradio as gr
6
  import pandas as pd
 
7
  import unicodedata
8
  from typing import List
9
  from gradio_image_annotation import image_annotator
10
  from tools.auth import user_pool_id
11
 
12
- def reset_state_vars():
13
- return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
14
- label="Modify redaction boxes",
15
- label_list=["Redaction"],
16
- label_colors=[(0, 0, 0)],
17
- show_label=False,
18
- sources=None,#["upload"],
19
- show_clear_button=False,
20
- show_share_button=False,
21
- show_remove_button=False,
22
- interactive=False
23
- ), [], [], [], pd.DataFrame(), pd.DataFrame()
24
-
25
- def reset_review_vars():
26
- return [], pd.DataFrame(), pd.DataFrame()
27
 
28
  def get_or_create_env_var(var_name, default_value):
29
  # Get the environment variable if it exists
@@ -51,13 +37,40 @@ print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
51
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
52
  print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def load_in_default_allow_list(allow_list_file_path):
55
  if isinstance(allow_list_file_path, str):
56
  allow_list_file_path = [allow_list_file_path]
57
  return allow_list_file_path
58
 
59
 
60
- def get_file_path_end(file_path):
61
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
62
  basename = os.path.basename(file_path)
63
 
@@ -126,7 +139,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
126
  if regex_file_names:
127
  regex_file_name = regex_file_names[0]
128
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
129
- #regex_file_name_no_ext = get_file_path_end(regex_file_name)
130
 
131
  custom_regex.columns = custom_regex.columns.astype(str)
132
 
@@ -220,13 +233,41 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
220
  except Exception as e:
221
  print("Could not remove usage logs file", e)
222
 
223
- # Retrieving or setting CUSTOM_HEADER
224
- CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
225
- print(f'CUSTOM_HEADER found')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
- # Retrieving or setting CUSTOM_HEADER_VALUE
228
- CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
229
- print(f'CUSTOM_HEADER_VALUE found')
230
 
231
  async def get_connection_params(request: gr.Request):
232
  base_folder = ""
 
4
  from botocore.exceptions import ClientError
5
  import gradio as gr
6
  import pandas as pd
7
+ import numpy as np
8
  import unicodedata
9
  from typing import List
10
  from gradio_image_annotation import image_annotator
11
  from tools.auth import user_pool_id
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def get_or_create_env_var(var_name, default_value):
15
  # Get the environment variable if it exists
 
37
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
38
  print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
39
 
40
+ # Retrieving or setting CUSTOM_HEADER
41
+ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
42
+ print(f'CUSTOM_HEADER found')
43
+
44
+ # Retrieving or setting CUSTOM_HEADER_VALUE
45
+ CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
46
+ print(f'CUSTOM_HEADER_VALUE found')
47
+
48
+
49
def reset_state_vars():
    """Return fresh default values for the app's session-state components.

    Produces empty lists/dataframes, a zeroed counter, an empty string and a
    blank, non-interactive image annotator so the UI can be reset between runs.
    """
    blank_annotator = image_annotator(
        label="Modify redaction boxes",
        label_list=["Redaction"],
        label_colors=[(0, 0, 0)],
        show_label=False,
        sources=None,  # ["upload"]
        show_clear_button=False,
        show_share_button=False,
        show_remove_button=False,
        interactive=False,
    )
    return (
        [], [], pd.DataFrame(), pd.DataFrame(), 0, "", blank_annotator,
        [], [], [], pd.DataFrame(), pd.DataFrame(),
    )
61
+
62
def reset_review_vars():
    """Return cleared defaults for the review tab.

    Gives back an empty annotation list plus two distinct empty dataframes.
    """
    empty_frame = pd.DataFrame()
    return [], empty_frame, empty_frame.copy()
64
+
65
+
66
+
67
def load_in_default_allow_list(allow_list_file_path):
    """Normalise the default allow-list path to a list of paths.

    A bare string is wrapped in a single-element list; any other value
    (already a list of paths) is passed through unchanged.
    """
    if isinstance(allow_list_file_path, str):
        return [allow_list_file_path]
    return allow_list_file_path
71
 
72
 
73
+ def get_file_name_without_type(file_path):
74
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
75
  basename = os.path.basename(file_path)
76
 
 
139
  if regex_file_names:
140
  regex_file_name = regex_file_names[0]
141
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
142
+ #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)
143
 
144
  custom_regex.columns = custom_regex.columns.astype(str)
145
 
 
233
  except Exception as e:
234
  print("Could not remove usage logs file", e)
235
 
236
def merge_csv_files(file_list):
    """Merge multiple review-file CSVs into one de-duplicated CSV on disk.

    Parameters:
    - file_list: uploaded file objects (anything with a ``.name`` path
      attribute, e.g. gradio uploads) or plain path strings pointing to review
      CSVs that share page/label/color/bounding-box columns.

    Returns:
    - A single-element list containing the path of the merged CSV written to
      ``output_folder`` (an empty list if ``file_list`` is empty).
    """
    # Guard: pd.concat raises on an empty sequence.
    if not file_list:
        return []

    output_files = []

    # Accept both plain path strings and upload objects carrying a .name path.
    paths = [file if isinstance(file, str) else file.name for file in file_list]

    # Concatenate all review files into a single DataFrame.
    merged_df = pd.concat(
        (pd.read_csv(path) for path in paths), ignore_index=True
    )

    # Floor coordinates so near-identical boxes from different runs compare equal.
    for col in ['xmin', 'xmax', 'ymin', 'ymax']:
        merged_df[col] = np.floor(merged_df[col])

    merged_df = merged_df.drop_duplicates(subset=['page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax'])

    merged_df = merged_df.sort_values(['page', 'ymin', 'xmin', 'label'])

    # Bug fix: the original passed the upload *object* (not its path) to
    # os.path.basename; also strip the extension to avoid "x.csv_merged.csv".
    file_out_name = os.path.splitext(os.path.basename(paths[0]))[0]

    merged_csv_path = output_folder + file_out_name + "_merged.csv"

    # Save the merged DataFrame to a CSV file.
    merged_df.to_csv(merged_csv_path, index=False)
    output_files.append(merged_csv_path)

    return output_files
269
+
270
 
 
 
 
271
 
272
  async def get_connection_params(request: gr.Request):
273
  base_folder = ""
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -3,9 +3,13 @@ from typing import List
3
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
5
  import spacy
 
 
6
  spacy.prefer_gpu()
7
  from spacy.cli.download import download
 
8
  import re
 
9
 
10
  model_name = "en_core_web_sm" #"en_core_web_trf"
11
  score_threshold = 0.001
@@ -65,16 +69,8 @@ ukpostcode_pattern = Pattern(
65
  # Define the recognizer with one or more patterns
66
  ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])
67
 
68
- # %%
69
- # Examples for testing
70
-
71
- #text = "I live in 510 Broad st SE5 9NG ."
72
-
73
- #numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
74
- #print("Result:")
75
- #print(numbers_result)
76
 
77
- # %%
78
  def extract_street_name(text:str) -> str:
79
  """
80
  Extracts the street name and preceding word (that should contain at least one number) from the given text.
@@ -101,7 +97,7 @@ def extract_street_name(text:str) -> str:
101
  pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
102
 
103
  # Find all matches in text
104
- matches = re.finditer(pattern, text, re.IGNORECASE)
105
 
106
  start_positions = []
107
  end_positions = []
@@ -120,19 +116,6 @@ def extract_street_name(text:str) -> str:
120
 
121
  return start_positions, end_positions
122
 
123
-
124
- # %%
125
- # Some examples for testing
126
-
127
- #text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
128
- #text = "Roberto lives in Five 10 Broad st in Oregon"
129
- #text = "Roberto lives in 55 Oregon Square"
130
- #text = "There is 51a no way I will do that"
131
- #text = "I am writing to apply for"
132
-
133
- #extract_street_name(text)
134
-
135
- # %%
136
  class StreetNameRecognizer(EntityRecognizer):
137
 
138
  def load(self) -> None:
@@ -163,14 +146,181 @@ class StreetNameRecognizer(EntityRecognizer):
163
 
164
  street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  # Create a class inheriting from SpacyNlpEngine
167
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
168
  def __init__(self, loaded_spacy_model):
169
  super().__init__()
170
  self.nlp = {"en": loaded_spacy_model}
171
 
172
-
173
-
174
  # Pass the loaded model to the new LoadedSpacyNlpEngine
175
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
176
 
@@ -186,4 +336,5 @@ nlp_analyser.registry.add_recognizer(street_recogniser)
186
  nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
187
  nlp_analyser.registry.add_recognizer(titles_recogniser)
188
  nlp_analyser.registry.add_recognizer(custom_recogniser)
 
189
 
 
3
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
5
  import spacy
6
+ from spacy.matcher import Matcher, PhraseMatcher
7
+ from spaczz.matcher import FuzzyMatcher
8
  spacy.prefer_gpu()
9
  from spacy.cli.download import download
10
+ import Levenshtein
11
  import re
12
+ import gradio as gr
13
 
14
  model_name = "en_core_web_sm" #"en_core_web_trf"
15
  score_threshold = 0.001
 
69
  # Define the recognizer with one or more patterns
70
  ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])
71
 
72
+ ### Street name
 
 
 
 
 
 
 
73
 
 
74
  def extract_street_name(text:str) -> str:
75
  """
76
  Extracts the street name and preceding word (that should contain at least one number) from the given text.
 
97
  pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
98
 
99
  # Find all matches in text
100
+ matches = re.finditer(pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)
101
 
102
  start_positions = []
103
  end_positions = []
 
116
 
117
  return start_positions, end_positions
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  class StreetNameRecognizer(EntityRecognizer):
120
 
121
  def load(self) -> None:
 
146
 
147
  street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
148
 
149
## Custom fuzzy match recogniser for list of strings
def custom_fuzzy_word_list_regex(text: str, custom_list: List[str] = None):
    """
    Find whole-word occurrences of each term from custom_list in text.

    Matching is case-insensitive and anchored with word-boundary
    lookarounds so partial-word hits are excluded. A straight double
    quote inside a term is broadened so it also matches typographic
    (curly) double quotes in the text.

    Parameters:
        text: The text to search.
        custom_list: Terms to look for (whole-word, case-insensitive).

    Returns:
        Tuple of (start_positions, end_positions) — character offsets,
        one pair per match. Both lists are empty when custom_list has
        no usable terms.
    """
    start_positions = []
    end_positions = []

    # Drop empty / whitespace-only terms: they would become empty regex
    # alternatives, and an empty pattern matches at every character
    # position in the text (spurious matches everywhere).
    terms = [term.strip() for term in (custom_list or []) if term and term.strip()]
    if not terms:
        return start_positions, end_positions

    # Handle quotes carefully: let a straight double quote in a term
    # also match curly (typographic) double quotes in the text.
    quote_str = '"'
    replace_str = '(?:"|\u201c|\u201d)'

    custom_regex_pattern = '|'.join(
        rf'(?<!\w){re.escape(term).replace(quote_str, replace_str)}(?!\w)'
        for term in terms
    )

    # Find all matches in text
    for match in re.finditer(custom_regex_pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE):
        start_positions.append(match.start())
        end_positions.append(match.end())

    return start_positions, end_positions
175
+
176
def spacy_fuzzy_search(text: str, custom_query_list: List[str] = None, spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
    """
    Conduct a fuzzy match of a list of query strings against a text.

    Two modes:
      * search_whole_phrase=True  — each query is matched as one phrase
        with spaczz's FuzzyMatcher, then filtered by Levenshtein edit
        distance so only matches within spelling_mistakes_max edits of
        the query are kept.
      * search_whole_phrase=False — each query is split into non-stopword
        tokens and matched token-by-token with spaCy's Matcher using its
        FUZZYn operator.

    Parameters:
        text: The text to search.
        custom_query_list: Query strings to look for.
        spelling_mistakes_max: Maximum edits (insert/delete/substitute)
            tolerated per match.
        search_whole_phrase: Match each query as a whole phrase rather
            than per token.
        nlp: Loaded spaCy language model used for tokenising and matching.
        progress: Gradio progress tracker (kept for interface compatibility).

    Returns:
        Tuple of (start_positions, end_positions) — character offsets for
        all matches across all queries. Both lists are empty when no text
        is supplied or nothing matches.
    """
    all_start_positions = []
    all_end_positions = []

    if custom_query_list is None:
        custom_query_list = []

    # Return empty position lists (not a message string) so callers that
    # unpack and iterate two lists behave correctly when there is no text.
    if not text:
        print("No text provided for fuzzy search.")
        return all_start_positions, all_end_positions

    for string_query in custom_query_list:

        query = nlp(string_query)

        if not search_whole_phrase:
            # Keep only meaningful tokens (no spaces, stop words or punctuation).
            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]

            # Guard: a query made entirely of stop words / punctuation would
            # otherwise raise IndexError on token_query[0] below.
            if not token_query:
                continue

            # spaCy Matcher FUZZYn operator, e.g. "FUZZY1" allows one edit.
            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)

            if len(token_query) > 1:
                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
            else:
                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]

            matcher = Matcher(nlp.vocab)
            matcher.add(string_query, [pattern_fuzz])

        else:
            # If matching a whole phrase, use the spaczz FuzzyMatcher, then
            # confirm similarity with Levenshtein distance below.
            matcher = FuzzyMatcher(nlp.vocab)
            patterns = [nlp.make_doc(string_query)]  # Convert query into a Doc object
            matcher.add("PHRASE", patterns, [{"ignore_case": True}])

        batch_size = 256
        docs = nlp.pipe([text], batch_size=batch_size)

        for doc in docs:
            matches = matcher(doc)

            if not search_whole_phrase:
                for match_id, start, end in matches:
                    # Convert token positions to character positions.
                    start_char = doc[start].idx
                    end_char = doc[end - 1].idx + len(doc[end - 1])

                    all_start_positions.append(start_char)
                    all_end_positions.append(end_char)

            else:
                query_search = str(query).strip()

                for match_id, start, end, ratio, pattern in matches:
                    span = str(doc[start:end]).strip()

                    # Only keep matches within the allowed number of
                    # spelling mistakes (Levenshtein edit distance).
                    distance = Levenshtein.distance(query_search.lower(), span.lower())
                    if distance > spelling_mistakes_max:
                        continue

                    # Convert token positions to character positions.
                    start_char = doc[start].idx
                    end_char = doc[end - 1].idx + len(doc[end - 1])

                    all_start_positions.append(start_char)
                    all_end_positions.append(end_char)

    return all_start_positions, all_end_positions
283
+
284
+
285
class CustomWordFuzzyRecognizer(EntityRecognizer):
    """Presidio recognizer that reports fuzzy matches of a custom word list as CUSTOM_FUZZY entities."""

    def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
        super().__init__(supported_entities=supported_entities)
        # Search configuration consumed by analyze()
        self.custom_list = custom_list
        self.spelling_mistakes_max = spelling_mistakes_max
        self.search_whole_phrase = search_whole_phrase

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
        """Return one RecognizerResult per fuzzy match of the custom list found in text."""
        start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase)

        return [
            RecognizerResult(entity_type="CUSTOM_FUZZY", start=s, end=e, score=1)
            for s, e in zip(start_pos, end_pos)
        ]
314
+
315
# Module-level recognizer registered with the analyser below; starts with an
# empty deny list — presumably replaced at runtime with the user's custom
# terms (TODO confirm against the caller that updates custom_list).
custom_list_default = []
custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
317
+
318
# Create a class inheriting from SpacyNlpEngine
class LoadedSpacyNlpEngine(SpacyNlpEngine):
    # Wraps an already-loaded spaCy model so it can be handed to Presidio's
    # AnalyzerEngine instead of Presidio loading its own copy.
    def __init__(self, loaded_spacy_model):
        super().__init__()
        # Presidio looks up NLP models by language code; register under "en".
        self.nlp = {"en": loaded_spacy_model}
323
 
 
 
324
  # Pass the loaded model to the new LoadedSpacyNlpEngine
325
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
326
 
 
336
  nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
337
  nlp_analyser.registry.add_recognizer(titles_recogniser)
338
  nlp_analyser.registry.add_recognizer(custom_recogniser)
339
+ nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
340
 
tools/redaction_review.py CHANGED
@@ -8,7 +8,7 @@ from typing import List
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
10
  from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
11
- from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
12
  from tools.file_redaction import redact_page_with_pymupdf
13
  import json
14
  import os
@@ -68,6 +68,12 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
68
  for image, items in image_groups.items():
69
  # Filter items with non-empty boxes
70
  non_empty_boxes = [item for item in items if item.get('boxes')]
 
 
 
 
 
 
71
  if non_empty_boxes:
72
  # Keep the first entry with non-empty boxes
73
  result.append(non_empty_boxes[0])
@@ -175,6 +181,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
175
 
176
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
177
 
 
 
178
  out_image_annotator = image_annotator(
179
  value = image_annotator_object[page_num_reported - 1],
180
  boxes_alpha=0.1,
@@ -264,7 +272,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
264
 
265
  for file_path in file_paths:
266
  #print("file_path:", file_path)
267
- file_name_without_ext = get_file_path_end(file_path)
268
  file_name_with_ext = os.path.basename(file_path)
269
 
270
  file_extension = os.path.splitext(file_path)[1].lower()
@@ -544,7 +552,7 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
544
  else:
545
  file_path = file.name
546
 
547
- file_path_name = get_file_path_end(file_path)
548
  file_path_end = detect_file_type(file_path)
549
 
550
  if file_path_end == "pdf":
@@ -675,7 +683,7 @@ def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
675
  else:
676
  file_path = file.name
677
 
678
- file_path_name = get_file_path_end(file_path)
679
  file_path_end = detect_file_type(file_path)
680
 
681
  if file_path_end == "pdf":
@@ -699,7 +707,7 @@ def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
699
  # else:
700
  # xfdf_path = xfdf_paths[0].name
701
 
702
- file_path_name = get_file_path_end(xfdf_path)
703
 
704
  #print("file_path_name:", file_path_name)
705
 
 
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
10
  from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
11
+ from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
12
  from tools.file_redaction import redact_page_with_pymupdf
13
  import json
14
  import os
 
68
  for image, items in image_groups.items():
69
  # Filter items with non-empty boxes
70
  non_empty_boxes = [item for item in items if item.get('boxes')]
71
+
72
+ # Remove 'text' elements from boxes
73
+ for item in non_empty_boxes:
74
+ if 'boxes' in item:
75
+ item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
76
+
77
  if non_empty_boxes:
78
  # Keep the first entry with non-empty boxes
79
  result.append(non_empty_boxes[0])
 
181
 
182
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
183
 
184
+
185
+
186
  out_image_annotator = image_annotator(
187
  value = image_annotator_object[page_num_reported - 1],
188
  boxes_alpha=0.1,
 
272
 
273
  for file_path in file_paths:
274
  #print("file_path:", file_path)
275
+ file_name_without_ext = get_file_name_without_type(file_path)
276
  file_name_with_ext = os.path.basename(file_path)
277
 
278
  file_extension = os.path.splitext(file_path)[1].lower()
 
552
  else:
553
  file_path = file.name
554
 
555
+ file_path_name = get_file_name_without_type(file_path)
556
  file_path_end = detect_file_type(file_path)
557
 
558
  if file_path_end == "pdf":
 
683
  else:
684
  file_path = file.name
685
 
686
+ file_path_name = get_file_name_without_type(file_path)
687
  file_path_end = detect_file_type(file_path)
688
 
689
  if file_path_end == "pdf":
 
707
  # else:
708
  # xfdf_path = xfdf_paths[0].name
709
 
710
+ file_path_name = get_file_name_without_type(xfdf_path)
711
 
712
  #print("file_path_name:", file_path_name)
713