seanpedrickcase commited on
Commit
bde6e5b
·
1 Parent(s): 6b28cfa

Fuzzy match implementation for deny list. Added option to merge multiple review files. Review files from redaction step should now include text.

Browse files
app.py CHANGED
@@ -10,7 +10,7 @@ from datetime import datetime
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
@@ -30,15 +30,16 @@ ensure_output_folder_exists()
30
 
31
  chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
32
 
33
- full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
34
 
35
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
36
  chosen_comprehend_entities.extend(custom_entities)
37
  full_comprehend_entity_list.extend(custom_entities)
38
 
 
39
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
40
 
41
- full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
42
 
43
  language = 'en'
44
 
@@ -68,7 +69,6 @@ with app:
68
  pdf_doc_state = gr.State([])
69
  all_image_annotations_state = gr.State([])
70
 
71
-
72
  all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
73
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
74
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
@@ -261,7 +261,7 @@ with app:
261
 
262
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
263
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
264
- adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple')
265
  convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
266
 
267
  ###
@@ -325,9 +325,12 @@ with app:
325
 
326
  with gr.Accordion("Select entity types to redact", open = True):
327
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
328
-
329
  in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
330
 
 
 
 
 
331
  with gr.Accordion("Redact only selected pages", open = False):
332
  with gr.Row():
333
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
@@ -341,7 +344,16 @@ with app:
341
  with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
342
  anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
343
 
344
- log_files_output = gr.File(label="Log file output", interactive=False)
 
 
 
 
 
 
 
 
 
345
 
346
  ###
347
  # PDF/IMAGE REDACTION
@@ -350,12 +362,12 @@ with app:
350
 
351
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
352
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
353
- then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
354
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
355
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
356
 
357
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
358
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
359
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
360
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
361
 
@@ -461,6 +473,10 @@ with app:
461
  in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
462
  in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
463
  in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
 
 
 
464
 
465
 
466
  ###
 
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
 
30
 
31
  chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
32
 
33
+ full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
34
 
35
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
36
  chosen_comprehend_entities.extend(custom_entities)
37
  full_comprehend_entity_list.extend(custom_entities)
38
 
39
+ # Entities for local PII redaction option
40
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
41
 
42
+ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']
43
 
44
  language = 'en'
45
 
 
69
  pdf_doc_state = gr.State([])
70
  all_image_annotations_state = gr.State([])
71
 
 
72
  all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
73
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
74
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
 
261
 
262
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
263
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
264
+ adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
265
  convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
266
 
267
  ###
 
325
 
326
  with gr.Accordion("Select entity types to redact", open = True):
327
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
 
328
  in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
329
 
330
+ with gr.Row():
331
+ max_fuzzy_spelling_mistakes_num = gr.Number(label="Maximum number of spelling mistakes allowed for fuzzy matching (CUSTOM_FUZZY entity).", value=1, minimum=0, maximum=9, precision=0)
332
+ match_fuzzy_whole_phrase_bool = gr.Checkbox(label="Should fuzzy match on entire phrases in deny list (as opposed to each word individually)?", value=True)
333
+
334
  with gr.Accordion("Redact only selected pages", open = False):
335
  with gr.Row():
336
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
 
344
  with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
345
  anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
346
 
347
+ log_files_output = gr.File(label="Log file output", interactive=False)
348
+
349
+ with gr.Accordion("Combine multiple review files", open = False):
350
+ multiple_review_files_in_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv'])
351
+ merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
352
+
353
+
354
+
355
+
356
+ ### UI INTERACTION ###
357
 
358
  ###
359
  # PDF/IMAGE REDACTION
 
362
 
363
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
364
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
365
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
366
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
367
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
368
 
369
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
370
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
371
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
372
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
373
 
 
473
  in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
474
  in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
475
  in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
476
+
477
+
478
+ # Merge multiple review csv files together
479
+ merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
480
 
481
 
482
  ###
requirements.txt CHANGED
@@ -16,6 +16,8 @@ boto3==1.35.83
16
  pyarrow==18.1.0
17
  openpyxl==3.1.2
18
  Faker==22.2.0
 
 
19
  gradio_image_annotation==0.2.5
20
  numpy==1.26.4
21
  awslambdaric==3.0.0
 
16
  pyarrow==18.1.0
17
  openpyxl==3.1.2
18
  Faker==22.2.0
19
+ python-levenshtein==0.26.1
20
+ spaczz==0.6.1
21
  gradio_image_annotation==0.2.5
22
  numpy==1.26.4
23
  awslambdaric==3.0.0
tools/custom_image_analyser_engine.py CHANGED
@@ -560,7 +560,7 @@ def run_page_text_redaction(
560
  if not nlp_analyser:
561
  raise ValueError("nlp_analyser is required for Local identification method")
562
 
563
- print("page text:", page_text)
564
 
565
  page_analyser_result = nlp_analyser.analyze(
566
  text=page_text,
@@ -1077,15 +1077,15 @@ class CustomImageAnalyzerEngine:
1077
  line_length = len(line_text)
1078
  redaction_text = redaction_relevant_ocr_result.text
1079
 
1080
- # print(f"Processing line: '{line_text}'")
1081
 
1082
  for redaction_result in text_analyzer_results:
1083
- # print(f"Checking redaction result: {redaction_result}")
1084
- # print("redaction_text:", redaction_text)
1085
- # print("line_length:", line_length)
1086
- # print("line_text:", line_text)
1087
 
1088
- # Check if the redaction text is no in the allow list
1089
 
1090
  if redaction_text not in allow_list:
1091
 
@@ -1098,14 +1098,45 @@ class CustomImageAnalyzerEngine:
1098
  matched_words = matched_text.split()
1099
 
1100
  # print(f"Found match: '{matched_text}' in line")
 
 
 
 
 
 
1101
 
1102
  # Find the corresponding words in the OCR results
1103
  matching_word_boxes = []
 
 
 
 
 
1104
  for word_info in ocr_results_with_children_child_info.get('words', []):
1105
- # Check if this word is part of our match
1106
- if any(word.lower() in word_info['text'].lower() for word in matched_words):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1107
  matching_word_boxes.append(word_info['bounding_box'])
1108
- # print(f"Matched word: {word_info['text']}")
1109
 
1110
  if matching_word_boxes:
1111
  # Calculate the combined bounding box for all matching words
@@ -1127,7 +1158,7 @@ class CustomImageAnalyzerEngine:
1127
  text=matched_text
1128
  )
1129
  )
1130
- # print(f"Added bounding box for: '{matched_text}'")
1131
 
1132
  return redaction_bboxes
1133
 
 
560
  if not nlp_analyser:
561
  raise ValueError("nlp_analyser is required for Local identification method")
562
 
563
+ #print("page text:", page_text)
564
 
565
  page_analyser_result = nlp_analyser.analyze(
566
  text=page_text,
 
1077
  line_length = len(line_text)
1078
  redaction_text = redaction_relevant_ocr_result.text
1079
 
1080
+ #print(f"Processing line: '{line_text}'")
1081
 
1082
  for redaction_result in text_analyzer_results:
1083
+ #print(f"Checking redaction result: {redaction_result}")
1084
+ #print("redaction_text:", redaction_text)
1085
+ #print("line_length:", line_length)
1086
+ #print("line_text:", line_text)
1087
 
1088
+ # Check if the redaction text is not in the allow list
1089
 
1090
  if redaction_text not in allow_list:
1091
 
 
1098
  matched_words = matched_text.split()
1099
 
1100
  # print(f"Found match: '{matched_text}' in line")
1101
+
1102
+ # for word_info in ocr_results_with_children_child_info.get('words', []):
1103
+ # # Check if this word is part of our match
1104
+ # if any(word.lower() in word_info['text'].lower() for word in matched_words):
1105
+ # matching_word_boxes.append(word_info['bounding_box'])
1106
+ # print(f"Matched word: {word_info['text']}")
1107
 
1108
  # Find the corresponding words in the OCR results
1109
  matching_word_boxes = []
1110
+
1111
+ #print("ocr_results_with_children_child_info:", ocr_results_with_children_child_info)
1112
+
1113
+ current_position = 0
1114
+
1115
  for word_info in ocr_results_with_children_child_info.get('words', []):
1116
+ word_text = word_info['text']
1117
+ word_length = len(word_text)
1118
+
1119
+ # Assign start and end character positions
1120
+ #word_info['start_position'] = current_position
1121
+ #word_info['end_position'] = current_position + word_length
1122
+
1123
+ word_start = current_position
1124
+ word_end = current_position + word_length
1125
+
1126
+ # Update current position for the next word
1127
+ current_position += word_length + 1 # +1 for the space after the word
1128
+
1129
+ #print("word_info['bounding_box']:", word_info['bounding_box'])
1130
+ #print("word_start:", word_start)
1131
+ #print("start_in_line:", start_in_line)
1132
+
1133
+ #print("word_end:", word_end)
1134
+ #print("end_in_line:", end_in_line)
1135
+
1136
+ # Check if the word's bounding box is within the start and end bounds
1137
+ if word_start >= start_in_line and word_end <= (end_in_line + 1):
1138
  matching_word_boxes.append(word_info['bounding_box'])
1139
+ #print(f"Matched word: {word_info['text']}")
1140
 
1141
  if matching_word_boxes:
1142
  # Calculate the combined bounding box for all matching words
 
1158
  text=matched_text
1159
  )
1160
  )
1161
+ #print(f"Added bounding box for: '{matched_text}'")
1162
 
1163
  return redaction_bboxes
1164
 
tools/data_anonymise.py CHANGED
@@ -12,7 +12,7 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerR
12
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
13
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
 
15
- from tools.helper_functions import output_folder, get_file_path_end, read_file, detect_file_type
16
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
17
 
18
  # Use custom version of analyze_dict to be able to track progress
@@ -434,7 +434,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
434
  file_type = detect_file_type(anon_file)
435
  print("File type is:", file_type)
436
 
437
- out_file_part = get_file_path_end(anon_file.name)
438
 
439
  if file_type == 'xlsx':
440
  print("Running through all xlsx sheets")
@@ -472,7 +472,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
472
  else:
473
  sheet_name = ""
474
  anon_df = read_file(anon_file)
475
- out_file_part = get_file_path_end(anon_file.name)
476
  out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
477
 
478
  # Increase latest file completed count unless we are at the last file
 
12
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
13
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
 
15
+ from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
16
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
17
 
18
  # Use custom version of analyze_dict to be able to track progress
 
434
  file_type = detect_file_type(anon_file)
435
  print("File type is:", file_type)
436
 
437
+ out_file_part = get_file_name_without_type(anon_file.name)
438
 
439
  if file_type == 'xlsx':
440
  print("Running through all xlsx sheets")
 
472
  else:
473
  sheet_name = ""
474
  anon_df = read_file(anon_file)
475
+ out_file_part = get_file_name_without_type(anon_file.name)
476
  out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
477
 
478
  # Increase latest file completed count unless we are at the last file
tools/file_conversion.py CHANGED
@@ -1,5 +1,5 @@
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
- from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
@@ -7,6 +7,7 @@ import time
7
  import json
8
  import pymupdf
9
  import pandas as pd
 
10
  from pymupdf import Rect
11
  from fitz import Page
12
  from tqdm import tqdm
@@ -240,7 +241,7 @@ def get_input_file_names(file_input:List[str]):
240
  else:
241
  file_path = file.name
242
 
243
- file_path_without_ext = get_file_path_end(file_path)
244
 
245
  file_extension = os.path.splitext(file_path)[1].lower()
246
 
@@ -489,7 +490,7 @@ def prepare_image_or_pdf(
489
  file_path = file
490
  else:
491
  file_path = file.name
492
- file_path_without_ext = get_file_path_end(file_path)
493
  file_name_with_ext = os.path.basename(file_path)
494
 
495
  if not file_path:
@@ -668,7 +669,7 @@ def prepare_image_or_pdf(
668
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
669
 
670
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
671
- file_path_without_ext = get_file_path_end(in_file_path)
672
 
673
  out_file_paths = out_text_file_path
674
 
@@ -754,7 +755,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
754
  if 'text' not in box:
755
  data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
756
  else:
757
- data_to_add = {"image": image_path, "page": reported_number, "text": annotation['text'], **box}
758
  #print("data_to_add:", data_to_add)
759
  flattened_annotation_data.append(data_to_add)
760
 
@@ -764,7 +765,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
764
  #print("redaction_decision_output:", redaction_decision_output)
765
  #print("annotation_data_as_df:", annotation_data_as_df)
766
 
767
- # Join on additional text data from decision output results if included
768
  if not redaction_decision_output.empty:
769
  #print("redaction_decision_output is not empty")
770
  #print("redaction_decision_output:", redaction_decision_output)
@@ -793,6 +794,9 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
793
  if col not in annotation_data_as_df.columns:
794
  annotation_data_as_df[col] = ''
795
 
 
 
 
796
  annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
797
 
798
  return annotation_data_as_df
 
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
+ from tools.helper_functions import get_file_name_without_type, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
 
7
  import json
8
  import pymupdf
9
  import pandas as pd
10
+ import numpy as np
11
  from pymupdf import Rect
12
  from fitz import Page
13
  from tqdm import tqdm
 
241
  else:
242
  file_path = file.name
243
 
244
+ file_path_without_ext = get_file_name_without_type(file_path)
245
 
246
  file_extension = os.path.splitext(file_path)[1].lower()
247
 
 
490
  file_path = file
491
  else:
492
  file_path = file.name
493
+ file_path_without_ext = get_file_name_without_type(file_path)
494
  file_name_with_ext = os.path.basename(file_path)
495
 
496
  if not file_path:
 
669
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
670
 
671
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
672
+ file_path_without_ext = get_file_name_without_type(in_file_path)
673
 
674
  out_file_paths = out_text_file_path
675
 
 
755
  if 'text' not in box:
756
  data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
757
  else:
758
+ data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
759
  #print("data_to_add:", data_to_add)
760
  flattened_annotation_data.append(data_to_add)
761
 
 
765
  #print("redaction_decision_output:", redaction_decision_output)
766
  #print("annotation_data_as_df:", annotation_data_as_df)
767
 
768
+ # Join on additional text data from decision output results if included, if text not already there
769
  if not redaction_decision_output.empty:
770
  #print("redaction_decision_output is not empty")
771
  #print("redaction_decision_output:", redaction_decision_output)
 
794
  if col not in annotation_data_as_df.columns:
795
  annotation_data_as_df[col] = ''
796
 
797
+ for col in ['xmin', 'xmax', 'ymin', 'ymax']:
798
+ annotation_data_as_df[col] = np.floor(annotation_data_as_df[col])
799
+
800
  annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
801
 
802
  return annotation_data_as_df
tools/file_redaction.py CHANGED
@@ -27,8 +27,8 @@ from presidio_analyzer import RecognizerResult
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
31
- from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
33
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
34
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
@@ -94,6 +94,8 @@ def choose_and_run_redactor(file_paths:List[str],
94
  page_break_return:bool=False,
95
  pii_identification_method:str="Local",
96
  comprehend_query_number:int=0,
 
 
97
  output_folder:str=output_folder,
98
  progress=gr.Progress(track_tqdm=True)):
99
  '''
@@ -127,6 +129,8 @@ def choose_and_run_redactor(file_paths:List[str],
127
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
128
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
129
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
 
 
130
  - output_folder (str, optional): Output folder for results.
131
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
132
 
@@ -279,9 +283,9 @@ def choose_and_run_redactor(file_paths:List[str],
279
  file_path = file.name
280
 
281
  if file_path:
282
- pdf_file_name_without_ext = get_file_path_end(file_path)
283
  pdf_file_name_with_ext = os.path.basename(file_path)
284
- print("Redacting file:", pdf_file_name_with_ext)
285
 
286
  is_a_pdf = is_pdf(file_path) == True
287
  if is_a_pdf == False and in_redact_method == text_ocr_option:
@@ -327,7 +331,9 @@ def choose_and_run_redactor(file_paths:List[str],
327
  comprehend_client,
328
  textract_client,
329
  custom_recogniser_word_list,
330
- redact_whole_page_list)
 
 
331
 
332
 
333
  #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
@@ -366,7 +372,9 @@ def choose_and_run_redactor(file_paths:List[str],
366
  comprehend_query_number,
367
  comprehend_client,
368
  custom_recogniser_word_list,
369
- redact_whole_page_list)
 
 
370
 
371
  else:
372
  out_message = "No redaction method selected"
@@ -414,13 +422,7 @@ def choose_and_run_redactor(file_paths:List[str],
414
 
415
  # Save the gradio_annotation_boxes to a JSON file
416
  try:
417
- #print("Saving annotations to JSON")
418
-
419
- out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
420
- with open(out_annotation_file_path, 'w') as f:
421
- json.dump(annotations_all_pages, f)
422
- log_files_output_paths.append(out_annotation_file_path)
423
-
424
  #print("Saving annotations to CSV")
425
 
426
  # Convert json to csv and also save this
@@ -435,6 +437,13 @@ def choose_and_run_redactor(file_paths:List[str],
435
 
436
  print("Saved review file to csv")
437
 
 
 
 
 
 
 
 
438
  except Exception as e:
439
  print("Could not save annotations to json or csv file:", e)
440
 
@@ -694,10 +703,10 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
694
  x1 = pymupdf_x1
695
  x2 = pymupdf_x2
696
 
697
- # if hasattr(annot, 'text') and annot.text:
698
- # img_annotation_box["text"] = annot.text
699
- # else:
700
- # img_annotation_box["text"] = ""
701
 
702
  # Else should be CustomImageRecognizerResult
703
  else:
@@ -715,10 +724,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
715
  img_annotation_box["label"] = annot.entity_type
716
  except:
717
  img_annotation_box["label"] = "Redaction"
718
- # if hasattr(annot, 'text') and annot.text:
719
- # img_annotation_box["text"] = annot.text
720
- # else:
721
- # img_annotation_box["text"] = ""
 
722
 
723
  rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
724
 
@@ -749,12 +759,14 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
749
 
750
  if isinstance(annot, Dictionary):
751
  img_annotation_box["label"] = str(annot["/T"])
 
 
 
 
 
752
  else:
753
  img_annotation_box["label"] = "REDACTION"
754
- # if hasattr(annot, 'text') and annot.text:
755
- # img_annotation_box["text"] = annot.text
756
- # else:
757
- # img_annotation_box["text"] = ""
758
 
759
  # Convert to a PyMuPDF Rect object
760
  #rect = Rect(rect_coordinates)
@@ -913,6 +925,8 @@ def redact_image_pdf(file_path:str,
913
  textract_client:str="",
914
  custom_recogniser_word_list:List[str]=[],
915
  redact_whole_page_list:List[str]=[],
 
 
916
  page_break_val:int=int(page_break_value),
917
  log_files_output_paths:List=[],
918
  max_time:int=int(max_time_value),
@@ -945,14 +959,16 @@ def redact_image_pdf(file_path:str,
945
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
946
  - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
947
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
 
 
948
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
949
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
950
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
951
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
952
 
953
- The function returns a fully or partially-redacted PDF document.
954
  '''
955
- file_name = get_file_path_end(file_path)
956
  fill = (0, 0, 0) # Fill colour for redactions
957
  comprehend_query_number_new = 0
958
 
@@ -962,11 +978,14 @@ def redact_image_pdf(file_path:str,
962
  nlp_analyser.registry.remove_recognizer("CUSTOM")
963
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
964
  #print("new_custom_recogniser:", new_custom_recogniser)
965
- nlp_analyser.registry.add_recognizer(new_custom_recogniser)
966
 
 
 
 
 
967
 
968
- image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
969
-
970
 
971
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
972
  print("Connection to AWS Comprehend service unsuccessful.")
@@ -1190,6 +1209,7 @@ def redact_image_pdf(file_path:str,
1190
 
1191
  ## Apply annotations with pymupdf
1192
  else:
 
1193
  #print("redact_whole_page_list:", redact_whole_page_list)
1194
  if redact_whole_page_list:
1195
  int_reported_page_number = int(reported_page_number)
@@ -1471,6 +1491,8 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
1471
  def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1472
  pikepdf_annotations_on_page = []
1473
  for analysed_bounding_box in analysed_bounding_boxes:
 
 
1474
  bounding_box = analysed_bounding_box["boundingBox"]
1475
  annotation = Dictionary(
1476
  Type=Name.Annot,
@@ -1482,6 +1504,7 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1482
  IC=[0, 0, 0],
1483
  CA=1, # Transparency
1484
  T=analysed_bounding_box["result"].entity_type,
 
1485
  BS=Dictionary(
1486
  W=0, # Border width: 1 point
1487
  S=Name.S # Border style: solid
@@ -1511,6 +1534,8 @@ def redact_text_pdf(
1511
  comprehend_client="",
1512
  custom_recogniser_word_list:List[str]=[],
1513
  redact_whole_page_list:List[str]=[],
 
 
1514
  page_break_val: int = int(page_break_value), # Value for page break
1515
  max_time: int = int(max_time_value),
1516
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
@@ -1540,6 +1565,8 @@ def redact_text_pdf(
1540
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1541
  - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
1542
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
 
 
1543
  - page_break_val: Value for page break
1544
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1545
  - progress: Progress tracking object
@@ -1555,9 +1582,12 @@ def redact_text_pdf(
1555
  if custom_recogniser_word_list:
1556
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1557
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
1558
- #print("new_custom_recogniser:", new_custom_recogniser)
1559
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1560
 
 
 
 
 
1561
  # List all elements currently in the nlp_analyser registry
1562
  #print("Current recognizers in nlp_analyser registry:")
1563
  #for recognizer_name in nlp_analyser.registry.recognizers:
@@ -1660,7 +1690,7 @@ def redact_text_pdf(
1660
  language,
1661
  chosen_redact_entities,
1662
  chosen_redact_comprehend_entities,
1663
- all_line_level_text_results_list, #line_level_text_results_list,
1664
  all_line_characters,
1665
  page_analyser_results,
1666
  page_analysed_bounding_boxes,
@@ -1673,7 +1703,6 @@ def redact_text_pdf(
1673
  comprehend_query_number
1674
  )
1675
 
1676
-
1677
  #print("page_analyser_results:", page_analyser_results)
1678
  #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
1679
  #print("image:", image)
@@ -1688,7 +1717,7 @@ def redact_text_pdf(
1688
  # Annotate redactions on page
1689
  pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1690
 
1691
- #print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
1692
 
1693
  # Make pymupdf page redactions
1694
  #print("redact_whole_page_list:", redact_whole_page_list)
 
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
31
+ from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
33
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
34
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
 
94
  page_break_return:bool=False,
95
  pii_identification_method:str="Local",
96
  comprehend_query_number:int=0,
97
+ max_fuzzy_spelling_mistakes_num:int=1,
98
+ match_fuzzy_whole_phrase_bool:bool=True,
99
  output_folder:str=output_folder,
100
  progress=gr.Progress(track_tqdm=True)):
101
  '''
 
129
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
130
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
131
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
132
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
133
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
134
  - output_folder (str, optional): Output folder for results.
135
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
136
 
 
283
  file_path = file.name
284
 
285
  if file_path:
286
+ pdf_file_name_without_ext = get_file_name_without_type(file_path)
287
  pdf_file_name_with_ext = os.path.basename(file_path)
288
+ # print("Redacting file:", pdf_file_name_with_ext)
289
 
290
  is_a_pdf = is_pdf(file_path) == True
291
  if is_a_pdf == False and in_redact_method == text_ocr_option:
 
331
  comprehend_client,
332
  textract_client,
333
  custom_recogniser_word_list,
334
+ redact_whole_page_list,
335
+ max_fuzzy_spelling_mistakes_num,
336
+ match_fuzzy_whole_phrase_bool)
337
 
338
 
339
  #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
 
372
  comprehend_query_number,
373
  comprehend_client,
374
  custom_recogniser_word_list,
375
+ redact_whole_page_list,
376
+ max_fuzzy_spelling_mistakes_num,
377
+ match_fuzzy_whole_phrase_bool)
378
 
379
  else:
380
  out_message = "No redaction method selected"
 
422
 
423
  # Save the gradio_annotation_boxes to a JSON file
424
  try:
425
+
 
 
 
 
 
 
426
  #print("Saving annotations to CSV")
427
 
428
  # Convert json to csv and also save this
 
437
 
438
  print("Saved review file to csv")
439
 
440
+ out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
441
+ with open(out_annotation_file_path, 'w') as f:
442
+ json.dump(annotations_all_pages, f)
443
+ log_files_output_paths.append(out_annotation_file_path)
444
+
445
+ print("Saving annotations to JSON")
446
+
447
  except Exception as e:
448
  print("Could not save annotations to json or csv file:", e)
449
 
 
703
  x1 = pymupdf_x1
704
  x2 = pymupdf_x2
705
 
706
+ if hasattr(annot, 'text') and annot.text:
707
+ img_annotation_box["text"] = annot.text
708
+ else:
709
+ img_annotation_box["text"] = ""
710
 
711
  # Else should be CustomImageRecognizerResult
712
  else:
 
724
  img_annotation_box["label"] = annot.entity_type
725
  except:
726
  img_annotation_box["label"] = "Redaction"
727
+
728
+ if hasattr(annot, 'text') and annot.text:
729
+ img_annotation_box["text"] = annot.text
730
+ else:
731
+ img_annotation_box["text"] = ""
732
 
733
  rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
734
 
 
759
 
760
  if isinstance(annot, Dictionary):
761
  img_annotation_box["label"] = str(annot["/T"])
762
+
763
+ if hasattr(annot, 'Contents'):
764
+ img_annotation_box["text"] = annot.Contents
765
+ else:
766
+ img_annotation_box["text"] = ""
767
  else:
768
  img_annotation_box["label"] = "REDACTION"
769
+ img_annotation_box["text"] = ""
 
 
 
770
 
771
  # Convert to a PyMuPDF Rect object
772
  #rect = Rect(rect_coordinates)
 
925
  textract_client:str="",
926
  custom_recogniser_word_list:List[str]=[],
927
  redact_whole_page_list:List[str]=[],
928
+ max_fuzzy_spelling_mistakes_num:int=1,
929
+ match_fuzzy_whole_phrase_bool:bool=True,
930
  page_break_val:int=int(page_break_value),
931
  log_files_output_paths:List=[],
932
  max_time:int=int(max_time_value),
 
959
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
960
  - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
961
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
962
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
963
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
964
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
965
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
966
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
967
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
968
 
969
+ The function returns a redacted PDF document along with processing output objects.
970
  '''
971
+ file_name = get_file_name_without_type(file_path)
972
  fill = (0, 0, 0) # Fill colour for redactions
973
  comprehend_query_number_new = 0
974
 
 
978
  nlp_analyser.registry.remove_recognizer("CUSTOM")
979
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
980
  #print("new_custom_recogniser:", new_custom_recogniser)
981
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
982
 
983
+ nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
984
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
985
+ #print("new_custom_recogniser:", new_custom_recogniser)
986
+ nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
987
 
988
+ image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
 
989
 
990
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
991
  print("Connection to AWS Comprehend service unsuccessful.")
 
1209
 
1210
  ## Apply annotations with pymupdf
1211
  else:
1212
+ print("merged_redaction_boxes:", merged_redaction_bboxes)
1213
  #print("redact_whole_page_list:", redact_whole_page_list)
1214
  if redact_whole_page_list:
1215
  int_reported_page_number = int(reported_page_number)
 
1491
  def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1492
  pikepdf_annotations_on_page = []
1493
  for analysed_bounding_box in analysed_bounding_boxes:
1494
+ #print("analysed_bounding_box:", analysed_bounding_boxes)
1495
+
1496
  bounding_box = analysed_bounding_box["boundingBox"]
1497
  annotation = Dictionary(
1498
  Type=Name.Annot,
 
1504
  IC=[0, 0, 0],
1505
  CA=1, # Transparency
1506
  T=analysed_bounding_box["result"].entity_type,
1507
+ Contents=analysed_bounding_box["text"],
1508
  BS=Dictionary(
1509
  W=0, # Border width: 1 point
1510
  S=Name.S # Border style: solid
 
1534
  comprehend_client="",
1535
  custom_recogniser_word_list:List[str]=[],
1536
  redact_whole_page_list:List[str]=[],
1537
+ max_fuzzy_spelling_mistakes_num:int=1,
1538
+ match_fuzzy_whole_phrase_bool:bool=True,
1539
  page_break_val: int = int(page_break_value), # Value for page break
1540
  max_time: int = int(max_time_value),
1541
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
 
1565
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1566
  - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
1567
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
1568
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
1569
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
1570
  - page_break_val: Value for page break
1571
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1572
  - progress: Progress tracking object
 
1582
  if custom_recogniser_word_list:
1583
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1584
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
 
1585
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1586
 
1587
+ nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
1588
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1589
+ nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1590
+
1591
  # List all elements currently in the nlp_analyser registry
1592
  #print("Current recognizers in nlp_analyser registry:")
1593
  #for recognizer_name in nlp_analyser.registry.recognizers:
 
1690
  language,
1691
  chosen_redact_entities,
1692
  chosen_redact_comprehend_entities,
1693
+ all_line_level_text_results_list,
1694
  all_line_characters,
1695
  page_analyser_results,
1696
  page_analysed_bounding_boxes,
 
1703
  comprehend_query_number
1704
  )
1705
 
 
1706
  #print("page_analyser_results:", page_analyser_results)
1707
  #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
1708
  #print("image:", image)
 
1717
  # Annotate redactions on page
1718
  pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1719
 
1720
+ # print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
1721
 
1722
  # Make pymupdf page redactions
1723
  #print("redact_whole_page_list:", redact_whole_page_list)
tools/helper_functions.py CHANGED
@@ -4,26 +4,12 @@ import boto3
4
  from botocore.exceptions import ClientError
5
  import gradio as gr
6
  import pandas as pd
 
7
  import unicodedata
8
  from typing import List
9
  from gradio_image_annotation import image_annotator
10
  from tools.auth import user_pool_id
11
 
12
- def reset_state_vars():
13
- return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
14
- label="Modify redaction boxes",
15
- label_list=["Redaction"],
16
- label_colors=[(0, 0, 0)],
17
- show_label=False,
18
- sources=None,#["upload"],
19
- show_clear_button=False,
20
- show_share_button=False,
21
- show_remove_button=False,
22
- interactive=False
23
- ), [], [], [], pd.DataFrame(), pd.DataFrame()
24
-
25
- def reset_review_vars():
26
- return [], pd.DataFrame(), pd.DataFrame()
27
 
28
  def get_or_create_env_var(var_name, default_value):
29
  # Get the environment variable if it exists
@@ -51,13 +37,40 @@ print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
51
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
52
  print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def load_in_default_allow_list(allow_list_file_path):
55
  if isinstance(allow_list_file_path, str):
56
  allow_list_file_path = [allow_list_file_path]
57
  return allow_list_file_path
58
 
59
 
60
- def get_file_path_end(file_path):
61
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
62
  basename = os.path.basename(file_path)
63
 
@@ -126,7 +139,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
126
  if regex_file_names:
127
  regex_file_name = regex_file_names[0]
128
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
129
- #regex_file_name_no_ext = get_file_path_end(regex_file_name)
130
 
131
  custom_regex.columns = custom_regex.columns.astype(str)
132
 
@@ -220,13 +233,41 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
220
  except Exception as e:
221
  print("Could not remove usage logs file", e)
222
 
223
- # Retrieving or setting CUSTOM_HEADER
224
- CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
225
- print(f'CUSTOM_HEADER found')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
- # Retrieving or setting CUSTOM_HEADER_VALUE
228
- CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
229
- print(f'CUSTOM_HEADER_VALUE found')
230
 
231
  async def get_connection_params(request: gr.Request):
232
  base_folder = ""
 
4
  from botocore.exceptions import ClientError
5
  import gradio as gr
6
  import pandas as pd
7
+ import numpy as np
8
  import unicodedata
9
  from typing import List
10
  from gradio_image_annotation import image_annotator
11
  from tools.auth import user_pool_id
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def get_or_create_env_var(var_name, default_value):
15
  # Get the environment variable if it exists
 
37
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
38
  print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
39
 
40
+ # Retrieving or setting CUSTOM_HEADER
41
+ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
42
+ print(f'CUSTOM_HEADER found')
43
+
44
+ # Retrieving or setting CUSTOM_HEADER_VALUE
45
+ CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
46
+ print(f'CUSTOM_HEADER_VALUE found')
47
+
48
+
49
def reset_state_vars():
    """Return fresh default values for the app's session-state components.

    Produces empty lists/dataframes, a zeroed counter, an empty string and a
    blank, non-interactive image annotator so the UI can be reset between runs.
    """
    blank_annotator = image_annotator(
        label="Modify redaction boxes",
        label_list=["Redaction"],
        label_colors=[(0, 0, 0)],
        show_label=False,
        sources=None,  # ["upload"]
        show_clear_button=False,
        show_share_button=False,
        show_remove_button=False,
        interactive=False,
    )
    return (
        [], [], pd.DataFrame(), pd.DataFrame(), 0, "", blank_annotator,
        [], [], [], pd.DataFrame(), pd.DataFrame(),
    )
61
+
62
def reset_review_vars():
    """Return cleared defaults for the review tab.

    Gives back an empty annotation list plus two distinct empty dataframes.
    """
    empty_frame = pd.DataFrame()
    return [], empty_frame, empty_frame.copy()
64
+
65
+
66
+
67
def load_in_default_allow_list(allow_list_file_path):
    """Normalise the default allow-list path to a list of paths.

    A bare string is wrapped in a single-element list; any other value
    (already a list of paths) is passed through unchanged.
    """
    if isinstance(allow_list_file_path, str):
        return [allow_list_file_path]
    return allow_list_file_path
71
 
72
 
73
+ def get_file_name_without_type(file_path):
74
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
75
  basename = os.path.basename(file_path)
76
 
 
139
  if regex_file_names:
140
  regex_file_name = regex_file_names[0]
141
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
142
+ #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)
143
 
144
  custom_regex.columns = custom_regex.columns.astype(str)
145
 
 
233
  except Exception as e:
234
  print("Could not remove usage logs file", e)
235
 
236
def merge_csv_files(file_list):
    """Merge multiple review-file CSVs into one de-duplicated CSV on disk.

    Parameters:
    - file_list: uploaded file objects (anything with a ``.name`` path
      attribute, e.g. gradio uploads) or plain path strings pointing to review
      CSVs that share page/label/color/bounding-box columns.

    Returns:
    - A single-element list containing the path of the merged CSV written to
      ``output_folder`` (an empty list if ``file_list`` is empty).
    """
    # Guard: pd.concat raises on an empty sequence.
    if not file_list:
        return []

    output_files = []

    # Accept both plain path strings and upload objects carrying a .name path.
    paths = [file if isinstance(file, str) else file.name for file in file_list]

    # Concatenate all review files into a single DataFrame.
    merged_df = pd.concat(
        (pd.read_csv(path) for path in paths), ignore_index=True
    )

    # Floor coordinates so near-identical boxes from different runs compare equal.
    for col in ['xmin', 'xmax', 'ymin', 'ymax']:
        merged_df[col] = np.floor(merged_df[col])

    merged_df = merged_df.drop_duplicates(subset=['page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax'])

    merged_df = merged_df.sort_values(['page', 'ymin', 'xmin', 'label'])

    # Bug fix: the original passed the upload *object* (not its path) to
    # os.path.basename; also strip the extension to avoid "x.csv_merged.csv".
    file_out_name = os.path.splitext(os.path.basename(paths[0]))[0]

    merged_csv_path = output_folder + file_out_name + "_merged.csv"

    # Save the merged DataFrame to a CSV file.
    merged_df.to_csv(merged_csv_path, index=False)
    output_files.append(merged_csv_path)

    return output_files
269
+
270
 
 
 
 
271
 
272
  async def get_connection_params(request: gr.Request):
273
  base_folder = ""
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -3,9 +3,13 @@ from typing import List
3
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
5
  import spacy
 
 
6
  spacy.prefer_gpu()
7
  from spacy.cli.download import download
 
8
  import re
 
9
 
10
  model_name = "en_core_web_sm" #"en_core_web_trf"
11
  score_threshold = 0.001
@@ -65,16 +69,8 @@ ukpostcode_pattern = Pattern(
65
  # Define the recognizer with one or more patterns
66
  ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])
67
 
68
- # %%
69
- # Examples for testing
70
-
71
- #text = "I live in 510 Broad st SE5 9NG ."
72
-
73
- #numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
74
- #print("Result:")
75
- #print(numbers_result)
76
 
77
- # %%
78
  def extract_street_name(text:str) -> str:
79
  """
80
  Extracts the street name and preceding word (that should contain at least one number) from the given text.
@@ -101,7 +97,7 @@ def extract_street_name(text:str) -> str:
101
  pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
102
 
103
  # Find all matches in text
104
- matches = re.finditer(pattern, text, re.IGNORECASE)
105
 
106
  start_positions = []
107
  end_positions = []
@@ -120,19 +116,6 @@ def extract_street_name(text:str) -> str:
120
 
121
  return start_positions, end_positions
122
 
123
-
124
- # %%
125
- # Some examples for testing
126
-
127
- #text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
128
- #text = "Roberto lives in Five 10 Broad st in Oregon"
129
- #text = "Roberto lives in 55 Oregon Square"
130
- #text = "There is 51a no way I will do that"
131
- #text = "I am writing to apply for"
132
-
133
- #extract_street_name(text)
134
-
135
- # %%
136
  class StreetNameRecognizer(EntityRecognizer):
137
 
138
  def load(self) -> None:
@@ -163,14 +146,181 @@ class StreetNameRecognizer(EntityRecognizer):
163
 
164
  street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  # Create a class inheriting from SpacyNlpEngine
167
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
168
  def __init__(self, loaded_spacy_model):
169
  super().__init__()
170
  self.nlp = {"en": loaded_spacy_model}
171
 
172
-
173
-
174
  # Pass the loaded model to the new LoadedSpacyNlpEngine
175
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
176
 
@@ -186,4 +336,5 @@ nlp_analyser.registry.add_recognizer(street_recogniser)
186
  nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
187
  nlp_analyser.registry.add_recognizer(titles_recogniser)
188
  nlp_analyser.registry.add_recognizer(custom_recogniser)
 
189
 
 
3
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
5
  import spacy
6
+ from spacy.matcher import Matcher, PhraseMatcher
7
+ from spaczz.matcher import FuzzyMatcher
8
  spacy.prefer_gpu()
9
  from spacy.cli.download import download
10
+ import Levenshtein
11
  import re
12
+ import gradio as gr
13
 
14
  model_name = "en_core_web_sm" #"en_core_web_trf"
15
  score_threshold = 0.001
 
69
  # Define the recognizer with one or more patterns
70
  ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])
71
 
72
+ ### Street name
 
 
 
 
 
 
 
73
 
 
74
  def extract_street_name(text:str) -> str:
75
  """
76
  Extracts the street name and preceding word (that should contain at least one number) from the given text.
 
97
  pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
98
 
99
  # Find all matches in text
100
+ matches = re.finditer(pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)
101
 
102
  start_positions = []
103
  end_positions = []
 
116
 
117
  return start_positions, end_positions
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  class StreetNameRecognizer(EntityRecognizer):
120
 
121
  def load(self) -> None:
 
146
 
147
  street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
148
 
149
## Custom fuzzy match recogniser for list of strings
def custom_fuzzy_word_list_regex(text: str, custom_list: List[str] = None):
    """
    Find whole-word occurrences of each term from custom_list in text.

    Matching is case-insensitive and anchored with word-boundary
    lookarounds so partial-word hits are excluded. A straight double
    quote inside a term is broadened so it also matches typographic
    (curly) double quotes in the text.

    Parameters:
        text: The text to search.
        custom_list: Terms to look for (whole-word, case-insensitive).

    Returns:
        Tuple of (start_positions, end_positions) — character offsets,
        one pair per match. Both lists are empty when custom_list has
        no usable terms.
    """
    start_positions = []
    end_positions = []

    # Drop empty / whitespace-only terms: they would become empty regex
    # alternatives, and an empty pattern matches at every character
    # position in the text (spurious matches everywhere).
    terms = [term.strip() for term in (custom_list or []) if term and term.strip()]
    if not terms:
        return start_positions, end_positions

    # Handle quotes carefully: let a straight double quote in a term
    # also match curly (typographic) double quotes in the text.
    quote_str = '"'
    replace_str = '(?:"|\u201c|\u201d)'

    custom_regex_pattern = '|'.join(
        rf'(?<!\w){re.escape(term).replace(quote_str, replace_str)}(?!\w)'
        for term in terms
    )

    # Find all matches in text
    for match in re.finditer(custom_regex_pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE):
        start_positions.append(match.start())
        end_positions.append(match.end())

    return start_positions, end_positions
175
+
176
def spacy_fuzzy_search(text: str, custom_query_list: List[str] = None, spelling_mistakes_max: int = 1, search_whole_phrase: bool = True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
    """
    Conduct a fuzzy match of a list of query strings against a text.

    Two modes:
      * search_whole_phrase=True  — each query is matched as one phrase
        with spaczz's FuzzyMatcher, then filtered by Levenshtein edit
        distance so only matches within spelling_mistakes_max edits of
        the query are kept.
      * search_whole_phrase=False — each query is split into non-stopword
        tokens and matched token-by-token with spaCy's Matcher using its
        FUZZYn operator.

    Parameters:
        text: The text to search.
        custom_query_list: Query strings to look for.
        spelling_mistakes_max: Maximum edits (insert/delete/substitute)
            tolerated per match.
        search_whole_phrase: Match each query as a whole phrase rather
            than per token.
        nlp: Loaded spaCy language model used for tokenising and matching.
        progress: Gradio progress tracker (kept for interface compatibility).

    Returns:
        Tuple of (start_positions, end_positions) — character offsets for
        all matches across all queries. Both lists are empty when no text
        is supplied or nothing matches.
    """
    all_start_positions = []
    all_end_positions = []

    if custom_query_list is None:
        custom_query_list = []

    # Return empty position lists (not a message string) so callers that
    # unpack and iterate two lists behave correctly when there is no text.
    if not text:
        print("No text provided for fuzzy search.")
        return all_start_positions, all_end_positions

    for string_query in custom_query_list:

        query = nlp(string_query)

        if not search_whole_phrase:
            # Keep only meaningful tokens (no spaces, stop words or punctuation).
            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]

            # Guard: a query made entirely of stop words / punctuation would
            # otherwise raise IndexError on token_query[0] below.
            if not token_query:
                continue

            # spaCy Matcher FUZZYn operator, e.g. "FUZZY1" allows one edit.
            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)

            if len(token_query) > 1:
                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
            else:
                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]

            matcher = Matcher(nlp.vocab)
            matcher.add(string_query, [pattern_fuzz])

        else:
            # If matching a whole phrase, use the spaczz FuzzyMatcher, then
            # confirm similarity with Levenshtein distance below.
            matcher = FuzzyMatcher(nlp.vocab)
            patterns = [nlp.make_doc(string_query)]  # Convert query into a Doc object
            matcher.add("PHRASE", patterns, [{"ignore_case": True}])

        batch_size = 256
        docs = nlp.pipe([text], batch_size=batch_size)

        for doc in docs:
            matches = matcher(doc)

            if not search_whole_phrase:
                for match_id, start, end in matches:
                    # Convert token positions to character positions.
                    start_char = doc[start].idx
                    end_char = doc[end - 1].idx + len(doc[end - 1])

                    all_start_positions.append(start_char)
                    all_end_positions.append(end_char)

            else:
                query_search = str(query).strip()

                for match_id, start, end, ratio, pattern in matches:
                    span = str(doc[start:end]).strip()

                    # Only keep matches within the allowed number of
                    # spelling mistakes (Levenshtein edit distance).
                    distance = Levenshtein.distance(query_search.lower(), span.lower())
                    if distance > spelling_mistakes_max:
                        continue

                    # Convert token positions to character positions.
                    start_char = doc[start].idx
                    end_char = doc[end - 1].idx + len(doc[end - 1])

                    all_start_positions.append(start_char)
                    all_end_positions.append(end_char)

    return all_start_positions, all_end_positions
283
+
284
+
285
class CustomWordFuzzyRecognizer(EntityRecognizer):
    """Presidio recognizer that reports fuzzy matches of a custom word list as CUSTOM_FUZZY entities."""

    def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
        super().__init__(supported_entities=supported_entities)
        # Search configuration consumed by analyze()
        self.custom_list = custom_list
        self.spelling_mistakes_max = spelling_mistakes_max
        self.search_whole_phrase = search_whole_phrase

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
        """Return one RecognizerResult per fuzzy match of the custom list found in text."""
        start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase)

        return [
            RecognizerResult(entity_type="CUSTOM_FUZZY", start=s, end=e, score=1)
            for s, e in zip(start_pos, end_pos)
        ]
314
+
315
# Module-level recognizer registered with the analyser below; starts with an
# empty deny list — presumably replaced at runtime with the user's custom
# terms (TODO confirm against the caller that updates custom_list).
custom_list_default = []
custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
317
+
318
# Create a class inheriting from SpacyNlpEngine
class LoadedSpacyNlpEngine(SpacyNlpEngine):
    # Wraps an already-loaded spaCy model so it can be handed to Presidio's
    # AnalyzerEngine instead of Presidio loading its own copy.
    def __init__(self, loaded_spacy_model):
        super().__init__()
        # Presidio looks up NLP models by language code; register under "en".
        self.nlp = {"en": loaded_spacy_model}
323
 
 
 
324
  # Pass the loaded model to the new LoadedSpacyNlpEngine
325
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
326
 
 
336
  nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
337
  nlp_analyser.registry.add_recognizer(titles_recogniser)
338
  nlp_analyser.registry.add_recognizer(custom_recogniser)
339
+ nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
340
 
tools/redaction_review.py CHANGED
@@ -8,7 +8,7 @@ from typing import List
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
10
  from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
11
- from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
12
  from tools.file_redaction import redact_page_with_pymupdf
13
  import json
14
  import os
@@ -68,6 +68,12 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
68
  for image, items in image_groups.items():
69
  # Filter items with non-empty boxes
70
  non_empty_boxes = [item for item in items if item.get('boxes')]
 
 
 
 
 
 
71
  if non_empty_boxes:
72
  # Keep the first entry with non-empty boxes
73
  result.append(non_empty_boxes[0])
@@ -175,6 +181,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
175
 
176
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
177
 
 
 
178
  out_image_annotator = image_annotator(
179
  value = image_annotator_object[page_num_reported - 1],
180
  boxes_alpha=0.1,
@@ -264,7 +272,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
264
 
265
  for file_path in file_paths:
266
  #print("file_path:", file_path)
267
- file_name_without_ext = get_file_path_end(file_path)
268
  file_name_with_ext = os.path.basename(file_path)
269
 
270
  file_extension = os.path.splitext(file_path)[1].lower()
@@ -544,7 +552,7 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
544
  else:
545
  file_path = file.name
546
 
547
- file_path_name = get_file_path_end(file_path)
548
  file_path_end = detect_file_type(file_path)
549
 
550
  if file_path_end == "pdf":
@@ -675,7 +683,7 @@ def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
675
  else:
676
  file_path = file.name
677
 
678
- file_path_name = get_file_path_end(file_path)
679
  file_path_end = detect_file_type(file_path)
680
 
681
  if file_path_end == "pdf":
@@ -699,7 +707,7 @@ def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
699
  # else:
700
  # xfdf_path = xfdf_paths[0].name
701
 
702
- file_path_name = get_file_path_end(xfdf_path)
703
 
704
  #print("file_path_name:", file_path_name)
705
 
 
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
10
  from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
11
+ from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
12
  from tools.file_redaction import redact_page_with_pymupdf
13
  import json
14
  import os
 
68
  for image, items in image_groups.items():
69
  # Filter items with non-empty boxes
70
  non_empty_boxes = [item for item in items if item.get('boxes')]
71
+
72
+ # Remove 'text' elements from boxes
73
+ for item in non_empty_boxes:
74
+ if 'boxes' in item:
75
+ item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
76
+
77
  if non_empty_boxes:
78
  # Keep the first entry with non-empty boxes
79
  result.append(non_empty_boxes[0])
 
181
 
182
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
183
 
184
+
185
+
186
  out_image_annotator = image_annotator(
187
  value = image_annotator_object[page_num_reported - 1],
188
  boxes_alpha=0.1,
 
272
 
273
  for file_path in file_paths:
274
  #print("file_path:", file_path)
275
+ file_name_without_ext = get_file_name_without_type(file_path)
276
  file_name_with_ext = os.path.basename(file_path)
277
 
278
  file_extension = os.path.splitext(file_path)[1].lower()
 
552
  else:
553
  file_path = file.name
554
 
555
+ file_path_name = get_file_name_without_type(file_path)
556
  file_path_end = detect_file_type(file_path)
557
 
558
  if file_path_end == "pdf":
 
683
  else:
684
  file_path = file.name
685
 
686
+ file_path_name = get_file_name_without_type(file_path)
687
  file_path_end = detect_file_type(file_path)
688
 
689
  if file_path_end == "pdf":
 
707
  # else:
708
  # xfdf_path = xfdf_paths[0].name
709
 
710
+ file_path_name = get_file_name_without_type(xfdf_path)
711
 
712
  #print("file_path_name:", file_path_name)
713