seanpedrickcase committed
Commit 6ac4be4 · Parent(s): 613b1b4

Greatly improved regex for direct matching with custom entities
app.py CHANGED
@@ -197,7 +197,7 @@ with app:
     # Object annotation
     with gr.Tab("Review redactions", id="tab_object_annotation"):
 
-        with gr.Accordion(label = "Review redaction file", open=True):
+        with gr.Accordion(label = "Review redaction file", open=False):
             output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
             upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
 
@@ -418,13 +418,13 @@ with app:
     app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
     # If running on AWS, load in the default allow list file from S3
-    if RUN_AWS_FUNCTIONS == "1":
-        print("default_allow_list_output_folder_location:", default_allow_list_loc)
-        if not os.path.exists(default_allow_list_loc):
-            app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
-            then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
-        else:
-            app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+    # if RUN_AWS_FUNCTIONS == "1":
+    #     print("default_allow_list_output_folder_location:", default_allow_list_loc)
+    #     if not os.path.exists(default_allow_list_loc):
+    #         app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
+    #         then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+    #     else:
+    #         app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
 
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
tools/custom_image_analyser_engine.py CHANGED
@@ -14,6 +14,7 @@ from tools.helper_functions import clean_unicode_text
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 from tools.load_spacy_model_custom_recognisers import custom_entities
 #import string # Import string to get a list of common punctuation characters
+import re # Add this import at the top of the file
 
 @dataclass
 class OCRResult:
@@ -493,11 +494,12 @@ class CustomImageAnalyzerEngine:
 
             elif pii_identification_method == "AWS Comprehend":
 
-                # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. Comprehend can't pick up Titles, Streetnames, and UKPostcodes specifically
+                # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. This is because Comprehend can't pick up Titles, Streetnames, and UKPostcodes, or a custom deny list specifically
                 text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
 
                 spacy_analyzer_result = self.analyzer_engine.analyze(
                     text=line_level_ocr_result.text, **text_analyzer_kwargs)
+
                 analyzer_results_by_line[i].extend(spacy_analyzer_result)
 
                 if len(line_level_ocr_result.text) >= 3:
@@ -573,7 +575,7 @@ class CustomImageAnalyzerEngine:
         for result in analyzer_result:
            # Extract the relevant portion of text based on start and end
            relevant_text = line_level_ocr_results[i].text[result.start:result.end]
-
+
            # Find the corresponding entry in ocr_results_with_children
            child_words = ocr_results_with_children_line_level['words']
 
@@ -583,13 +585,23 @@ class CustomImageAnalyzerEngine:
            word_num = 0 # Initialize word count
            total_width = 0 # Initialize total width
 
-           for word_text in relevant_text.split(): # Iterate through each word in relevant_text
-               #print("Looking for word_text:", word_text)
-               for word in child_words:
-                   #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
-                   if word_text in word['text']:
+           split_relevant_text = relevant_text.split()
+
+           loop_child_words = child_words.copy()
+
+           for word_text in split_relevant_text: # Iterate through each word in relevant_text
+
+               quote_str = '"'
+               replace_str = '(?:"|“|”)'
+
+               word_regex = rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'
+
+               for word in loop_child_words:
+                   # Check for regex as whole word
+
+                   if re.search(word_regex, word['text']):
+                       #if re.search(r'\b' + re.escape(word_text) + r'\b', word['text']):
                        found_word = word
-                       #print("found_word:", found_word)
 
                        if word_num == 0: # First word
                            left = found_word['bounding_box'][0]
@@ -598,6 +610,10 @@ class CustomImageAnalyzerEngine:
                        all_words += found_word['text'] + " " # Concatenate words
                        total_width = found_word['bounding_box'][2] - left # Add each word's width
                        word_num += 1
+
+                       # Drop the first word of child_words
+                       loop_child_words = loop_child_words[1:] # Skip the first word
+
                        break # Move to the next word in relevant_text
 
            width = total_width + horizontal_buffer # Set width to total width of all matched words
@@ -621,9 +637,9 @@ class CustomImageAnalyzerEngine:
            result_reset_pos.start = 0
            result_reset_pos.end = len(relevant_text)
 
-           #print("result_reset_pos:", result_reset_pos)
-           #print("relevant_line_ocr_result:", relevant_line_ocr_result)
-           #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
+           print("result_reset_pos:", result_reset_pos)
+           print("relevant_line_ocr_result:", relevant_line_ocr_result)
+           print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
 
            # Map the analyzer results to bounding boxes for this line
            line_results = self.map_analyzer_results_to_bounding_boxes(
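
As a quick illustration of what the new word-level matching does, here is a minimal standalone sketch: the helper function and sample tokens are invented, and the quote group assumes the `.replace()` step targets straight and curly double quotes.

import re

quote_str = '"'
replace_str = '(?:"|“|”)'  # assumed: straight and curly double-quote variants

def whole_word_regex(word_text: str) -> str:
    # Same construction as the new loop above: escape the token, let a
    # straight quote in the search term also match curly quotes, and
    # forbid word characters on either side of the match.
    return rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'

print(bool(re.search(whole_word_regex('Smith'), 'Smiths')))     # False - the old substring test said True
print(bool(re.search(whole_word_regex('Smith'), 'Smith,')))     # True - punctuation counts as a boundary
print(bool(re.search(whole_word_regex('"Smith"'), '“Smith”')))  # True - quote variants unified

Unlike the previous `word_text in word['text']` check, partial words no longer match, which stops a short custom entity from matching inside every longer OCR word that happens to contain it.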
tools/file_redaction.py CHANGED
@@ -760,8 +760,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
     # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
-            print("handwriting_recogniser_results:", handwriting_recogniser_results)
-
             merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))
 
         if "Redact all identified signatures" in handwrite_signature_checkbox:
@@ -972,9 +970,6 @@ def redact_image_pdf(file_path:str,
     print("Page range:", str(page_min + 1), "to", str(page_max))
     #print("Current_loop_page:", current_loop_page)
 
-    if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
-    elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
-
     # If running Textract, check if file already exists. If it does, load in existing data
     # Import results from json and convert
     if analysis_type == textract_option:
@@ -983,7 +978,6 @@ def redact_image_pdf(file_path:str,
         log_files_output_paths.append(json_file_path)
 
         if not os.path.exists(json_file_path):
-            no_textract_file = True
             print("No existing Textract results file found.")
             existing_data = {}
             #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
@@ -1041,12 +1035,8 @@ def redact_image_pdf(file_path:str,
 
         # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
         if analysis_type == tesseract_ocr_option:
-
            word_level_ocr_results = image_analyser.perform_ocr(image)
-
-           # Combine OCR results
            line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)
-
 
        # Import results from json and convert
        if analysis_type == textract_option:
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -24,15 +24,22 @@ except:
     print("Successfully downloaded and imported spaCy model", model_name)
 
 # #### Custom recognisers
-# Allow user to create their own recogniser
 def custom_word_list_recogniser(custom_list:List[str]=[]):
-    #custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}" for term in custom_list) + '\\b'
-    custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}(?=\W|$)" for term in custom_list)
-    custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
-
-    #print("custom_pattern:", custom_pattern)
+    # Create regex pattern, handling quotes carefully
+
+    quote_str = '"'
+    replace_str = '(?:"|“|”)'
+
+    custom_regex = '|'.join(
+        rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
+        for term in custom_list
+    )
+    print(custom_regex)
+
+    custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
+
     custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
-                                          global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
+                                          global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
 
     return custom_recogniser
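
To see the rebuilt recogniser end to end, here is a minimal sketch. The deny list and sample text are invented, and it assumes a presidio-analyzer version whose PatternRecognizer accepts `global_regex_flags`, as the code above does.

import re
from presidio_analyzer import Pattern, PatternRecognizer

custom_list = ['Jane Smith', '"Project X"']  # made-up deny list

quote_str = '"'
replace_str = '(?:"|“|”)'  # assumed quote variants, as in the diff

custom_regex = '|'.join(
    rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
    for term in custom_list
)

custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score=1)
custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM",
                                      patterns=[custom_pattern],
                                      global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)

results = custom_recogniser.analyze('Contact jane smith about “Project X”.', entities=["CUSTOM"])
print(results)  # two CUSTOM hits: IGNORECASE catches 'jane smith', the quote group catches “Project X”

The switch from `'\\b' + ... + '(?=\W|$)'` to `(?<!\w)...(?!\w)` matters for terms that start or end with punctuation: `\b` only asserts a boundary between a word character and a non-word character, so it can never match immediately before a leading quote, whereas the negative lookarounds hold there.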