Commit 6ac4be4
Parent(s): 613b1b4

Greatly improved regex for direct matching with custom entities

Files changed:
- app.py +8 -8
- tools/custom_image_analyser_engine.py +27 -11
- tools/file_redaction.py +0 -10
- tools/load_spacy_model_custom_recognisers.py +13 -6
app.py CHANGED

@@ -197,7 +197,7 @@ with app:
    # Object annotation
    with gr.Tab("Review redactions", id="tab_object_annotation"):

-       with gr.Accordion(label = "Review redaction file", open=
+       with gr.Accordion(label = "Review redaction file", open=False):
            output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
            upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")

@@ -418,13 +418,13 @@ with app:
    app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])

    # If running on AWS, load in the default allow list file from S3
-   if RUN_AWS_FUNCTIONS == "1":
-       print("default_allow_list_output_folder_location:", default_allow_list_loc)
-       if not os.path.exists(default_allow_list_loc):
-           app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
-           then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
-       else:
-           app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+   # if RUN_AWS_FUNCTIONS == "1":
+   #     print("default_allow_list_output_folder_location:", default_allow_list_loc)
+   #     if not os.path.exists(default_allow_list_loc):
+   #         app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
+   #         then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+   #     else:
+   #         app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])

    # Log usernames and times of access to file (to know who is using the app when running on AWS)
    access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
tools/custom_image_analyser_engine.py CHANGED

@@ -14,6 +14,7 @@ from tools.helper_functions import clean_unicode_text
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 from tools.load_spacy_model_custom_recognisers import custom_entities
 #import string # Import string to get a list of common punctuation characters
+import re # Add this import at the top of the file

 @dataclass
 class OCRResult:

@@ -493,11 +494,12 @@ class CustomImageAnalyzerEngine:

        elif pii_identification_method == "AWS Comprehend":

-           # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. Comprehend can't pick up Titles, Streetnames, and UKPostcodes specifically
+           # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. This is because Comprehend can't pick up Titles, Streetnames, and UKPostcodes, or a custom deny list specifically
            text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]

            spacy_analyzer_result = self.analyzer_engine.analyze(
                text=line_level_ocr_result.text, **text_analyzer_kwargs)
+
            analyzer_results_by_line[i].extend(spacy_analyzer_result)

            if len(line_level_ocr_result.text) >= 3:

@@ -573,7 +575,7 @@ class CustomImageAnalyzerEngine:
        for result in analyzer_result:
            # Extract the relevant portion of text based on start and end
            relevant_text = line_level_ocr_results[i].text[result.start:result.end]
-
+
            # Find the corresponding entry in ocr_results_with_children
            child_words = ocr_results_with_children_line_level['words']

@@ -583,13 +585,23 @@ class CustomImageAnalyzerEngine:
            word_num = 0 # Initialize word count
            total_width = 0 # Initialize total width

-           [previous word-matching code; removed lines not shown in this view]
+           split_relevant_text = relevant_text.split()
+
+           loop_child_words = child_words.copy()
+
+           for word_text in split_relevant_text: # Iterate through each word in relevant_text
+
+               quote_str = '"'
+               replace_str = '(?:"|“|”)'
+
+               word_regex = rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'
+
+               for word in loop_child_words:
+                   # Check for regex as whole word
+
+                   if re.search(word_regex, word['text']):
+                   #if re.search(r'\b' + re.escape(word_text) + r'\b', word['text']):
                        found_word = word
-                       #print("found_word:", found_word)

                        if word_num == 0: # First word
                            left = found_word['bounding_box'][0]

@@ -598,6 +610,10 @@ class CustomImageAnalyzerEngine:
                        all_words += found_word['text'] + " " # Concatenate words
                        total_width = found_word['bounding_box'][2] - left # Add each word's width
                        word_num += 1
+
+                       # Drop the first word of child_words
+                       loop_child_words = loop_child_words[1:] # Skip the first word
+
                        break # Move to the next word in relevant_text

            width = total_width + horizontal_buffer # Set width to total width of all matched words

@@ -621,9 +637,9 @@ class CustomImageAnalyzerEngine:
            result_reset_pos.start = 0
            result_reset_pos.end = len(relevant_text)

-           [three removed lines not shown in this view]
+           print("result_reset_pos:", result_reset_pos)
+           print("relevant_line_ocr_result:", relevant_line_ocr_result)
+           print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)

            # Map the analyzer results to bounding boxes for this line
            line_results = self.map_analyzer_results_to_bounding_boxes(
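For context, here is a minimal standalone sketch of the word-matching approach introduced above (illustrative only: the helper name and the sample OCR words are invented, and it consumes the word list up to the matched word rather than always dropping the first element as the loop in the diff does). Each token of the detected entity text becomes a whole-word regex built with (?<!\w) and (?!\w) lookarounds, with straight quotes widened to also match typographic quotes, and is searched against the remaining OCR word entries:

import re

def find_matching_words(relevant_text, child_words):
    """Return the OCR word dicts whose text matches successive tokens of the entity text."""
    matched = []
    remaining = child_words.copy()
    for token in relevant_text.split():
        # Escape the token, then let a straight quote also match typographic quotes
        escaped = re.escape(token.strip()).replace('"', '(?:"|“|”)')
        # Lookarounds act like word boundaries but still work when the token
        # itself starts or ends with punctuation, which \b does not
        word_regex = rf'(?<!\w){escaped}(?!\w)'
        for word in remaining:
            if re.search(word_regex, word['text']):
                matched.append(word)
                # Consume up to and including the matched word, then move on
                remaining = remaining[remaining.index(word) + 1:]
                break
    return matched

# Hypothetical OCR output for one line
words = [{'text': '"Smith",', 'bounding_box': [10, 5, 60, 20]},
         {'text': 'Ltd.', 'bounding_box': [65, 5, 90, 20]}]
print([w['text'] for w in find_matching_words('"Smith", Ltd.', words)])
# ['"Smith",', 'Ltd.']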
tools/file_redaction.py CHANGED

@@ -760,8 +760,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
    # Process signature and handwriting results
    if signature_recogniser_results or handwriting_recogniser_results:
        if "Redact all identified handwriting" in handwrite_signature_checkbox:
-           print("handwriting_recogniser_results:", handwriting_recogniser_results)
-
            merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))

        if "Redact all identified signatures" in handwrite_signature_checkbox:

@@ -972,9 +970,6 @@ def redact_image_pdf(file_path:str,
    print("Page range:", str(page_min + 1), "to", str(page_max))
    #print("Current_loop_page:", current_loop_page)

-   if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
-   elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
-
    # If running Textract, check if file already exists. If it does, load in existing data
    # Import results from json and convert
    if analysis_type == textract_option:

@@ -983,7 +978,6 @@ def redact_image_pdf(file_path:str,
        log_files_output_paths.append(json_file_path)

        if not os.path.exists(json_file_path):
-           no_textract_file = True
            print("No existing Textract results file found.")
            existing_data = {}
            #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract

@@ -1041,12 +1035,8 @@ def redact_image_pdf(file_path:str,

    # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
    if analysis_type == tesseract_ocr_option:
-
        word_level_ocr_results = image_analyser.perform_ocr(image)
-
-       # Combine OCR results
        line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)
-

    # Import results from json and convert
    if analysis_type == textract_option:
tools/load_spacy_model_custom_recognisers.py CHANGED

@@ -24,15 +24,22 @@ except:
    print("Successfully downloaded and imported spaCy model", model_name)

 # #### Custom recognisers
-# Allow user to create their own recogniser
 def custom_word_list_recogniser(custom_list:List[str]=[]):
-   #
-   custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}(?=\W|$)" for term in custom_list)
-   custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
-
+   # Create regex pattern, handling quotes carefully

+   quote_str = '"'
+   replace_str = '(?:"|“|”)'
+
+   custom_regex = '|'.join(
+       rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
+       for term in custom_list
+   )
+   print(custom_regex)
+
+   custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
+
    custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
-   [removed continuation line not shown in this view]
+                                         global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)

    return custom_recogniser
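To show why the lookaround-based pattern is an improvement over the previous '\b'-based one, here is a small self-contained comparison (the terms and sample text are invented for illustration). A \b assertion fails between two non-word characters, so deny-list terms that start or end with punctuation or quotation marks were never matched, and a straight quote in the list never matched the typographic quotes that OCR output often contains; the new construction handles both cases.

import re

terms = ['Smith & Co.', '"Project X"']

def build_pattern(term):
    # Same construction as the updated recogniser: escape the term, widen
    # straight quotes to typographic quotes, and wrap in lookarounds
    escaped = re.escape(term.strip()).replace('"', '(?:"|“|”)')
    return rf'(?<!\w){escaped}(?!\w)'

new_regex = '|'.join(build_pattern(term) for term in terms)
old_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}(?=\W|$)" for term in terms)

text = 'Contract between “Project X” and Smith & Co. dated 2024.'
flags = re.DOTALL | re.MULTILINE | re.IGNORECASE

print(re.findall(new_regex, text, flags))  # ['“Project X”', 'Smith & Co.']
print(re.findall(old_regex, text, flags))  # []: \b fails next to punctuation and quotes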