seanpedrickcase committed
Commit 6ac4be4 · Parent(s): 613b1b4

Greatly improved regex for direct matching with custom entities
app.py CHANGED
@@ -197,7 +197,7 @@ with app:
     # Object annotation
     with gr.Tab("Review redactions", id="tab_object_annotation"):
 
-        with gr.Accordion(label = "Review redaction file", open=True):
+        with gr.Accordion(label = "Review redaction file", open=False):
             output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
             upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
 
@@ -418,13 +418,13 @@ with app:
     app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
     # If running on AWS, load in the default allow list file from S3
-    if RUN_AWS_FUNCTIONS == "1":
-        print("default_allow_list_output_folder_location:", default_allow_list_loc)
-        if not os.path.exists(default_allow_list_loc):
-            app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
-            then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
-        else:
-            app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+    # if RUN_AWS_FUNCTIONS == "1":
+    #     print("default_allow_list_output_folder_location:", default_allow_list_loc)
+    #     if not os.path.exists(default_allow_list_loc):
+    #         app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
+    #         then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+    #     else:
+    #         app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
 
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
tools/custom_image_analyser_engine.py CHANGED
@@ -14,6 +14,7 @@ from tools.helper_functions import clean_unicode_text
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 from tools.load_spacy_model_custom_recognisers import custom_entities
 #import string # Import string to get a list of common punctuation characters
+import re # Add this import at the top of the file
 
 @dataclass
 class OCRResult:
@@ -493,11 +494,12 @@ class CustomImageAnalyzerEngine:
 
             elif pii_identification_method == "AWS Comprehend":
 
-                # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. Comprehend can't pick up Titles, Streetnames, and UKPostcodes specifically
+                # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. This is because Comprehend can't pick up Titles, Streetnames, and UKPostcodes, or a custom deny list specifically
                 text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
 
                 spacy_analyzer_result = self.analyzer_engine.analyze(
                     text=line_level_ocr_result.text, **text_analyzer_kwargs)
+
                 analyzer_results_by_line[i].extend(spacy_analyzer_result)
 
                 if len(line_level_ocr_result.text) >= 3:
@@ -573,7 +575,7 @@ class CustomImageAnalyzerEngine:
         for result in analyzer_result:
            # Extract the relevant portion of text based on start and end
            relevant_text = line_level_ocr_results[i].text[result.start:result.end]
-
+
            # Find the corresponding entry in ocr_results_with_children
            child_words = ocr_results_with_children_line_level['words']
 
@@ -583,13 +585,23 @@ class CustomImageAnalyzerEngine:
            word_num = 0 # Initialize word count
            total_width = 0 # Initialize total width
 
-           for word_text in relevant_text.split(): # Iterate through each word in relevant_text
-               #print("Looking for word_text:", word_text)
-               for word in child_words:
-                   #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
-                   if word_text in word['text']:
+           split_relevant_text = relevant_text.split()
+
+           loop_child_words = child_words.copy()
+
+           for word_text in split_relevant_text: # Iterate through each word in relevant_text
+
+               quote_str = '"'
+               replace_str = '(?:"|“|”)'
+
+               word_regex = rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'
+
+               for word in loop_child_words:
+                   # Check for regex as whole word
+
+                   if re.search(word_regex, word['text']):
+                       #if re.search(r'\b' + re.escape(word_text) + r'\b', word['text']):
                        found_word = word
-                       #print("found_word:", found_word)
 
                        if word_num == 0: # First word
                            left = found_word['bounding_box'][0]
@@ -598,6 +610,10 @@ class CustomImageAnalyzerEngine:
                        all_words += found_word['text'] + " " # Concatenate words
                        total_width = found_word['bounding_box'][2] - left # Add each word's width
                        word_num += 1
+
+                       # Drop the first word of child_words
+                       loop_child_words = loop_child_words[1:] # Skip the first word
+
                        break # Move to the next word in relevant_text
 
            width = total_width + horizontal_buffer # Set width to total width of all matched words
@@ -621,9 +637,9 @@ class CustomImageAnalyzerEngine:
            result_reset_pos.start = 0
            result_reset_pos.end = len(relevant_text)
 
-           #print("result_reset_pos:", result_reset_pos)
-           #print("relevant_line_ocr_result:", relevant_line_ocr_result)
-           #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
+           print("result_reset_pos:", result_reset_pos)
+           print("relevant_line_ocr_result:", relevant_line_ocr_result)
+           print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
 
            # Map the analyzer results to bounding boxes for this line
            line_results = self.map_analyzer_results_to_bounding_boxes(
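
As a quick illustration of what the new word-level matching does, here is a minimal standalone sketch: the helper function and sample tokens are invented, and the quote group assumes the `.replace()` step targets straight and curly double quotes.

import re

quote_str = '"'
replace_str = '(?:"|“|”)'  # assumed: straight and curly double-quote variants

def whole_word_regex(word_text: str) -> str:
    # Same construction as the new loop above: escape the token, let a
    # straight quote in the search term also match curly quotes, and
    # forbid word characters on either side of the match.
    return rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'

print(bool(re.search(whole_word_regex('Smith'), 'Smiths')))     # False - the old substring test said True
print(bool(re.search(whole_word_regex('Smith'), 'Smith,')))     # True - punctuation counts as a boundary
print(bool(re.search(whole_word_regex('"Smith"'), '“Smith”')))  # True - quote variants unified

Unlike the previous `word_text in word['text']` check, partial words no longer match, which stops a short custom entity from matching inside every longer OCR word that happens to contain it.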
tools/file_redaction.py CHANGED
@@ -760,8 +760,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
     # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
-            print("handwriting_recogniser_results:", handwriting_recogniser_results)
-
             merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))
 
         if "Redact all identified signatures" in handwrite_signature_checkbox:
@@ -972,9 +970,6 @@ def redact_image_pdf(file_path:str,
     print("Page range:", str(page_min + 1), "to", str(page_max))
     #print("Current_loop_page:", current_loop_page)
 
-    if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
-    elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
-
     # If running Textract, check if file already exists. If it does, load in existing data
     # Import results from json and convert
     if analysis_type == textract_option:
@@ -983,7 +978,6 @@ def redact_image_pdf(file_path:str,
         log_files_output_paths.append(json_file_path)
 
         if not os.path.exists(json_file_path):
-            no_textract_file = True
             print("No existing Textract results file found.")
             existing_data = {}
             #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
@@ -1041,12 +1035,8 @@ def redact_image_pdf(file_path:str,
 
         # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
         if analysis_type == tesseract_ocr_option:
-
            word_level_ocr_results = image_analyser.perform_ocr(image)
-
-           # Combine OCR results
            line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)
-
 
        # Import results from json and convert
        if analysis_type == textract_option:
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -24,15 +24,22 @@ except:
     print("Successfully downloaded and imported spaCy model", model_name)
 
 # #### Custom recognisers
-# Allow user to create their own recogniser
 def custom_word_list_recogniser(custom_list:List[str]=[]):
-    #custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}" for term in custom_list) + '\\b'
-    custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term.strip())}(?=\W|$)" for term in custom_list)
-    custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
-
-    #print("custom_pattern:", custom_pattern)
+    # Create regex pattern, handling quotes carefully
+
+    quote_str = '"'
+    replace_str = '(?:"|“|”)'
+
+    custom_regex = '|'.join(
+        rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
+        for term in custom_list
+    )
+    print(custom_regex)
+
+    custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
+
     custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
-                                          global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
+                                          global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
 
     return custom_recogniser
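
To see the rebuilt recogniser end to end, here is a minimal sketch. The deny list and sample text are invented, and it assumes a presidio-analyzer version whose PatternRecognizer accepts `global_regex_flags`, as the code above does.

import re
from presidio_analyzer import Pattern, PatternRecognizer

custom_list = ['Jane Smith', '"Project X"']  # made-up deny list

quote_str = '"'
replace_str = '(?:"|“|”)'  # assumed quote variants, as in the diff

custom_regex = '|'.join(
    rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
    for term in custom_list
)

custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score=1)
custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM",
                                      patterns=[custom_pattern],
                                      global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)

results = custom_recogniser.analyze('Contact jane smith about “Project X”.', entities=["CUSTOM"])
print(results)  # two CUSTOM hits: IGNORECASE catches 'jane smith', the quote group catches “Project X”

The switch from `'\\b' + ... + '(?=\W|$)'` to `(?<!\w)...(?!\w)` matters for terms that start or end with punctuation: `\b` only asserts a boundary between a word character and a non-word character, so it can never match immediately before a leading quote, whereas the negative lookarounds hold there.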