Commit · 59ff822
1 Parent(s): 8183bc4

Hopefully finally fixed the duplicate image_annotation_object issue

- tools/file_conversion.py +3 -3
- tools/file_redaction.py +48 -7
- tools/redaction_review.py +10 -5
tools/file_conversion.py
CHANGED

@@ -468,8 +468,8 @@ def prepare_image_or_pdf(
         converted_file_path = file_path
         image_file_paths = process_file(file_path, prepare_for_review)

-        #
-        if not all_annotations_object:
+        #Create base version of the annotation object that doesn't have any annotations in it
+        if (not all_annotations_object) & (prepare_for_review == True):
             all_annotations_object = []

             for image_path in image_file_paths:
@@ -478,7 +478,7 @@ def prepare_image_or_pdf(

                 all_annotations_object.append(annotation)

-
+        print("all_annotations_object:", all_annotations_object)


     elif is_pdf_or_image(file_path): # Alternatively, if it's an image
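A side note on the new guard in prepare_image_or_pdf: `(not all_annotations_object) & (prepare_for_review == True)` uses the bitwise `&`, which gives the same result here because both operands are plain booleans, though it evaluates both sides; the logical `and` is the short-circuiting, idiomatic equivalent. A minimal sketch (the values below are hypothetical; only the names are taken from the diff):

    all_annotations_object = []   # hypothetical: no annotations loaded yet
    prepare_for_review = True     # hypothetical flag value

    # Form used in the commit: bitwise & over two booleans
    if (not all_annotations_object) & (prepare_for_review == True):
        print("creating base annotation object")

    # Equivalent idiomatic form: logical and, short-circuits on the first False
    if not all_annotations_object and prepare_for_review:
        print("creating base annotation object")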
tools/file_redaction.py
CHANGED

@@ -1322,7 +1322,15 @@ def redact_image_pdf(file_path:str,
         images.append(image)
         pymupdf_doc = images

-        annotations_all_pages
+        # Check if the image already exists in annotations_all_pages
+        print("annotations_all_pages:", annotations_all_pages)
+        existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
+        if existing_index is not None:
+            # Replace the existing annotation
+            annotations_all_pages[existing_index] = image_annotations
+        else:
+            # Append new annotation if it doesn't exist
+            annotations_all_pages.append(image_annotations)

         if analysis_type == textract_option:
             # Write the updated existing textract data back to the JSON file
@@ -1337,7 +1345,15 @@ def redact_image_pdf(file_path:str,
         images.append(image)
         pymupdf_doc = images

-        annotations_all_pages
+        # Check if the image already exists in annotations_all_pages
+        print("annotations_all_pages:", annotations_all_pages)
+        existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
+        if existing_index is not None:
+            # Replace the existing annotation
+            annotations_all_pages[existing_index] = image_annotations
+        else:
+            # Append new annotation if it doesn't exist
+            annotations_all_pages.append(image_annotations)

         current_loop_page += 1

@@ -1871,6 +1887,8 @@ def redact_text_pdf(
         if chosen_redact_entities:
             if pii_identification_method == "Local":

+                #print("chosen_redact_entities:", chosen_redact_entities)
+
                 # Process immediately for local analysis
                 text_line_analyser_result = nlp_analyser.analyze(
                     text=text_line.text,
@@ -1881,12 +1899,15 @@ def redact_text_pdf(
                     allow_list=allow_list
                 )
                 all_text_line_results.append((i, text_line_analyser_result))
+
+                print("all_text_line_results:", all_text_line_results)

             elif pii_identification_method == "AWS Comprehend":

                 # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
                 custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]

+
                 text_line_analyser_result = nlp_analyser.analyze(
                     text=text_line.text,
                     language=language,
@@ -1984,13 +2005,19 @@ def redact_text_pdf(

             text_container_analyser_results.extend(text_line_analyser_result)
             text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
-
-
-
+
+            print("text_container_analyser_results:", text_container_analyser_results)
+
+            page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
+
+
+            print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)

         # Annotate redactions on page
         annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)

+        print("annotations_on_page:", annotations_on_page)
+
         # Make pymupdf page redactions
         #print("redact_whole_page_list:", redact_whole_page_list)
         if redact_whole_page_list:
@@ -2028,14 +2055,28 @@ def redact_text_pdf(
         progress.close(_tqdm=progress_bar)
         tqdm._instances.clear()

-        annotations_all_pages
+        # Check if the image already exists in annotations_all_pages
+        existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
+        if existing_index is not None:
+            # Replace the existing annotation
+            annotations_all_pages[existing_index] = image_annotations
+        else:
+            # Append new annotation if it doesn't exist
+            annotations_all_pages.append(image_annotations)

         current_loop_page += 1

         return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number


-    annotations_all_pages
+    # Check if the image already exists in annotations_all_pages
+    existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
+    if existing_index is not None:
+        # Replace the existing annotation
+        annotations_all_pages[existing_index] = image_annotations
+    else:
+        # Append new annotation if it doesn't exist
+        annotations_all_pages.append(image_annotations)

    current_loop_page += 1

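The same replace-or-append block now appears at four exit points across redact_image_pdf and redact_text_pdf. As a sketch only — the helper name upsert_image_annotation and the dictionary shapes below are assumptions, not code from this commit — the logic could be expressed once:

    def upsert_image_annotation(annotations_all_pages: list, image_annotations: dict) -> list:
        # Replace the entry for the same image if one exists, otherwise append it
        existing_index = next(
            (index for index, ann in enumerate(annotations_all_pages)
             if ann["image"] == image_annotations["image"]),
            None,
        )
        if existing_index is not None:
            annotations_all_pages[existing_index] = image_annotations
        else:
            annotations_all_pages.append(image_annotations)
        return annotations_all_pages

    # Usage: a second pass over the same page overwrites the earlier entry instead of duplicating it
    pages = []
    upsert_image_annotation(pages, {"image": "page_1.png", "boxes": []})
    upsert_image_annotation(pages, {"image": "page_1.png", "boxes": [{"label": "PERSON"}]})
    print(len(pages))  # 1

Kept this way or factored out, the effect is that annotations_all_pages holds at most one entry per image, which is the duplicate image_annotation_object behaviour the commit message targets.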
tools/redaction_review.py
CHANGED

@@ -76,6 +76,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
            recogniser_entities_drop = gr.Dropdown(value=recogniser_entities[0], choices=recogniser_entities, allow_custom_value=True, interactive=True)
        except Exception as e:
            print("Could not extract recogniser information:", e)
+            recogniser_dataframe_out = recogniser_dataframe_gr

    else:
        review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
@@ -139,24 +140,28 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
        for item in data:
            image_groups[item['image']].append(item)

-        # Process each group to
+        # Process each group to retain only the entry with non-empty boxes, if available
        result = []
        for image, items in image_groups.items():
            # Filter items with non-empty boxes
            non_empty_boxes = [item for item in items if item['boxes']]
            if non_empty_boxes:
-                #
+                # Keep the first entry with non-empty boxes
                result.append(non_empty_boxes[0])
            else:
-                # If
+                # If no non-empty boxes, keep the first item with empty boxes
                result.append(items[0])

+        #print("result:", result)
+
        return result
+
+    #print("image_annotator_object in update_annotator before function:", image_annotator_object)

    image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)

-    print("image_annotator_object in update_annotator:", image_annotator_object)
-    print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
+    #print("image_annotator_object in update_annotator after function:", image_annotator_object)
+    #print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])

    out_image_annotator = image_annotator(
        value = image_annotator_object[page_num_reported - 1],
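Pulled out of the hunk above for readability, remove_duplicate_images_with_blank_boxes reduces to a group-by-image step that prefers entries which actually carry boxes. A self-contained sketch of that rule (the defaultdict import and the sample annotations are assumptions; the body mirrors the diff):

    from collections import defaultdict

    def remove_duplicate_images_with_blank_boxes(data: list) -> list:
        image_groups = defaultdict(list)
        for item in data:
            image_groups[item['image']].append(item)

        result = []
        for image, items in image_groups.items():
            non_empty_boxes = [item for item in items if item['boxes']]
            if non_empty_boxes:
                result.append(non_empty_boxes[0])   # keep the first entry that has boxes
            else:
                result.append(items[0])             # otherwise keep the first blank entry
        return result

    annotations = [
        {"image": "page_1.png", "boxes": []},
        {"image": "page_1.png", "boxes": [{"label": "EMAIL"}]},
        {"image": "page_2.png", "boxes": []},
    ]
    print(remove_duplicate_images_with_blank_boxes(annotations))
    # one entry per image: page_1.png keeps its boxed entry, page_2.png keeps the blank one

update_annotator then indexes the de-duplicated list with page_num_reported - 1, so each page resolves to exactly one annotation object.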