Commit · 59ff822
1 Parent(s): 8183bc4

Hopefully finally fixed the duplicate image_annotation_object issue

- tools/file_conversion.py +3 -3
- tools/file_redaction.py +48 -7
- tools/redaction_review.py +10 -5
tools/file_conversion.py
CHANGED

@@ -468,8 +468,8 @@ def prepare_image_or_pdf(
         converted_file_path = file_path
         image_file_paths = process_file(file_path, prepare_for_review)

-        #
-        if not all_annotations_object:
+        #Create base version of the annotation object that doesn't have any annotations in it
+        if (not all_annotations_object) & (prepare_for_review == True):
             all_annotations_object = []

             for image_path in image_file_paths:
@@ -478,7 +478,7 @@ def prepare_image_or_pdf(

                 all_annotations_object.append(annotation)

-
+        print("all_annotations_object:", all_annotations_object)


     elif is_pdf_or_image(file_path): # Alternatively, if it's an image
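A side note on the new guard in prepare_image_or_pdf: `(not all_annotations_object) & (prepare_for_review == True)` uses the bitwise `&`, which gives the same result here because both operands are plain booleans, though it evaluates both sides; the logical `and` is the short-circuiting, idiomatic equivalent. A minimal sketch (the values below are hypothetical; only the names are taken from the diff):

    all_annotations_object = []   # hypothetical: no annotations loaded yet
    prepare_for_review = True     # hypothetical flag value

    # Form used in the commit: bitwise & over two booleans
    if (not all_annotations_object) & (prepare_for_review == True):
        print("creating base annotation object")

    # Equivalent idiomatic form: logical and, short-circuits on the first False
    if not all_annotations_object and prepare_for_review:
        print("creating base annotation object")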
tools/file_redaction.py
CHANGED

@@ -1322,7 +1322,15 @@ def redact_image_pdf(file_path:str,
         images.append(image)
         pymupdf_doc = images

-        annotations_all_pages
+        # Check if the image already exists in annotations_all_pages
+        print("annotations_all_pages:", annotations_all_pages)
+        existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
+        if existing_index is not None:
+            # Replace the existing annotation
+            annotations_all_pages[existing_index] = image_annotations
+        else:
+            # Append new annotation if it doesn't exist
+            annotations_all_pages.append(image_annotations)

         if analysis_type == textract_option:
             # Write the updated existing textract data back to the JSON file
@@ -1337,7 +1345,15 @@ def redact_image_pdf(file_path:str,
         images.append(image)
         pymupdf_doc = images

-        annotations_all_pages
+        # Check if the image already exists in annotations_all_pages
+        print("annotations_all_pages:", annotations_all_pages)
+        existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
+        if existing_index is not None:
+            # Replace the existing annotation
+            annotations_all_pages[existing_index] = image_annotations
+        else:
+            # Append new annotation if it doesn't exist
+            annotations_all_pages.append(image_annotations)

         current_loop_page += 1

@@ -1871,6 +1887,8 @@ def redact_text_pdf(
         if chosen_redact_entities:
             if pii_identification_method == "Local":

+                #print("chosen_redact_entities:", chosen_redact_entities)
+
                 # Process immediately for local analysis
                 text_line_analyser_result = nlp_analyser.analyze(
                     text=text_line.text,
@@ -1881,12 +1899,15 @@ def redact_text_pdf(
                     allow_list=allow_list
                 )
                 all_text_line_results.append((i, text_line_analyser_result))
+
+                print("all_text_line_results:", all_text_line_results)

             elif pii_identification_method == "AWS Comprehend":

                 # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
                 custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]

+
                 text_line_analyser_result = nlp_analyser.analyze(
                     text=text_line.text,
                     language=language,
@@ -1984,13 +2005,19 @@ def redact_text_pdf(

             text_container_analyser_results.extend(text_line_analyser_result)
             text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
-
-
-
+
+            print("text_container_analyser_results:", text_container_analyser_results)
+
+            page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
+
+
+            print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)

         # Annotate redactions on page
         annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)

+        print("annotations_on_page:", annotations_on_page)
+
         # Make pymupdf page redactions
         #print("redact_whole_page_list:", redact_whole_page_list)
         if redact_whole_page_list:
@@ -2028,14 +2055,28 @@ def redact_text_pdf(
         progress.close(_tqdm=progress_bar)
         tqdm._instances.clear()

-        annotations_all_pages
+        # Check if the image already exists in annotations_all_pages
+        existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
+        if existing_index is not None:
+            # Replace the existing annotation
+            annotations_all_pages[existing_index] = image_annotations
+        else:
+            # Append new annotation if it doesn't exist
+            annotations_all_pages.append(image_annotations)

         current_loop_page += 1

         return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number


-    annotations_all_pages
+    # Check if the image already exists in annotations_all_pages
+    existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
+    if existing_index is not None:
+        # Replace the existing annotation
+        annotations_all_pages[existing_index] = image_annotations
+    else:
+        # Append new annotation if it doesn't exist
+        annotations_all_pages.append(image_annotations)

    current_loop_page += 1

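The same replace-or-append block now appears at four exit points across redact_image_pdf and redact_text_pdf. As a sketch only — the helper name upsert_image_annotation and the dictionary shapes below are assumptions, not code from this commit — the logic could be expressed once:

    def upsert_image_annotation(annotations_all_pages: list, image_annotations: dict) -> list:
        # Replace the entry for the same image if one exists, otherwise append it
        existing_index = next(
            (index for index, ann in enumerate(annotations_all_pages)
             if ann["image"] == image_annotations["image"]),
            None,
        )
        if existing_index is not None:
            annotations_all_pages[existing_index] = image_annotations
        else:
            annotations_all_pages.append(image_annotations)
        return annotations_all_pages

    # Usage: a second pass over the same page overwrites the earlier entry instead of duplicating it
    pages = []
    upsert_image_annotation(pages, {"image": "page_1.png", "boxes": []})
    upsert_image_annotation(pages, {"image": "page_1.png", "boxes": [{"label": "PERSON"}]})
    print(len(pages))  # 1

Kept this way or factored out, the effect is that annotations_all_pages holds at most one entry per image, which is the duplicate image_annotation_object behaviour the commit message targets.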
tools/redaction_review.py
CHANGED

@@ -76,6 +76,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
            recogniser_entities_drop = gr.Dropdown(value=recogniser_entities[0], choices=recogniser_entities, allow_custom_value=True, interactive=True)
        except Exception as e:
            print("Could not extract recogniser information:", e)
+            recogniser_dataframe_out = recogniser_dataframe_gr

    else:
        review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
@@ -139,24 +140,28 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
        for item in data:
            image_groups[item['image']].append(item)

-        # Process each group to
+        # Process each group to retain only the entry with non-empty boxes, if available
        result = []
        for image, items in image_groups.items():
            # Filter items with non-empty boxes
            non_empty_boxes = [item for item in items if item['boxes']]
            if non_empty_boxes:
-                #
+                # Keep the first entry with non-empty boxes
                result.append(non_empty_boxes[0])
            else:
-                # If
+                # If no non-empty boxes, keep the first item with empty boxes
                result.append(items[0])

+        #print("result:", result)
+
        return result
+
+    #print("image_annotator_object in update_annotator before function:", image_annotator_object)

    image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)

-    print("image_annotator_object in update_annotator:", image_annotator_object)
-    print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
+    #print("image_annotator_object in update_annotator after function:", image_annotator_object)
+    #print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])

    out_image_annotator = image_annotator(
        value = image_annotator_object[page_num_reported - 1],
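Pulled out of the hunk above for readability, remove_duplicate_images_with_blank_boxes reduces to a group-by-image step that prefers entries which actually carry boxes. A self-contained sketch of that rule (the defaultdict import and the sample annotations are assumptions; the body mirrors the diff):

    from collections import defaultdict

    def remove_duplicate_images_with_blank_boxes(data: list) -> list:
        image_groups = defaultdict(list)
        for item in data:
            image_groups[item['image']].append(item)

        result = []
        for image, items in image_groups.items():
            non_empty_boxes = [item for item in items if item['boxes']]
            if non_empty_boxes:
                result.append(non_empty_boxes[0])   # keep the first entry that has boxes
            else:
                result.append(items[0])             # otherwise keep the first blank entry
        return result

    annotations = [
        {"image": "page_1.png", "boxes": []},
        {"image": "page_1.png", "boxes": [{"label": "EMAIL"}]},
        {"image": "page_2.png", "boxes": []},
    ]
    print(remove_duplicate_images_with_blank_boxes(annotations))
    # one entry per image: page_1.png keeps its boxed entry, page_2.png keeps the blank one

update_annotator then indexes the de-duplicated list with page_num_reported - 1, so each page resolves to exactly one annotation object.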