seanpedrickcase committed on
Commit
59ff822
·
1 Parent(s): 8183bc4

Hopefully finally fixed the duplicate image_annotation_object issue

Browse files
tools/file_conversion.py CHANGED
@@ -468,8 +468,8 @@ def prepare_image_or_pdf(
468
  converted_file_path = file_path
469
  image_file_paths = process_file(file_path, prepare_for_review)
470
 
471
- # Create base version of the annotation object that doesn't have any annotations in it
472
- if not all_annotations_object:
473
  all_annotations_object = []
474
 
475
  for image_path in image_file_paths:
@@ -478,7 +478,7 @@ def prepare_image_or_pdf(
478
 
479
  all_annotations_object.append(annotation)
480
 
481
- #print("all_annotations_object:", all_annotations_object)
482
 
483
 
484
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
 
468
  converted_file_path = file_path
469
  image_file_paths = process_file(file_path, prepare_for_review)
470
 
471
+ #Create base version of the annotation object that doesn't have any annotations in it
472
+ if (not all_annotations_object) & (prepare_for_review == True):
473
  all_annotations_object = []
474
 
475
  for image_path in image_file_paths:
 
478
 
479
  all_annotations_object.append(annotation)
480
 
481
+ print("all_annotations_object:", all_annotations_object)
482
 
483
 
484
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
tools/file_redaction.py CHANGED
@@ -1322,7 +1322,15 @@ def redact_image_pdf(file_path:str,
1322
  images.append(image)
1323
  pymupdf_doc = images
1324
 
1325
- annotations_all_pages.append(image_annotations)
 
 
 
 
 
 
 
 
1326
 
1327
  if analysis_type == textract_option:
1328
  # Write the updated existing textract data back to the JSON file
@@ -1337,7 +1345,15 @@ def redact_image_pdf(file_path:str,
1337
  images.append(image)
1338
  pymupdf_doc = images
1339
 
1340
- annotations_all_pages.append(image_annotations)
 
 
 
 
 
 
 
 
1341
 
1342
  current_loop_page += 1
1343
 
@@ -1871,6 +1887,8 @@ def redact_text_pdf(
1871
  if chosen_redact_entities:
1872
  if pii_identification_method == "Local":
1873
 
 
 
1874
  # Process immediately for local analysis
1875
  text_line_analyser_result = nlp_analyser.analyze(
1876
  text=text_line.text,
@@ -1881,12 +1899,15 @@ def redact_text_pdf(
1881
  allow_list=allow_list
1882
  )
1883
  all_text_line_results.append((i, text_line_analyser_result))
 
 
1884
 
1885
  elif pii_identification_method == "AWS Comprehend":
1886
 
1887
  # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
1888
  custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
1889
 
 
1890
  text_line_analyser_result = nlp_analyser.analyze(
1891
  text=text_line.text,
1892
  language=language,
@@ -1984,13 +2005,19 @@ def redact_text_pdf(
1984
 
1985
  text_container_analyser_results.extend(text_line_analyser_result)
1986
  text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1987
-
1988
- page_analyser_results.extend(text_container_analyser_results)
1989
- page_analysed_bounding_boxes.extend(text_container_analysed_bounding_boxes)
 
 
 
 
1990
 
1991
  # Annotate redactions on page
1992
  annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1993
 
 
 
1994
  # Make pymupdf page redactions
1995
  #print("redact_whole_page_list:", redact_whole_page_list)
1996
  if redact_whole_page_list:
@@ -2028,14 +2055,28 @@ def redact_text_pdf(
2028
  progress.close(_tqdm=progress_bar)
2029
  tqdm._instances.clear()
2030
 
2031
- annotations_all_pages.append(image_annotations)
 
 
 
 
 
 
 
2032
 
2033
  current_loop_page += 1
2034
 
2035
  return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2036
 
2037
 
2038
- annotations_all_pages.append(image_annotations)
 
 
 
 
 
 
 
2039
 
2040
  current_loop_page += 1
2041
 
 
1322
  images.append(image)
1323
  pymupdf_doc = images
1324
 
1325
+ # Check if the image already exists in annotations_all_pages
1326
+ print("annotations_all_pages:", annotations_all_pages)
1327
+ existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
1328
+ if existing_index is not None:
1329
+ # Replace the existing annotation
1330
+ annotations_all_pages[existing_index] = image_annotations
1331
+ else:
1332
+ # Append new annotation if it doesn't exist
1333
+ annotations_all_pages.append(image_annotations)
1334
 
1335
  if analysis_type == textract_option:
1336
  # Write the updated existing textract data back to the JSON file
 
1345
  images.append(image)
1346
  pymupdf_doc = images
1347
 
1348
+ # Check if the image already exists in annotations_all_pages
1349
+ print("annotations_all_pages:", annotations_all_pages)
1350
+ existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
1351
+ if existing_index is not None:
1352
+ # Replace the existing annotation
1353
+ annotations_all_pages[existing_index] = image_annotations
1354
+ else:
1355
+ # Append new annotation if it doesn't exist
1356
+ annotations_all_pages.append(image_annotations)
1357
 
1358
  current_loop_page += 1
1359
 
 
1887
  if chosen_redact_entities:
1888
  if pii_identification_method == "Local":
1889
 
1890
+ #print("chosen_redact_entities:", chosen_redact_entities)
1891
+
1892
  # Process immediately for local analysis
1893
  text_line_analyser_result = nlp_analyser.analyze(
1894
  text=text_line.text,
 
1899
  allow_list=allow_list
1900
  )
1901
  all_text_line_results.append((i, text_line_analyser_result))
1902
+
1903
+ print("all_text_line_results:", all_text_line_results)
1904
 
1905
  elif pii_identification_method == "AWS Comprehend":
1906
 
1907
  # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
1908
  custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
1909
 
1910
+
1911
  text_line_analyser_result = nlp_analyser.analyze(
1912
  text=text_line.text,
1913
  language=language,
 
2005
 
2006
  text_container_analyser_results.extend(text_line_analyser_result)
2007
  text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
2008
+
2009
+ print("text_container_analyser_results:", text_container_analyser_results)
2010
+
2011
+ page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
2012
+
2013
+
2014
+ print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
2015
 
2016
  # Annotate redactions on page
2017
  annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
2018
 
2019
+ print("annotations_on_page:", annotations_on_page)
2020
+
2021
  # Make pymupdf page redactions
2022
  #print("redact_whole_page_list:", redact_whole_page_list)
2023
  if redact_whole_page_list:
 
2055
  progress.close(_tqdm=progress_bar)
2056
  tqdm._instances.clear()
2057
 
2058
+ # Check if the image already exists in annotations_all_pages
2059
+ existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
2060
+ if existing_index is not None:
2061
+ # Replace the existing annotation
2062
+ annotations_all_pages[existing_index] = image_annotations
2063
+ else:
2064
+ # Append new annotation if it doesn't exist
2065
+ annotations_all_pages.append(image_annotations)
2066
 
2067
  current_loop_page += 1
2068
 
2069
  return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2070
 
2071
 
2072
+ # Check if the image already exists in annotations_all_pages
2073
+ existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
2074
+ if existing_index is not None:
2075
+ # Replace the existing annotation
2076
+ annotations_all_pages[existing_index] = image_annotations
2077
+ else:
2078
+ # Append new annotation if it doesn't exist
2079
+ annotations_all_pages.append(image_annotations)
2080
 
2081
  current_loop_page += 1
2082
 
tools/redaction_review.py CHANGED
@@ -76,6 +76,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
76
  recogniser_entities_drop = gr.Dropdown(value=recogniser_entities[0], choices=recogniser_entities, allow_custom_value=True, interactive=True)
77
  except Exception as e:
78
  print("Could not extract recogniser information:", e)
 
79
 
80
  else:
81
  review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
@@ -139,24 +140,28 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
139
  for item in data:
140
  image_groups[item['image']].append(item)
141
 
142
- # Process each group to remove duplicates
143
  result = []
144
  for image, items in image_groups.items():
145
  # Filter items with non-empty boxes
146
  non_empty_boxes = [item for item in items if item['boxes']]
147
  if non_empty_boxes:
148
- # Add only the first one with non-empty boxes
149
  result.append(non_empty_boxes[0])
150
  else:
151
- # If all boxes are empty, add the first one
152
  result.append(items[0])
153
 
 
 
154
  return result
 
 
155
 
156
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
157
 
158
- print("image_annotator_object in update_annotator:", image_annotator_object)
159
- print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
160
 
161
  out_image_annotator = image_annotator(
162
  value = image_annotator_object[page_num_reported - 1],
 
76
  recogniser_entities_drop = gr.Dropdown(value=recogniser_entities[0], choices=recogniser_entities, allow_custom_value=True, interactive=True)
77
  except Exception as e:
78
  print("Could not extract recogniser information:", e)
79
+ recogniser_dataframe_out = recogniser_dataframe_gr
80
 
81
  else:
82
  review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
 
140
  for item in data:
141
  image_groups[item['image']].append(item)
142
 
143
+ # Process each group to retain only the entry with non-empty boxes, if available
144
  result = []
145
  for image, items in image_groups.items():
146
  # Filter items with non-empty boxes
147
  non_empty_boxes = [item for item in items if item['boxes']]
148
  if non_empty_boxes:
149
+ # Keep the first entry with non-empty boxes
150
  result.append(non_empty_boxes[0])
151
  else:
152
+ # If no non-empty boxes, keep the first item with empty boxes
153
  result.append(items[0])
154
 
155
+ #print("result:", result)
156
+
157
  return result
158
+
159
+ #print("image_annotator_object in update_annotator before function:", image_annotator_object)
160
 
161
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
162
 
163
+ #print("image_annotator_object in update_annotator after function:", image_annotator_object)
164
+ #print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
165
 
166
  out_image_annotator = image_annotator(
167
  value = image_annotator_object[page_num_reported - 1],