seanpedrickcase committed
Commit 5b4b5fb · 1 Parent(s): a680619

Upgraded packages. Fixed some issues with the review process. Better progress reporting for the user.

DocRedactApp_0.1.spec DELETED
@@ -1,52 +0,0 @@
-# -*- mode: python ; coding: utf-8 -*-
-from PyInstaller.utils.hooks import collect_data_files
-
-datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
-datas += collect_data_files('gradio_client')
-datas += collect_data_files('gradio')
-
-
-a = Analysis(
-    ['app.py'],
-    pathex=[],
-    binaries=[],
-    datas=datas,
-    hiddenimports=['pyarrow.vendored.version', 'pydicom.encoders'],
-    hookspath=['build_deps'],
-    hooksconfig={},
-    runtime_hooks=[],
-    excludes=[],
-    noarchive=False,
-    optimize=0,
-    module_collection_mode={
-        'gradio': 'py',  # Collect gradio package as source .py files
-    }
-)
-pyz = PYZ(a.pure)
-
-exe = EXE(
-    pyz,
-    a.scripts,
-    [],
-    exclude_binaries=True,
-    name='DocRedactApp_0.1',
-    debug=False,
-    bootloader_ignore_signals=False,
-    strip=False,
-    upx=True,
-    console=True,
-    disable_windowed_traceback=False,
-    argv_emulation=False,
-    target_arch=None,
-    codesign_identity=None,
-    entitlements_file=None,
-)
-coll = COLLECT(
-    exe,
-    a.binaries,
-    a.datas,
-    strip=False,
-    upx=True,
-    upx_exclude=[],
-    name='DocRedactApp_0.1',
-)
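
The removed spec built a standalone executable that shipped Tesseract and Poppler beside the app and pulled Gradio's non-Python assets in via PyInstaller's collect_data_files hook utility. For anyone still building locally, a minimal sketch of that data-collection step (paths copied from the old spec; they assume tesseract/ and poppler/ folders sit next to app.py):

# Sketch of the data-collection step from the removed spec file.
# collect_data_files returns (source, destination) tuples for a package's
# non-code assets; the tuples below feed the Analysis(...) block of a spec.
from PyInstaller.utils.hooks import collect_data_files

datas = [('tesseract/', 'tesseract/'),
         ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
datas += collect_data_files('gradio_client')  # gradio_client JSON/templates
datas += collect_data_files('gradio')         # Gradio frontend assets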
app.py CHANGED
@@ -76,7 +76,7 @@ with app:
     data_file_name_textbox = gr.Textbox(value="", visible=False)
     s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
     estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
-    annotate_previous_page = gr.Number(value=1, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
+    annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on


     ###
@@ -121,7 +121,7 @@ with app:

     with gr.Row():
         annotation_last_page_button = gr.Button("Previous page")
-        annotate_current_page = gr.Number(value=1, label="Current page", precision=0)
+        annotate_current_page = gr.Number(value=1, label="Current page (select page number then press enter)", precision=0)

         annotation_next_page_button = gr.Button("Next page")

@@ -131,8 +131,10 @@ with app:
         label="Modify redaction boxes",
         label_list=["Redaction"],
         label_colors=[(0, 0, 0)],
+        show_label=False,
         sources=None,#["upload"],
         show_clear_button=False,
+        show_share_button=False,
         show_remove_button=False,
         interactive=False
     )
@@ -216,12 +218,14 @@ with app:
     then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page]).\
     then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])

-    annotate_current_page.change(
+    annotate_current_page.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page]).\
         then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])

-    annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page])
-    annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page])
+    annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
+    annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])

     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
     annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
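
The key behavioural change above is swapping annotate_current_page.change(...) for annotate_current_page.submit(...), so the page only reloads once the user confirms a number with Enter rather than on every keystroke, and chaining update_annotator onto the previous/next buttons so the annotator redraws after each navigation. A minimal, self-contained sketch of that Gradio wiring pattern (the component names and clamp_page helper here are illustrative, not the app's real ones):

import gradio as gr

def clamp_page(page, max_pages=10):
    # Keep the requested page inside [1, max_pages]; 10 pages is an assumption.
    return min(max(int(page or 1), 1), max_pages)

def show(p):
    return f"Page {p}"

with gr.Blocks() as demo:
    page = gr.Number(value=1, label="Current page (select page number then press enter)", precision=0)
    prev_btn = gr.Button("Previous page")
    next_btn = gr.Button("Next page")
    status = gr.Textbox(label="Showing page")

    # .submit fires only on Enter, unlike .change which fires on every edit
    page.submit(clamp_page, inputs=page, outputs=page).\
        then(show, inputs=page, outputs=status)

    # Chain the redraw after each click, as the commit does with update_annotator
    prev_btn.click(lambda p: clamp_page(p - 1), inputs=page, outputs=page).\
        then(show, inputs=page, outputs=status)
    next_btn.click(lambda p: clamp_page(p + 1), inputs=page, outputs=page).\
        then(show, inputs=page, outputs=status)

if __name__ == "__main__":
    demo.launch()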
redaction_review.py DELETED
@@ -1,88 +0,0 @@
-import gradio as gr
-from gradio_image_annotation import image_annotator
-from gradio_image_annotation.image_annotator import AnnotatedImageData
-
-from tools.file_conversion import is_pdf, convert_pdf_to_images
-from tools.helper_functions import get_file_path_end, output_folder
-from tools.file_redaction import redact_page_with_pymupdf
-import json
-import pymupdf
-from PIL import ImageDraw, Image
-
-file_path = "output/page_as_img_example_complaint_letter_pages_1.png"
-#file_path = "examples/graduate-job-example-cover-letter.pdf"
-
-
-if is_pdf(file_path):
-    images = convert_pdf_to_images(file_path)
-    image = images[0]
-    doc = pymupdf.open(file_path)
-else:
-    doc = []
-
-with open('output/gradio_annotation_boxes.json', 'r') as f:
-    gradio_annotation_boxes = json.load(f)
-
-example_annotation = {
-    "image": file_path,
-    "boxes": gradio_annotation_boxes
-}
-
-def apply_redactions(image_annotated:AnnotatedImageData, file_path:str, doc=[]):
-    #print(image_annotated['image'])
-
-    file_base = get_file_path_end(file_path)
-
-    image = Image.fromarray(image_annotated['image'].astype('uint8'))
-
-    draw = ImageDraw.Draw(image)
-
-    if is_pdf(file_path) == False:
-        for img_annotation_box in image_annotated['boxes']:
-            coords = [img_annotation_box["xmin"],
-                      img_annotation_box["ymin"],
-                      img_annotation_box["xmax"],
-                      img_annotation_box["ymax"]]
-
-            fill = img_annotation_box["color"]
-
-            draw.rectangle(coords, fill=fill)
-
-        image.save(output_folder + file_base + "_additional.png")
-
-    # If it's a pdf, assume a doc object is available
-    else:
-        doc = redact_page_with_pymupdf(doc, image_annotated, 1, image)
-
-
-def crop(annotations):
-    if annotations["boxes"]:
-        box = annotations["boxes"][0]
-        return annotations["image"][
-            box["ymin"]:box["ymax"],
-            box["xmin"]:box["xmax"]
-        ]
-    return None
-
-def get_boxes_json(annotations):
-    return annotations["boxes"]
-
-with gr.Blocks() as demo:
-    with gr.Tab("Object annotation", id="tab_object_annotation"):
-
-        doc_state = gr.State(doc)
-
-        file_path_textbox = gr.Textbox(value=file_path)
-        annotator = image_annotator(
-            example_annotation,
-            label_list=["Redaction"],
-            label_colors=[(0, 0, 0)],
-        )
-        button_get = gr.Button("Get bounding boxes")
-        button_apply = gr.Button("Apply redactions")
-        json_boxes = gr.JSON()
-        button_get.click(get_boxes_json, annotator, json_boxes)
-        button_apply.click(apply_redactions, inputs=[annotator, file_path_textbox, doc_state])
-
-if __name__ == "__main__":
-    demo.launch(inbrowser=True)
requirements.txt CHANGED
@@ -1,17 +1,17 @@
 pdfminer.six==20231228
 pdf2image==1.17.0
 pymupdf==1.24.10
-opencv-python==4.9.0.80
+opencv-python==4.10.0.84
 presidio_analyzer==2.2.355
 presidio_anonymizer==2.2.355
 presidio-image-redactor==0.0.53
 pikepdf==8.15.1
-pandas==2.2.2
-spacy==3.7.5
-en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
-gradio>=4.26.0
-boto3==1.34.158
-pyarrow==14.0.2
+pandas==2.2.3
+spacy==3.8.2
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+gradio==4.44.1
+boto3==1.35.40
+pyarrow==17.0.0
 openpyxl==3.1.2
 Faker==22.2.0
 gradio_image_annotation==0.2.3
tools/aws_textract.py CHANGED
@@ -158,7 +158,7 @@ def json_to_ocrresult(json_data, page_width, page_height):

             handwriting.append(recogniser_result)

-            print("Handwriting found:", handwriting[-1])
+            #print("Handwriting found:", handwriting[-1])

         # If handwriting or signature, add to bounding box

@@ -173,7 +173,7 @@ def json_to_ocrresult(json_data, page_width, page_height):
             recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=line_left, top=line_top, width=width_abs, height=height_abs)

             signatures.append(recogniser_result)
-            print("Signature found:", signatures[-1])
+            #print("Signature found:", signatures[-1])

         words = []
         words.append({
tools/file_conversion.py CHANGED
@@ -49,8 +49,8 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
     images = []

     # Open the PDF file
-    #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
-    for page_num in range(page_min,page_count): #progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
+    #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
+    for page_num in progress.tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):

        print("Converting page: ", str(page_num + 1))

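This is the pattern behind the "better progress reporting" in the commit message: wrapping the page loop in progress.tqdm so Gradio renders a live progress bar instead of running silently. A minimal sketch of the same idiom, assuming Gradio's Progress API (the prepare_pages function and its sleep are stand-ins for the real per-page work):

import time
import gradio as gr

def prepare_pages(n_pages, progress=gr.Progress(track_tqdm=True)):
    # progress.tqdm mirrors tqdm's API but also drives Gradio's progress bar
    for page_num in progress.tqdm(range(int(n_pages)), unit="pages", desc="Preparing pages"):
        time.sleep(0.1)  # stand-in for the per-page conversion work
    return f"Prepared {int(n_pages)} pages"

with gr.Blocks() as demo:
    n = gr.Number(value=20, precision=0, label="Pages")
    out = gr.Textbox(label="Result")
    gr.Button("Run").click(prepare_pages, inputs=n, outputs=out)

if __name__ == "__main__":
    demo.launch()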
tools/file_redaction.py CHANGED
@@ -3,7 +3,8 @@ import re
 import json
 import io
 import os
-from PIL import Image, ImageChops, ImageFile
+import boto3
+from PIL import Image, ImageChops, ImageFile, ImageDraw
 ImageFile.LOAD_TRUNCATED_IMAGES = True

 from typing import List, Dict, Tuple
@@ -118,6 +119,16 @@ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[s
         return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations

     if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
+
+        if in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
+            # Try accessing Textract through boto3
+            try:
+                boto3.client('textract')
+            except:
+                out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
+                print(out_message)
+                return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
+
         #Analyse and redact image-based pdf or image
         if is_pdf_or_image(file_path) == False:
             out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
@@ -477,17 +488,17 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
     # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
-            print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
+            #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
             bboxes.extend(handwriting_recogniser_results)

         if "Redact all identified signatures" in handwrite_signature_checkbox:
-            print("Signature boxes exist at merge:", signature_recogniser_results)
+            #print("Signature boxes exist at merge:", signature_recogniser_results)
            bboxes.extend(signature_recogniser_results)

     # Reconstruct bounding boxes for substrings of interest
     reconstructed_bboxes = []
     for bbox in bboxes:
-        print("bbox:", bbox)
+        #print("bbox:", bbox)
         bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
         for line_text, line_info in combined_results.items():
             line_box = line_info['bounding_box']
@@ -636,33 +647,37 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
     if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
     elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"

-    for i in range(0, number_of_pages):
+    for page_no in progress.tqdm(range(0, number_of_pages), unit="pages", desc="Redacting pages"):
+    #for page_no in range(0, number_of_pages):
         handwriting_or_signature_boxes = []
         signature_recogniser_results = []
         handwriting_recogniser_results = []
+


-        # Assuming prepared_pdf_file_paths[i] is your PIL image object
+        # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
         try:
-            image = prepared_pdf_file_paths[i]#.copy()
-            print("image:", image)
+            image = prepared_pdf_file_paths[page_no]#.copy()
+            #print("image:", image)
         except Exception as e:
             print("Could not redact page:", reported_page_number, "due to:")
             print(e)
+
             continue

-        image_annotations = {"image": image, "boxes": []}
+        image_annotations = {"image": image, "boxes": []}
+
+
+        pymupdf_page = pymupdf_doc.load_page(page_no)

         #try:
-        print("prepared_pdf_file_paths:", prepared_pdf_file_paths)
+        #print("prepared_pdf_file_paths:", prepared_pdf_file_paths)

-        if i >= page_min and i < page_max:
-
-            reported_page_number = str(i + 1)
-
-            print("Redacting page", reported_page_number)
-
-            pymupdf_page = pymupdf_doc.load_page(i)
+        if page_no >= page_min and page_no < page_max:
+
+            reported_page_number = str(page_no + 1)
+
+            print("Redacting page", reported_page_number)

             # Need image size to convert textract OCR outputs to the correct sizes
             page_width, page_height = image.size
@@ -811,6 +826,8 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:

         all_image_annotations.append(image_annotations)

+        #print("\nall_image_annotations for page", str(page_no), "are:", all_image_annotations)
+
     all_line_level_ocr_results_df.to_csv(ocr_results_file_path)
     logging_file_paths.append(ocr_results_file_path)

@@ -849,8 +866,6 @@ def analyse_text_container(text_container:OCRResult, language:str, chosen_redact
         score_threshold=score_threshold,
         return_decision_process=True,
         allow_list=allow_list)
-
-    print(analyser_results)

     return analyser_results

@@ -1097,8 +1112,10 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
     else: page_min = page_min - 1

     print("Page range is",str(page_min + 1), "to", str(page_max))
-
-    for page_no in range(0, number_of_pages): #range(page_min, page_max):
+
+    #for page_no in range(0, number_of_pages):
+    for page_no in progress.tqdm(range(0, number_of_pages), unit="pages", desc="Redacting pages"):
+
         #print("prepared_pdf_image_path:", prepared_pdf_image_path)
         #print("prepared_pdf_image_path[page_no]:", prepared_pdf_image_path[page_no])
         image = prepared_pdf_image_path[page_no]
@@ -1150,23 +1167,23 @@

         # Analyse each line of text in turn for PII and add to list
         for i, text_line in enumerate(line_level_text_results_list):
-            text_line_analyzer_result = []
+            text_line_analyser_result = []
             text_line_bounding_boxes = []

             #print("text_line:", text_line.text)

-            text_line_analyzer_result = analyse_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
+            text_line_analyser_result = analyse_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)

             # Merge bounding boxes for the line if multiple found close together
-            if text_line_analyzer_result:
+            if text_line_analyser_result:
                 # Merge bounding boxes if very close together
                 #print("text_line_bounding_boxes:", text_line_bounding_boxes)
                 #print("line_characters:")
                 #print(line_characters[i])
                 #print("".join(char._text for char in line_characters[i]))
-                text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyzer_result, line_characters[i], combine_pixel_dist, vertical_padding = 0)
+                text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i], combine_pixel_dist, vertical_padding = 0)

-            text_container_analyser_results.extend(text_line_analyzer_result)
+            text_container_analyser_results.extend(text_line_analyser_result)
             text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)

         #print("\n FINAL text_container_analyser_results:", text_container_analyser_results)
@@ -1188,7 +1205,7 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho

         annotations_all_pages.extend([annotations_on_page])

-        print("For page number:", page_no, "there are", len(annotations_all_pages[page_num]), "annotations")
+        print("For page number:", page_no, "there are", len(image_annotations["boxes"]), "annotations")

     # Write logs
     # Create decision process table
@@ -1203,5 +1220,7 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
         page_text_outputs_all_pages = pd.concat([page_text_outputs_all_pages, page_text_outputs])

     all_image_annotations.append(image_annotations)
+
+    #print("all_image_annotations:", all_image_annotations)

     return pymupdf_doc, decision_process_table_all_pages, page_text_outputs_all_pages, all_image_annotations
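
One caveat on the new Textract guard above: boto3.client('textract') only constructs a client, so the bare try/except mainly catches a missing region or credential configuration rather than real connectivity. A slightly stricter sketch of the same pre-flight check, assuming botocore's standard exceptions (this is a suggested variant, not what the commit ships):

import boto3
from botocore.exceptions import BotoCoreError, ClientError

def textract_available() -> bool:
    # Building the client only verifies that a region and credential chain
    # can be resolved; it makes no network call.
    try:
        boto3.client('textract')
    except BotoCoreError:
        return False
    # A cheap STS call confirms the credentials actually resolve.
    try:
        boto3.client('sts').get_caller_identity()
    except (ClientError, BotoCoreError):
        return False
    return True

if not textract_available():
    print("Cannot connect to AWS Textract. Please choose another redaction method.")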
tools/redaction_review.py CHANGED
@@ -38,18 +38,23 @@ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
     return max_pages

 def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
-    #print("\nImage annotator object:", image_annotator_object[0])
+    # print("\nImage annotator object:", image_annotator_object)

     if not image_annotator_object:
         return image_annotator(
             label="Modify redaction boxes",
             #label_list=["Redaction"],
             #label_colors=[(0, 0, 0)],
+            show_label=False,
             sources=["upload"],
             show_clear_button=False,
+            show_share_button=False,
             show_remove_button=False,
             interactive=False
-        ), gr.Number(label = "Current page", value=1, precision=0)
+        ), gr.Number(label = "Current page (select page number then press enter)", value=1, precision=0)
+
+    if page_num is None:
+        page_num = 0

     # Check bounding values for current page and page max
     if page_num > 0:
@@ -70,19 +75,21 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
         box_thickness=1,
         #label_list=["Redaction"],
         #label_colors=[(0, 0, 0)],
-        height='60%',
-        width='60%',
+        show_label=False,
+        height='100%',
+        width='100%',
         box_min_size=1,
         box_selected_thickness=2,
         handle_size=4,
         sources=None,#["upload"],
         show_clear_button=False,
+        show_share_button=False,
         show_remove_button=False,
         handles_cursor=True,
         interactive=True
     )

-    number_reported = gr.Number(label = "Current page", value=page_num_reported, precision=0)
+    number_reported = gr.Number(label = "Current page (select page number then press enter)", value=page_num_reported, precision=0)

     return out_image_annotator, number_reported

@@ -90,7 +97,14 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
     '''
     Overwrite current image annotations with modifications
     '''
-    print("all_image_annotations before:",all_image_annotations)
+    #If no previous page or is 0, i.e. first time run, then make no changes
+    if not previous_page:
+        return all_image_annotations, current_page
+
+    if not current_page:
+        current_page = 1
+
+    #print("all_image_annotations before:",all_image_annotations)

     image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]

@@ -98,14 +112,15 @@

     all_image_annotations[previous_page - 1] = image_annotated

-    print("all_image_annotations after:",all_image_annotations)
+    #print("all_image_annotations after:",all_image_annotations)

     return all_image_annotations, current_page

-def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int):
+def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
     '''
     Apply modified redactions to a pymupdf
     '''
+    print("all_image_annotations:", all_image_annotations)

     output_files = []

@@ -154,23 +169,26 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc

     number_of_pages = unredacted_doc.page_count

-    for i in range(0, number_of_pages):
-
-        print("Re-redacting page", str(i))
+    print("Saving pages to file.")
+
+    for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
+
+        #print("Saving page", str(i))

         image_loc = all_image_annotations[i]['image']
-        print("Image location:", image_loc)
-
-        # Load in image
-        if isinstance(image_loc, Image.Image):
-            # Save to file so the image annotator can pick it up
-            image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
-            image_loc.save(image_out_folder)
-            image = image_out_folder
+        #print("Image location:", image_loc)
+
+        # Load in image object
+        if isinstance(image_loc, np.ndarray):
+            image = Image.fromarray(image_loc.astype('uint8'))
+            #all_image_annotations[i]['image'] = image_loc.tolist()
+        elif isinstance(image_loc, Image.Image):
+            image = image_loc
+            #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
+            #image_loc.save(image_out_folder)
+            #all_image_annotations[i]['image'] = image_out_folder
         elif isinstance(image_loc, str):
             image = Image.open(image_loc)
-        else:
-            image = Image.fromarray(image_loc.astype('uint8'))

         pymupdf_page = unredacted_doc.load_page(i) #doc.load_page(current_page -1)
         pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
@@ -181,20 +199,10 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
         output_files.append(out_pdf_file_path)

     # Save the gradio_annotation_boxes to a JSON file
-    out_annotation_file_path = output_folder + file_base + '_modified_redactions.json'
-    all_image_annotations_with_lists = all_image_annotations
-
-    # Convert image arrays to lists for JSON serialization
-    for annotation in all_image_annotations_with_lists:
-        if isinstance(annotation['image'], np.ndarray):
-            annotation['image'] = annotation['image'].tolist()
-        elif isinstance(annotation['image'], Image.Image):
-            annotation['image'] = image_out_folder
-
-    with open(out_annotation_file_path, 'w') as f:
-        json.dump(all_image_annotations_with_lists, f)
-
-    output_files.append(out_annotation_file_path)
+    #out_annotation_file_path = output_folder + file_base + '_modified_redactions.json'
+    #with open(out_annotation_file_path, 'w') as f:
+    #    json.dump(all_image_annotations, f)
+    #output_files.append(out_annotation_file_path)

     return doc, all_image_annotations, output_files
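
The reworked loop in apply_redactions now normalises whatever the annotator hands back (numpy array, PIL image, or file path) into a PIL image before redacting, instead of round-tripping PIL images through disk. That normalisation step is self-contained enough to sketch on its own (a hedged extraction; the to_pil_image helper is hypothetical, mirroring the isinstance chain above):

import numpy as np
from PIL import Image

def to_pil_image(image_loc):
    """Normalise an annotation's 'image' field to a PIL.Image.

    Mirrors the isinstance chain in apply_redactions: the annotator may hand
    back a numpy array, an already-loaded PIL image, or a path on disk.
    """
    if isinstance(image_loc, np.ndarray):
        return Image.fromarray(image_loc.astype('uint8'))
    elif isinstance(image_loc, Image.Image):
        return image_loc
    elif isinstance(image_loc, str):
        return Image.open(image_loc)
    raise TypeError(f"Unsupported image type: {type(image_loc)!r}")

# Usage inside the page loop would then collapse to:
# image = to_pil_image(all_image_annotations[i]['image'])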