Commit 5b4b5fb
Parent(s): a680619

Upgraded packages. Fixed some issues with the review process. Better progress reporting for the user.

Files changed:
- DocRedactApp_0.1.spec (+0 -52, deleted)
- app.py (+9 -5)
- redaction_review.py (+0 -88, deleted)
- requirements.txt (+7 -7)
- tools/aws_textract.py (+2 -2)
- tools/file_conversion.py (+2 -2)
- tools/file_redaction.py (+44 -25)
- tools/redaction_review.py (+42 -34)
DocRedactApp_0.1.spec DELETED
@@ -1,52 +0,0 @@
-# -*- mode: python ; coding: utf-8 -*-
-from PyInstaller.utils.hooks import collect_data_files
-
-datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
-datas += collect_data_files('gradio_client')
-datas += collect_data_files('gradio')
-
-
-a = Analysis(
-    ['app.py'],
-    pathex=[],
-    binaries=[],
-    datas=datas,
-    hiddenimports=['pyarrow.vendored.version', 'pydicom.encoders'],
-    hookspath=['build_deps'],
-    hooksconfig={},
-    runtime_hooks=[],
-    excludes=[],
-    noarchive=False,
-    optimize=0,
-    module_collection_mode={
-        'gradio': 'py',  # Collect gradio package as source .py files
-    }
-)
-pyz = PYZ(a.pure)
-
-exe = EXE(
-    pyz,
-    a.scripts,
-    [],
-    exclude_binaries=True,
-    name='DocRedactApp_0.1',
-    debug=False,
-    bootloader_ignore_signals=False,
-    strip=False,
-    upx=True,
-    console=True,
-    disable_windowed_traceback=False,
-    argv_emulation=False,
-    target_arch=None,
-    codesign_identity=None,
-    entitlements_file=None,
-)
-coll = COLLECT(
-    exe,
-    a.binaries,
-    a.datas,
-    strip=False,
-    upx=True,
-    upx_exclude=[],
-    name='DocRedactApp_0.1',
-)
app.py CHANGED
@@ -76,7 +76,7 @@ with app:
     data_file_name_textbox = gr.Textbox(value="", visible=False)
     s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
     estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
-    annotate_previous_page = gr.Number(value=
+    annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
 
 
     ###
@@ -121,7 +121,7 @@ with app:
 
     with gr.Row():
         annotation_last_page_button = gr.Button("Previous page")
-        annotate_current_page = gr.Number(value=1, label="Current page", precision=0)
+        annotate_current_page = gr.Number(value=1, label="Current page (select page number then press enter)", precision=0)
 
         annotation_next_page_button = gr.Button("Next page")
 
@@ -131,8 +131,10 @@ with app:
         label="Modify redaction boxes",
         label_list=["Redaction"],
         label_colors=[(0, 0, 0)],
+        show_label=False,
         sources=None,#["upload"],
         show_clear_button=False,
+        show_share_button=False,
         show_remove_button=False,
         interactive=False
     )
@@ -216,12 +218,14 @@ with app:
     then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page]).\
     then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
 
-    annotate_current_page.
+    annotate_current_page.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page]).\
         then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
 
-    annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page])
-
+    annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
+    annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
 
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
     annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
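Aside: the navigation wiring above uses Gradio's event chaining, where each button click first updates the page number and a chained .then() re-renders the annotator. A minimal standalone sketch of the same pattern; the component and function names here are illustrative, not the app's:

    import gradio as gr

    def decrease_page(number: int) -> int:
        # Clamp so the page number never drops below 1
        return max(1, number - 1)

    with gr.Blocks() as demo:
        current_page = gr.Number(value=1, precision=0, label="Current page")
        status = gr.Textbox(label="Status")
        prev_button = gr.Button("Previous page")

        # The first event updates the page number; the chained .then() step
        # re-renders anything that depends on it (here, a status message).
        prev_button.click(fn=decrease_page, inputs=[current_page], outputs=[current_page]).\
            then(fn=lambda p: f"Showing page {p}", inputs=[current_page], outputs=[status])

    if __name__ == "__main__":
        demo.launch()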
redaction_review.py DELETED
@@ -1,88 +0,0 @@
-import gradio as gr
-from gradio_image_annotation import image_annotator
-from gradio_image_annotation.image_annotator import AnnotatedImageData
-
-from tools.file_conversion import is_pdf, convert_pdf_to_images
-from tools.helper_functions import get_file_path_end, output_folder
-from tools.file_redaction import redact_page_with_pymupdf
-import json
-import pymupdf
-from PIL import ImageDraw, Image
-
-file_path = "output/page_as_img_example_complaint_letter_pages_1.png"
-#file_path = "examples/graduate-job-example-cover-letter.pdf"
-
-
-if is_pdf(file_path):
-    images = convert_pdf_to_images(file_path)
-    image = images[0]
-    doc = pymupdf.open(file_path)
-else:
-    doc = []
-
-with open('output/gradio_annotation_boxes.json', 'r') as f:
-    gradio_annotation_boxes = json.load(f)
-
-example_annotation = {
-    "image": file_path,
-    "boxes": gradio_annotation_boxes
-}
-
-def apply_redactions(image_annotated:AnnotatedImageData, file_path:str, doc=[]):
-    #print(image_annotated['image'])
-
-    file_base = get_file_path_end(file_path)
-
-    image = Image.fromarray(image_annotated['image'].astype('uint8'))
-
-    draw = ImageDraw.Draw(image)
-
-    if is_pdf(file_path) == False:
-        for img_annotation_box in image_annotated['boxes']:
-            coords = [img_annotation_box["xmin"],
-                      img_annotation_box["ymin"],
-                      img_annotation_box["xmax"],
-                      img_annotation_box["ymax"]]
-
-            fill = img_annotation_box["color"]
-
-            draw.rectangle(coords, fill=fill)
-
-            image.save(output_folder + file_base + "_additional.png")
-
-    # If it's a pdf, assume a doc object is available
-    else:
-        doc = redact_page_with_pymupdf(doc, image_annotated, 1, image)
-
-
-def crop(annotations):
-    if annotations["boxes"]:
-        box = annotations["boxes"][0]
-        return annotations["image"][
-            box["ymin"]:box["ymax"],
-            box["xmin"]:box["xmax"]
-        ]
-    return None
-
-def get_boxes_json(annotations):
-    return annotations["boxes"]
-
-with gr.Blocks() as demo:
-    with gr.Tab("Object annotation", id="tab_object_annotation"):
-
-        doc_state = gr.State(doc)
-
-        file_path_textbox = gr.Textbox(value=file_path)
-        annotator = image_annotator(
-            example_annotation,
-            label_list=["Redaction"],
-            label_colors=[(0, 0, 0)],
-        )
-        button_get = gr.Button("Get bounding boxes")
-        button_apply = gr.Button("Apply redactions")
-        json_boxes = gr.JSON()
-        button_get.click(get_boxes_json, annotator, json_boxes)
-        button_apply.click(apply_redactions, inputs=[annotator, file_path_textbox, doc_state])
-
-if __name__ == "__main__":
-    demo.launch(inbrowser=True)
requirements.txt CHANGED
@@ -1,17 +1,17 @@
 pdfminer.six==20231228
 pdf2image==1.17.0
 pymupdf==1.24.10
-opencv-python==4.
+opencv-python==4.10.0.84
 presidio_analyzer==2.2.355
 presidio_anonymizer==2.2.355
 presidio-image-redactor==0.0.53
 pikepdf==8.15.1
-pandas==2.2.
-spacy==3.
-en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.
-gradio
-boto3==1.
-pyarrow==
+pandas==2.2.3
+spacy==3.8.2
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+gradio==4.44.1
+boto3==1.35.40
+pyarrow==17.0.0
 openpyxl==3.1.2
 Faker==22.2.0
 gradio_image_annotation==0.2.3
tools/aws_textract.py CHANGED
@@ -158,7 +158,7 @@ def json_to_ocrresult(json_data, page_width, page_height):
 
             handwriting.append(recogniser_result)
 
-            print("Handwriting found:", handwriting[-1])
+            #print("Handwriting found:", handwriting[-1])
 
         # If handwriting or signature, add to bounding box
 
@@ -173,7 +173,7 @@ def json_to_ocrresult(json_data, page_width, page_height):
             recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=line_left, top=line_top, width=width_abs, height=height_abs)
 
             signatures.append(recogniser_result)
-            print("Signature found:", signatures[-1])
+            #print("Signature found:", signatures[-1])
 
             words = []
             words.append({
tools/file_conversion.py CHANGED
@@ -49,8 +49,8 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
     images = []
 
     # Open the PDF file
-    #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
-    for page_num in
+    #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
+    for page_num in progress.tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
 
         print("Converting page: ", str(page_num + 1))
 
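The substantive change here is swapping a bare loop for progress.tqdm, which is how Gradio surfaces per-page progress in the UI rather than only in the console. A minimal sketch of that pattern, assuming the function is wired into a Blocks app (names are illustrative):

    import time
    import gradio as gr

    def convert_pages(page_count: int, progress=gr.Progress(track_tqdm=True)) -> str:
        # Wrapping the iterable in progress.tqdm reports each iteration
        # to the Gradio progress bar as well as the terminal.
        for page_num in progress.tqdm(range(page_count), unit="pages", desc="Preparing pages"):
            time.sleep(0.1)  # stand-in for the real per-page conversion work
        return f"Converted {page_count} pages"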
tools/file_redaction.py CHANGED
@@ -3,7 +3,8 @@ import re
 import json
 import io
 import os
-
+import boto3
+from PIL import Image, ImageChops, ImageFile, ImageDraw
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 
 from typing import List, Dict, Tuple
@@ -118,6 +119,16 @@ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[s
         return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
 
     if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
+
+        if in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
+            # Try accessing Textract through boto3
+            try:
+                boto3.client('textract')
+            except:
+                out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
+                print(out_message)
+                return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
+
         #Analyse and redact image-based pdf or image
         if is_pdf_or_image(file_path) == False:
             out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
@@ -477,17 +488,17 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
     # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
-            print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
+            #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
             bboxes.extend(handwriting_recogniser_results)
 
         if "Redact all identified signatures" in handwrite_signature_checkbox:
-            print("Signature boxes exist at merge:", signature_recogniser_results)
+            #print("Signature boxes exist at merge:", signature_recogniser_results)
             bboxes.extend(signature_recogniser_results)
 
     # Reconstruct bounding boxes for substrings of interest
     reconstructed_bboxes = []
     for bbox in bboxes:
-        print("bbox:", bbox)
+        #print("bbox:", bbox)
         bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
         for line_text, line_info in combined_results.items():
             line_box = line_info['bounding_box']
@@ -636,33 +647,37 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
     if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
     elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
 
-    for
+    for page_no in progress.tqdm(range(0, number_of_pages), unit="pages", desc="Redacting pages"):
+    #for page_no in range(0, number_of_pages):
         handwriting_or_signature_boxes = []
         signature_recogniser_results = []
         handwriting_recogniser_results = []
 
 
-        # Assuming prepared_pdf_file_paths[
+        # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
         try:
-            image = prepared_pdf_file_paths[
-            print("image:", image)
+            image = prepared_pdf_file_paths[page_no]#.copy()
+            #print("image:", image)
         except Exception as e:
             print("Could not redact page:", reported_page_number, "due to:")
             print(e)
+
             continue
 
         image_annotations = {"image": image, "boxes": []}
+
+        pymupdf_page = pymupdf_doc.load_page(page_no)
 
         #try:
-        print("prepared_pdf_file_paths:", prepared_pdf_file_paths)
+        #print("prepared_pdf_file_paths:", prepared_pdf_file_paths)
 
-        if
+        if page_no >= page_min and page_no < page_max:
 
-        reported_page_number = str(i + 1)
+            reported_page_number = str(page_no + 1)
 
+            print("Redacting page", reported_page_number)
 
         # Need image size to convert textract OCR outputs to the correct sizes
         page_width, page_height = image.size
@@ -811,6 +826,8 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
 
         all_image_annotations.append(image_annotations)
 
+    #print("\nall_image_annotations for page", str(page_no), "are:", all_image_annotations)
+
     all_line_level_ocr_results_df.to_csv(ocr_results_file_path)
     logging_file_paths.append(ocr_results_file_path)
 
@@ -849,8 +866,6 @@ def analyse_text_container(text_container:OCRResult, language:str, chosen_redact
         score_threshold=score_threshold,
         return_decision_process=True,
         allow_list=allow_list)
-
-    print(analyser_results)
 
     return analyser_results
 
@@ -1097,8 +1112,10 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
     else: page_min = page_min - 1
 
     print("Page range is",str(page_min + 1), "to", str(page_max))
-
-    for page_no in range(0, number_of_pages):
+
+    #for page_no in range(0, number_of_pages):
+    for page_no in progress.tqdm(range(0, number_of_pages), unit="pages", desc="Redacting pages"):
+
        #print("prepared_pdf_image_path:", prepared_pdf_image_path)
        #print("prepared_pdf_image_path[page_no]:", prepared_pdf_image_path[page_no])
        image = prepared_pdf_image_path[page_no]
@@ -1150,23 +1167,23 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
 
            # Analyse each line of text in turn for PII and add to list
            for i, text_line in enumerate(line_level_text_results_list):
-
+               text_line_analyser_result = []
               text_line_bounding_boxes = []
 
               #print("text_line:", text_line.text)
 
-
+              text_line_analyser_result = analyse_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
 
              # Merge bounding boxes for the line if multiple found close together
-             if
+             if text_line_analyser_result:
                  # Merge bounding boxes if very close together
                  #print("text_line_bounding_boxes:", text_line_bounding_boxes)
                  #print("line_characters:")
                  #print(line_characters[i])
                  #print("".join(char._text for char in line_characters[i]))
-                 text_line_bounding_boxes = merge_text_bounding_boxes(
+                 text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i], combine_pixel_dist, vertical_padding = 0)
 
-                 text_container_analyser_results.extend(
+                 text_container_analyser_results.extend(text_line_analyser_result)
                  text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
 
        #print("\n FINAL text_container_analyser_results:", text_container_analyser_results)
@@ -1188,7 +1205,7 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
 
        annotations_all_pages.extend([annotations_on_page])
 
-       print("For page number:", page_no, "there are", len(
+       print("For page number:", page_no, "there are", len(image_annotations["boxes"]), "annotations")
 
        # Write logs
        # Create decision process table
@@ -1203,5 +1220,7 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
        page_text_outputs_all_pages = pd.concat([page_text_outputs_all_pages, page_text_outputs])
 
    all_image_annotations.append(image_annotations)
+
+   #print("all_image_annotations:", all_image_annotations)
 
    return pymupdf_doc, decision_process_table_all_pages, page_text_outputs_all_pages, all_image_annotations
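One point worth noting about the new Textract guard above: boto3.client('textract') only constructs a client, so it catches missing configuration (for example, no resolvable AWS region) but does not contact AWS or validate credentials. A standalone sketch of the same best-effort check, with the bare except narrowed to botocore's exception types:

    import boto3
    from botocore.exceptions import BotoCoreError, ClientError

    def textract_available() -> bool:
        # Client construction resolves configuration (region, credential chain)
        # but makes no network call, so a later Textract request can still fail.
        try:
            boto3.client("textract")
            return True
        except (BotoCoreError, ClientError):
            return False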
tools/redaction_review.py CHANGED
@@ -38,18 +38,23 @@ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
     return max_pages
 
 def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
-    #print("\nImage annotator object:", image_annotator_object
+    # print("\nImage annotator object:", image_annotator_object)
 
     if not image_annotator_object:
         return image_annotator(
             label="Modify redaction boxes",
             #label_list=["Redaction"],
             #label_colors=[(0, 0, 0)],
+            show_label=False,
             sources=["upload"],
             show_clear_button=False,
+            show_share_button=False,
             show_remove_button=False,
             interactive=False
-        ), gr.Number(label = "Current page", value=1, precision=0)
+        ), gr.Number(label = "Current page (select page number then press enter)", value=1, precision=0)
+
+    if page_num is None:
+        page_num = 0
 
     # Check bounding values for current page and page max
     if page_num > 0:
@@ -70,19 +75,21 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
         box_thickness=1,
         #label_list=["Redaction"],
         #label_colors=[(0, 0, 0)],
-
-
+        show_label=False,
+        height='100%',
+        width='100%',
         box_min_size=1,
         box_selected_thickness=2,
         handle_size=4,
         sources=None,#["upload"],
         show_clear_button=False,
+        show_share_button=False,
         show_remove_button=False,
         handles_cursor=True,
         interactive=True
     )
 
-    number_reported = gr.Number(label = "Current page", value=page_num_reported, precision=0)
+    number_reported = gr.Number(label = "Current page (select page number then press enter)", value=page_num_reported, precision=0)
 
     return out_image_annotator, number_reported
 
@@ -90,7 +97,14 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
     '''
     Overwrite current image annotations with modifications
     '''
-
+    #If no previous page or is 0, i.e. first time run, then make no changes
+    if not previous_page:
+        return all_image_annotations, current_page
+
+    if not current_page:
+        current_page = 1
+
+    #print("all_image_annotations before:",all_image_annotations)
 
     image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
 
@@ -98,14 +112,15 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
 
     all_image_annotations[previous_page - 1] = image_annotated
 
-    print("all_image_annotations after:",all_image_annotations)
+    #print("all_image_annotations after:",all_image_annotations)
 
     return all_image_annotations, current_page
 
-def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int):
+def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
     '''
     Apply modified redactions to a pymupdf
     '''
+    print("all_image_annotations:", all_image_annotations)
 
     output_files = []
 
@@ -154,23 +169,26 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
 
         number_of_pages = unredacted_doc.page_count
 
-
+        print("Saving pages to file.")
 
-
+        for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
+
+            #print("Saving page", str(i))
 
             image_loc = all_image_annotations[i]['image']
-            print("Image location:", image_loc)
-
-            # Load in image
-            if isinstance(image_loc,
-
-
-            image =
+            #print("Image location:", image_loc)
+
+            # Load in image object
+            if isinstance(image_loc, np.ndarray):
+                image = Image.fromarray(image_loc.astype('uint8'))
+                #all_image_annotations[i]['image'] = image_loc.tolist()
+            elif isinstance(image_loc, Image.Image):
+                image = image_loc
+                #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
+                #image_loc.save(image_out_folder)
+                #all_image_annotations[i]['image'] = image_out_folder
             elif isinstance(image_loc, str):
                 image = Image.open(image_loc)
-            else:
-                image = Image.fromarray(image_loc.astype('uint8'))
 
             pymupdf_page = unredacted_doc.load_page(i) #doc.load_page(current_page -1)
             pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
@@ -181,20 +199,10 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
         output_files.append(out_pdf_file_path)
 
         # Save the gradio_annotation_boxes to a JSON file
-        out_annotation_file_path = output_folder + file_base + '_modified_redactions.json'
-
-        #
-        for annotation in all_image_annotations_with_lists:
-            if isinstance(annotation['image'], np.ndarray):
-                annotation['image'] = annotation['image'].tolist()
-            elif isinstance(annotation['image'], Image.Image):
-                annotation['image'] = image_out_folder
-
-        with open(out_annotation_file_path, 'w') as f:
-            json.dump(all_image_annotations_with_lists, f)
-
-        output_files.append(out_annotation_file_path)
+        #out_annotation_file_path = output_folder + file_base + '_modified_redactions.json'
+        #with open(out_annotation_file_path, 'w') as f:
+        #    json.dump(all_image_annotations, f)
+        #output_files.append(out_annotation_file_path)
 
     return doc, all_image_annotations, output_files
 
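The rewritten branch in apply_redactions normalises whatever the annotation state holds for a page (a NumPy array, an already-loaded PIL image, or a file path string) into a PIL image before redacting. The same dispatch in isolation, as a sketch:

    import numpy as np
    from PIL import Image

    def to_pil_image(image_loc) -> Image.Image:
        # The annotator may store a raw pixel array, a PIL image, or a path
        # string, depending on how the page image was produced upstream.
        if isinstance(image_loc, np.ndarray):
            return Image.fromarray(image_loc.astype("uint8"))
        elif isinstance(image_loc, Image.Image):
            return image_loc
        elif isinstance(image_loc, str):
            return Image.open(image_loc)
        raise TypeError(f"Unsupported image source: {type(image_loc)}")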