Commit 5b4b5fb
Parent(s): a680619

Upgraded packages. Fixed some issues with the review process. Better progress reporting for the user.

Files changed:
- DocRedactApp_0.1.spec (+0 -52, deleted)
- app.py (+9 -5)
- redaction_review.py (+0 -88, deleted)
- requirements.txt (+7 -7)
- tools/aws_textract.py (+2 -2)
- tools/file_conversion.py (+2 -2)
- tools/file_redaction.py (+44 -25)
- tools/redaction_review.py (+42 -34)
DocRedactApp_0.1.spec DELETED
@@ -1,52 +0,0 @@
-# -*- mode: python ; coding: utf-8 -*-
-from PyInstaller.utils.hooks import collect_data_files
-
-datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
-datas += collect_data_files('gradio_client')
-datas += collect_data_files('gradio')
-
-
-a = Analysis(
-    ['app.py'],
-    pathex=[],
-    binaries=[],
-    datas=datas,
-    hiddenimports=['pyarrow.vendored.version', 'pydicom.encoders'],
-    hookspath=['build_deps'],
-    hooksconfig={},
-    runtime_hooks=[],
-    excludes=[],
-    noarchive=False,
-    optimize=0,
-    module_collection_mode={
-        'gradio': 'py',  # Collect gradio package as source .py files
-    }
-)
-pyz = PYZ(a.pure)
-
-exe = EXE(
-    pyz,
-    a.scripts,
-    [],
-    exclude_binaries=True,
-    name='DocRedactApp_0.1',
-    debug=False,
-    bootloader_ignore_signals=False,
-    strip=False,
-    upx=True,
-    console=True,
-    disable_windowed_traceback=False,
-    argv_emulation=False,
-    target_arch=None,
-    codesign_identity=None,
-    entitlements_file=None,
-)
-coll = COLLECT(
-    exe,
-    a.binaries,
-    a.datas,
-    strip=False,
-    upx=True,
-    upx_exclude=[],
-    name='DocRedactApp_0.1',
-)
app.py CHANGED
@@ -76,7 +76,7 @@ with app:
     data_file_name_textbox = gr.Textbox(value="", visible=False)
     s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
     estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
-    annotate_previous_page = gr.Number(value=
+    annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
 
 
     ###
@@ -121,7 +121,7 @@ with app:
 
     with gr.Row():
         annotation_last_page_button = gr.Button("Previous page")
-        annotate_current_page = gr.Number(value=1, label="Current page", precision=0)
+        annotate_current_page = gr.Number(value=1, label="Current page (select page number then press enter)", precision=0)
 
         annotation_next_page_button = gr.Button("Next page")
 
@@ -131,8 +131,10 @@ with app:
         label="Modify redaction boxes",
         label_list=["Redaction"],
         label_colors=[(0, 0, 0)],
+        show_label=False,
         sources=None,#["upload"],
         show_clear_button=False,
+        show_share_button=False,
         show_remove_button=False,
         interactive=False
     )
@@ -216,12 +218,14 @@ with app:
     then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page]).\
     then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
 
-    annotate_current_page.
+    annotate_current_page.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page]).\
         then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
 
-    annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page])
-
+    annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
+    annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
 
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
     annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
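Aside: the navigation wiring above uses Gradio's event chaining, where each button click first updates the page number and a chained .then() re-renders the annotator. A minimal standalone sketch of the same pattern; the component and function names here are illustrative, not the app's:

    import gradio as gr

    def decrease_page(number: int) -> int:
        # Clamp so the page number never drops below 1
        return max(1, number - 1)

    with gr.Blocks() as demo:
        current_page = gr.Number(value=1, precision=0, label="Current page")
        status = gr.Textbox(label="Status")
        prev_button = gr.Button("Previous page")

        # The first event updates the page number; the chained .then() step
        # re-renders anything that depends on it (here, a status message).
        prev_button.click(fn=decrease_page, inputs=[current_page], outputs=[current_page]).\
            then(fn=lambda p: f"Showing page {p}", inputs=[current_page], outputs=[status])

    if __name__ == "__main__":
        demo.launch()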
redaction_review.py DELETED
@@ -1,88 +0,0 @@
-import gradio as gr
-from gradio_image_annotation import image_annotator
-from gradio_image_annotation.image_annotator import AnnotatedImageData
-
-from tools.file_conversion import is_pdf, convert_pdf_to_images
-from tools.helper_functions import get_file_path_end, output_folder
-from tools.file_redaction import redact_page_with_pymupdf
-import json
-import pymupdf
-from PIL import ImageDraw, Image
-
-file_path = "output/page_as_img_example_complaint_letter_pages_1.png"
-#file_path = "examples/graduate-job-example-cover-letter.pdf"
-
-
-if is_pdf(file_path):
-    images = convert_pdf_to_images(file_path)
-    image = images[0]
-    doc = pymupdf.open(file_path)
-else:
-    doc = []
-
-with open('output/gradio_annotation_boxes.json', 'r') as f:
-    gradio_annotation_boxes = json.load(f)
-
-example_annotation = {
-    "image": file_path,
-    "boxes": gradio_annotation_boxes
-}
-
-def apply_redactions(image_annotated:AnnotatedImageData, file_path:str, doc=[]):
-    #print(image_annotated['image'])
-
-    file_base = get_file_path_end(file_path)
-
-    image = Image.fromarray(image_annotated['image'].astype('uint8'))
-
-    draw = ImageDraw.Draw(image)
-
-    if is_pdf(file_path) == False:
-        for img_annotation_box in image_annotated['boxes']:
-            coords = [img_annotation_box["xmin"],
-                      img_annotation_box["ymin"],
-                      img_annotation_box["xmax"],
-                      img_annotation_box["ymax"]]
-
-            fill = img_annotation_box["color"]
-
-            draw.rectangle(coords, fill=fill)
-
-            image.save(output_folder + file_base + "_additional.png")
-
-    # If it's a pdf, assume a doc object is available
-    else:
-        doc = redact_page_with_pymupdf(doc, image_annotated, 1, image)
-
-
-def crop(annotations):
-    if annotations["boxes"]:
-        box = annotations["boxes"][0]
-        return annotations["image"][
-            box["ymin"]:box["ymax"],
-            box["xmin"]:box["xmax"]
-        ]
-    return None
-
-def get_boxes_json(annotations):
-    return annotations["boxes"]
-
-with gr.Blocks() as demo:
-    with gr.Tab("Object annotation", id="tab_object_annotation"):
-
-        doc_state = gr.State(doc)
-
-        file_path_textbox = gr.Textbox(value=file_path)
-        annotator = image_annotator(
-            example_annotation,
-            label_list=["Redaction"],
-            label_colors=[(0, 0, 0)],
-        )
-        button_get = gr.Button("Get bounding boxes")
-        button_apply = gr.Button("Apply redactions")
-        json_boxes = gr.JSON()
-        button_get.click(get_boxes_json, annotator, json_boxes)
-        button_apply.click(apply_redactions, inputs=[annotator, file_path_textbox, doc_state])
-
-if __name__ == "__main__":
-    demo.launch(inbrowser=True)
requirements.txt CHANGED
@@ -1,17 +1,17 @@
 pdfminer.six==20231228
 pdf2image==1.17.0
 pymupdf==1.24.10
-opencv-python==4.
+opencv-python==4.10.0.84
 presidio_analyzer==2.2.355
 presidio_anonymizer==2.2.355
 presidio-image-redactor==0.0.53
 pikepdf==8.15.1
-pandas==2.2.
-spacy==3.
-en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.
-gradio
-boto3==1.
-pyarrow==
+pandas==2.2.3
+spacy==3.8.2
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+gradio==4.44.1
+boto3==1.35.40
+pyarrow==17.0.0
 openpyxl==3.1.2
 Faker==22.2.0
 gradio_image_annotation==0.2.3
tools/aws_textract.py CHANGED
@@ -158,7 +158,7 @@ def json_to_ocrresult(json_data, page_width, page_height):
 
             handwriting.append(recogniser_result)
 
-            print("Handwriting found:", handwriting[-1])
+            #print("Handwriting found:", handwriting[-1])
 
         # If handwriting or signature, add to bounding box
 
@@ -173,7 +173,7 @@ def json_to_ocrresult(json_data, page_width, page_height):
             recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=line_left, top=line_top, width=width_abs, height=height_abs)
 
             signatures.append(recogniser_result)
-            print("Signature found:", signatures[-1])
+            #print("Signature found:", signatures[-1])
 
             words = []
             words.append({
tools/file_conversion.py CHANGED
@@ -49,8 +49,8 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
     images = []
 
     # Open the PDF file
-    #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"):
-    for page_num in
+    #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
+    for page_num in progress.tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
 
         print("Converting page: ", str(page_num + 1))
 
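The substantive change here is swapping a bare loop for progress.tqdm, which is how Gradio surfaces per-page progress in the UI rather than only in the console. A minimal sketch of that pattern, assuming the function is wired into a Blocks app (names are illustrative):

    import time
    import gradio as gr

    def convert_pages(page_count: int, progress=gr.Progress(track_tqdm=True)) -> str:
        # Wrapping the iterable in progress.tqdm reports each iteration
        # to the Gradio progress bar as well as the terminal.
        for page_num in progress.tqdm(range(page_count), unit="pages", desc="Preparing pages"):
            time.sleep(0.1)  # stand-in for the real per-page conversion work
        return f"Converted {page_count} pages"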
tools/file_redaction.py CHANGED
@@ -3,7 +3,8 @@ import re
 import json
 import io
 import os
-
+import boto3
+from PIL import Image, ImageChops, ImageFile, ImageDraw
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 
 from typing import List, Dict, Tuple
@@ -118,6 +119,16 @@ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[s
         return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
 
     if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
+
+        if in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
+            # Try accessing Textract through boto3
+            try:
+                boto3.client('textract')
+            except:
+                out_message = "Cannot connect to AWS Textract. Please choose another redaction method."
+                print(out_message)
+                return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
+
         #Analyse and redact image-based pdf or image
         if is_pdf_or_image(file_path) == False:
             out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
@@ -477,17 +488,17 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
     # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
-            print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
+            #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
             bboxes.extend(handwriting_recogniser_results)
 
         if "Redact all identified signatures" in handwrite_signature_checkbox:
-            print("Signature boxes exist at merge:", signature_recogniser_results)
+            #print("Signature boxes exist at merge:", signature_recogniser_results)
             bboxes.extend(signature_recogniser_results)
 
     # Reconstruct bounding boxes for substrings of interest
     reconstructed_bboxes = []
     for bbox in bboxes:
-        print("bbox:", bbox)
+        #print("bbox:", bbox)
         bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
         for line_text, line_info in combined_results.items():
             line_box = line_info['bounding_box']
@@ -636,33 +647,37 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
     if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
     elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
 
-    for
+    for page_no in progress.tqdm(range(0, number_of_pages), unit="pages", desc="Redacting pages"):
+    #for page_no in range(0, number_of_pages):
         handwriting_or_signature_boxes = []
         signature_recogniser_results = []
         handwriting_recogniser_results = []
 
 
-        # Assuming prepared_pdf_file_paths[
+        # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
         try:
-            image = prepared_pdf_file_paths[
-            print("image:", image)
+            image = prepared_pdf_file_paths[page_no]#.copy()
+            #print("image:", image)
         except Exception as e:
             print("Could not redact page:", reported_page_number, "due to:")
             print(e)
+
             continue
 
         image_annotations = {"image": image, "boxes": []}
+
+        pymupdf_page = pymupdf_doc.load_page(page_no)
 
         #try:
-        print("prepared_pdf_file_paths:", prepared_pdf_file_paths)
+        #print("prepared_pdf_file_paths:", prepared_pdf_file_paths)
 
-        if
+        if page_no >= page_min and page_no < page_max:
 
-        reported_page_number = str(i + 1)
+            reported_page_number = str(page_no + 1)
 
+            print("Redacting page", reported_page_number)
 
         # Need image size to convert textract OCR outputs to the correct sizes
         page_width, page_height = image.size
@@ -811,6 +826,8 @@ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:
 
         all_image_annotations.append(image_annotations)
 
+    #print("\nall_image_annotations for page", str(page_no), "are:", all_image_annotations)
+
     all_line_level_ocr_results_df.to_csv(ocr_results_file_path)
     logging_file_paths.append(ocr_results_file_path)
 
@@ -849,8 +866,6 @@ def analyse_text_container(text_container:OCRResult, language:str, chosen_redact
         score_threshold=score_threshold,
         return_decision_process=True,
         allow_list=allow_list)
-
-    print(analyser_results)
 
     return analyser_results
 
@@ -1097,8 +1112,10 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
     else: page_min = page_min - 1
 
     print("Page range is",str(page_min + 1), "to", str(page_max))
-
-    for page_no in range(0, number_of_pages):
+
+    #for page_no in range(0, number_of_pages):
+    for page_no in progress.tqdm(range(0, number_of_pages), unit="pages", desc="Redacting pages"):
+
        #print("prepared_pdf_image_path:", prepared_pdf_image_path)
        #print("prepared_pdf_image_path[page_no]:", prepared_pdf_image_path[page_no])
        image = prepared_pdf_image_path[page_no]
@@ -1150,23 +1167,23 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
 
            # Analyse each line of text in turn for PII and add to list
            for i, text_line in enumerate(line_level_text_results_list):
-
+               text_line_analyser_result = []
               text_line_bounding_boxes = []
 
               #print("text_line:", text_line.text)
 
-
+              text_line_analyser_result = analyse_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
 
              # Merge bounding boxes for the line if multiple found close together
-             if
+             if text_line_analyser_result:
                  # Merge bounding boxes if very close together
                  #print("text_line_bounding_boxes:", text_line_bounding_boxes)
                  #print("line_characters:")
                  #print(line_characters[i])
                  #print("".join(char._text for char in line_characters[i]))
-                 text_line_bounding_boxes = merge_text_bounding_boxes(
+                 text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i], combine_pixel_dist, vertical_padding = 0)
 
-                 text_container_analyser_results.extend(
+                 text_container_analyser_results.extend(text_line_analyser_result)
                  text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
 
        #print("\n FINAL text_container_analyser_results:", text_container_analyser_results)
@@ -1188,7 +1205,7 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
 
        annotations_all_pages.extend([annotations_on_page])
 
-       print("For page number:", page_no, "there are", len(
+       print("For page number:", page_no, "there are", len(image_annotations["boxes"]), "annotations")
 
        # Write logs
        # Create decision process table
@@ -1203,5 +1220,7 @@ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, cho
        page_text_outputs_all_pages = pd.concat([page_text_outputs_all_pages, page_text_outputs])
 
    all_image_annotations.append(image_annotations)
+
+   #print("all_image_annotations:", all_image_annotations)
 
    return pymupdf_doc, decision_process_table_all_pages, page_text_outputs_all_pages, all_image_annotations
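One point worth noting about the new Textract guard above: boto3.client('textract') only constructs a client, so it catches missing configuration (for example, no resolvable AWS region) but does not contact AWS or validate credentials. A standalone sketch of the same best-effort check, with the bare except narrowed to botocore's exception types:

    import boto3
    from botocore.exceptions import BotoCoreError, ClientError

    def textract_available() -> bool:
        # Client construction resolves configuration (region, credential chain)
        # but makes no network call, so a later Textract request can still fail.
        try:
            boto3.client("textract")
            return True
        except (BotoCoreError, ClientError):
            return False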
tools/redaction_review.py CHANGED
@@ -38,18 +38,23 @@ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
     return max_pages
 
 def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
-    #print("\nImage annotator object:", image_annotator_object
+    # print("\nImage annotator object:", image_annotator_object)
 
     if not image_annotator_object:
         return image_annotator(
             label="Modify redaction boxes",
             #label_list=["Redaction"],
             #label_colors=[(0, 0, 0)],
+            show_label=False,
             sources=["upload"],
             show_clear_button=False,
+            show_share_button=False,
             show_remove_button=False,
             interactive=False
-        ), gr.Number(label = "Current page", value=1, precision=0)
+        ), gr.Number(label = "Current page (select page number then press enter)", value=1, precision=0)
+
+    if page_num is None:
+        page_num = 0
 
     # Check bounding values for current page and page max
     if page_num > 0:
@@ -70,19 +75,21 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
         box_thickness=1,
         #label_list=["Redaction"],
         #label_colors=[(0, 0, 0)],
-
-
+        show_label=False,
+        height='100%',
+        width='100%',
         box_min_size=1,
         box_selected_thickness=2,
         handle_size=4,
         sources=None,#["upload"],
         show_clear_button=False,
+        show_share_button=False,
         show_remove_button=False,
         handles_cursor=True,
         interactive=True
     )
 
-    number_reported = gr.Number(label = "Current page", value=page_num_reported, precision=0)
+    number_reported = gr.Number(label = "Current page (select page number then press enter)", value=page_num_reported, precision=0)
 
     return out_image_annotator, number_reported
 
@@ -90,7 +97,14 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
     '''
     Overwrite current image annotations with modifications
     '''
-
+    #If no previous page or is 0, i.e. first time run, then make no changes
+    if not previous_page:
+        return all_image_annotations, current_page
+
+    if not current_page:
+        current_page = 1
+
+    #print("all_image_annotations before:",all_image_annotations)
 
     image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
 
@@ -98,14 +112,15 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
 
     all_image_annotations[previous_page - 1] = image_annotated
 
-    print("all_image_annotations after:",all_image_annotations)
+    #print("all_image_annotations after:",all_image_annotations)
 
     return all_image_annotations, current_page
 
-def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int):
+def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
     '''
     Apply modified redactions to a pymupdf
     '''
+    print("all_image_annotations:", all_image_annotations)
 
     output_files = []
 
@@ -154,23 +169,26 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
 
         number_of_pages = unredacted_doc.page_count
 
-
+        print("Saving pages to file.")
 
-
+        for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
+
+            #print("Saving page", str(i))
 
             image_loc = all_image_annotations[i]['image']
-            print("Image location:", image_loc)
-
-            # Load in image
-            if isinstance(image_loc,
-
-
-            image =
+            #print("Image location:", image_loc)
+
+            # Load in image object
+            if isinstance(image_loc, np.ndarray):
+                image = Image.fromarray(image_loc.astype('uint8'))
+                #all_image_annotations[i]['image'] = image_loc.tolist()
+            elif isinstance(image_loc, Image.Image):
+                image = image_loc
+                #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
+                #image_loc.save(image_out_folder)
+                #all_image_annotations[i]['image'] = image_out_folder
             elif isinstance(image_loc, str):
                 image = Image.open(image_loc)
-            else:
-                image = Image.fromarray(image_loc.astype('uint8'))
 
             pymupdf_page = unredacted_doc.load_page(i) #doc.load_page(current_page -1)
             pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
@@ -181,20 +199,10 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
         output_files.append(out_pdf_file_path)
 
         # Save the gradio_annotation_boxes to a JSON file
-        out_annotation_file_path = output_folder + file_base + '_modified_redactions.json'
-
-        #
-        for annotation in all_image_annotations_with_lists:
-            if isinstance(annotation['image'], np.ndarray):
-                annotation['image'] = annotation['image'].tolist()
-            elif isinstance(annotation['image'], Image.Image):
-                annotation['image'] = image_out_folder
-
-        with open(out_annotation_file_path, 'w') as f:
-            json.dump(all_image_annotations_with_lists, f)
-
-        output_files.append(out_annotation_file_path)
+        #out_annotation_file_path = output_folder + file_base + '_modified_redactions.json'
+        #with open(out_annotation_file_path, 'w') as f:
+        #    json.dump(all_image_annotations, f)
+        #output_files.append(out_annotation_file_path)
 
     return doc, all_image_annotations, output_files
 
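The rewritten branch in apply_redactions normalises whatever the annotation state holds for a page (a NumPy array, an already-loaded PIL image, or a file path string) into a PIL image before redacting. The same dispatch in isolation, as a sketch:

    import numpy as np
    from PIL import Image

    def to_pil_image(image_loc) -> Image.Image:
        # The annotator may store a raw pixel array, a PIL image, or a path
        # string, depending on how the page image was produced upstream.
        if isinstance(image_loc, np.ndarray):
            return Image.fromarray(image_loc.astype("uint8"))
        elif isinstance(image_loc, Image.Image):
            return image_loc
        elif isinstance(image_loc, str):
            return Image.open(image_loc)
        raise TypeError(f"Unsupported image source: {type(image_loc)}")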