Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Dec 17, 2024

Commit

9504619

1 Parent(s): b8e245f

Multithreaded file preparation. Can call Textract without signature detection

Browse files

Files changed (4) hide show

app.py +1 -1
tools/aws_textract.py +13 -5
tools/file_conversion.py +103 -35
tools/file_redaction.py +5 -4

app.py CHANGED Viewed

@@ -263,7 +263,7 @@ with app:
                 in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
-            handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
             #with gr.Row():
             in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)

                 in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
+            handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting"])
             #with gr.Row():
             in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)

tools/aws_textract.py CHANGED Viewed

@@ -1,10 +1,11 @@
 import boto3
-from PIL import Image
 import io
-import json
 import pikepdf
 # Example: converting this single page to an image
-from pdf2image import convert_from_bytes
 from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
 def extract_textract_metadata(response):
@@ -23,7 +24,7 @@ def extract_textract_metadata(response):
         #'NumberOfPages': number_of_pages
     })
-def analyse_page_with_textract(pdf_page_bytes, page_no, client=""):
     '''
     Analyse page with AWS Textract
     '''
@@ -36,7 +37,14 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client=""):
     print("Analysing page with AWS Textract")
-    response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
     # Wrap the response with the page number in the desired format
     wrapped_response = {

 import boto3
+#from PIL import Image
+from typing import List
 import io
+#import json
 import pikepdf
 # Example: converting this single page to an image
+#from pdf2image import convert_from_bytes
 from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
 def extract_textract_metadata(response):
         #'NumberOfPages': number_of_pages
     })
+def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"]):
     '''
     Analyse page with AWS Textract
     '''
     print("Analysing page with AWS Textract")
+    # Redact signatures if specified
+    if "Redact all identified signatures" in handwrite_signature_checkbox:
+        print("Analysing document with signature detection")
+        response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
+    else:
+        print("Analysing document without signature detection")
+        # Call detect_document_text to extract plain text
+        response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
     # Wrap the response with the page number in the desired format
     wrapped_response = {

tools/file_conversion.py CHANGED Viewed

@@ -2,7 +2,6 @@ from pdf2image import convert_from_path, pdfinfo_from_path
 from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from PIL import Image, ImageFile
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 import os
 import re
 import gradio as gr
@@ -12,6 +11,7 @@ import pymupdf
 from tqdm import tqdm
 from gradio import Progress
 from typing import List, Optional
 image_dpi = 300.0
@@ -46,61 +46,129 @@ def is_pdf(filename):
 # %%
 ## Convert pdf to image if necessary
-def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
-    print("pdf_path in convert_pdf_to_images:", pdf_path)
     # Get the number of pages in the PDF
     page_count = pdfinfo_from_path(pdf_path)['Pages']
-    print("Number of pages in PDF: ", str(page_count))
     images = []
-    # Open the PDF file
-    #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
-    for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
-        print("page_num in convert_pdf_to_images:", page_num)
-        print("Converting page: ", str(page_num + 1))
-        # Convert one page to image
-        out_path  = pdf_path + "_" + str(page_num) + ".png"
-        # Ensure the directory exists
-        os.makedirs(os.path.dirname(out_path), exist_ok=True)
-        # Check if the image already exists
-        if os.path.exists(out_path):
-            #print(f"Loading existing image from {out_path}.")
-            image = Image.open(out_path)  # Load the existing image
-        else:
-            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
-            image = image_l[0]
-            # Convert to greyscale
-            image = image.convert("L")
-            image.save(out_path, format="PNG")  # Save the new image
-        # If no images are returned, break the loop
-        if not image:
-            print("Conversion of page", str(page_num), "to file failed.")
-            break
-        # print("Conversion of page", str(page_num), "to file succeeded.")
-        # print("image:", image)
-        images.append(out_path)
-    print("PDF has been converted to images.")
-    # print("Images:", images)
-    return images
 # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
 def process_file(file_path:str):

 from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from PIL import Image, ImageFile
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 import os
 import re
 import gradio as gr
 from tqdm import tqdm
 from gradio import Progress
 from typing import List, Optional
+from concurrent.futures import ThreadPoolExecutor, as_completed
 image_dpi = 300.0
 # %%
 ## Convert pdf to image if necessary
+def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> Optional[str]:
+    """
+    Convert a single page of a PDF to a greyscale PNG and return the image path.
+
+    The image is saved as "<pdf_path>_<page_num>.png". If that file already
+    exists it is reused instead of being rendered again.
+
+    Args:
+        pdf_path: Path to the source PDF.
+        page_num: Zero-based page index (the converter is called with page_num + 1).
+        image_dpi: Resolution passed to convert_from_path.
+
+    Returns:
+        The path to the saved image, or None if the page could not be
+        processed (the error is printed rather than raised, so one bad page
+        does not stop the other pages).
+    """
+    try:
+        out_path = f"{pdf_path}_{page_num}.png"
+        # Ensure the directory exists. dirname is "" for a bare filename such as
+        # "example.pdf", and os.makedirs("") raises, so only create it when present.
+        out_dir = os.path.dirname(out_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        # Check if the image already exists
+        if os.path.exists(out_path):
+            print(f"Loading existing image for page {page_num + 1}")
+            # Open only to confirm the file is readable as an image, then close
+            # the handle straight away so worker threads do not leak open files.
+            with Image.open(out_path):
+                pass
+        else:
+            # Convert the page to an image
+            print(f"Converting page {page_num + 1}")
+            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
+                                        dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
+            # Convert to greyscale before saving
+            image = image_l[0].convert("L")
+            image.save(out_path, format="PNG")
+        return out_path
+    except Exception as e:
+        # Deliberate best-effort: report the failure and let the caller skip this page
+        print(f"Error processing page {page_num + 1}: {e}")
+        return None
+def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8) -> List[str]:
+    """
+    Convert pages of a PDF to images using multithreading.
+
+    Args:
+        pdf_path: Path to the PDF to convert.
+        page_min: Zero-based index of the first page to convert.
+        image_dpi: Resolution handed to process_single_page for each page.
+        num_threads: Maximum number of worker threads.
+
+    Returns:
+        Image paths in ascending page order. Pages that failed to convert
+        are left out of the list.
+    """
     # Get the number of pages in the PDF
     page_count = pdfinfo_from_path(pdf_path)['Pages']
+    print(f"Number of pages in PDF: {page_count}")
     images = []
+    page_to_image = {}
+    # Use ThreadPoolExecutor to process pages in parallel
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        # Remember which page each future belongs to so order can be restored later
+        future_to_page = {
+            executor.submit(process_single_page, pdf_path, page_num, image_dpi): page_num
+            for page_num in range(page_min, page_count)
+        }
+        # Display progress using tqdm
+        for future in tqdm(as_completed(future_to_page), total=len(future_to_page), unit="pages", desc="Converting pages"):
+            result = future.result()
+            if result:
+                page_to_image[future_to_page[future]] = result
+            else:
+                print("A page failed to process.")
+    # as_completed yields futures in completion order, not submission order,
+    # so rebuild the list sorted by page number before returning it.
+    images.extend(page_to_image[page_num] for page_num in sorted(page_to_image))
+    print("PDF has been converted to images.")
+    return images
+# Example usage: convert "example.pdf" at 200 dpi with 8 worker threads and print the resulting image paths
+if __name__ == "__main__":  # true only when this file is run directly as a script
+    pdf_path = "example.pdf"
+    image_dpi = 200  # assigned at module level, so this rebinds the module's image_dpi name for the script run
+    output_images = convert_pdf_to_images(pdf_path, image_dpi=image_dpi, num_threads=8)
+    print("Images saved:", output_images)
+# def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
+#     print("pdf_path in convert_pdf_to_images:", pdf_path)
+#     # Get the number of pages in the PDF
+#     page_count = pdfinfo_from_path(pdf_path)['Pages']
+#     print("Number of pages in PDF: ", str(page_count))
+#     images = []
+#     # Open the PDF file
+#     #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
+#     for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
+#         #print("page_num in convert_pdf_to_images:", page_num)
+#         print("Converting page: ", str(page_num + 1))
+#         # Convert one page to image
+#         out_path  = pdf_path + "_" + str(page_num) + ".png"
+#         # Ensure the directory exists
+#         os.makedirs(os.path.dirname(out_path), exist_ok=True)
+#         # Check if the image already exists
+#         if os.path.exists(out_path):
+#             #print(f"Loading existing image from {out_path}.")
+#             image = Image.open(out_path)  # Load the existing image
+#         else:
+#             image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
+#             image = image_l[0]
+#             # Convert to greyscale
+#             image = image.convert("L")
+#             image.save(out_path, format="PNG")  # Save the new image
+#         # If no images are returned, break the loop
+#         if not image:
+#             print("Conversion of page", str(page_num), "to file failed.")
+#             break
+#         # print("Conversion of page", str(page_num), "to file succeeded.")
+#         # print("image:", image)
+#         images.append(out_path)
+#     print("PDF has been converted to images.")
+#     # print("Images:", images)
+#     return images
 # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
 def process_file(file_path:str):

tools/file_redaction.py CHANGED Viewed

@@ -36,7 +36,7 @@ from tools.presidio_analyzer_custom import recognizer_result_from_dict
 page_break_value = get_or_create_env_var('page_break_value', '500')
 print(f'The value of page_break_value is {page_break_value}')
-max_time_value = get_or_create_env_var('max_time_value', '105')
 print(f'The value of max_time_value is {max_time_value}')
 def sum_numbers_before_seconds(string:str):
@@ -689,7 +689,8 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
     merged_bboxes = []
     grouped_bboxes = defaultdict(list)
         # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
@@ -954,7 +955,7 @@ def redact_image_pdf(file_path:str,
                 json_file_path = output_folder + file_name + "_textract.json"
                 if not os.path.exists(json_file_path):
-                    text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client)  # Analyse page with Textract
                     logging_file_paths.append(json_file_path)
                     request_metadata = request_metadata + "\n" + new_request_metadata
@@ -974,7 +975,7 @@ def redact_image_pdf(file_path:str,
                         if not page_exists:  # If the page does not exist, analyze again
                             print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
-                            text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number)  # Analyse page with Textract
                             # Check if "pages" key exists, if not, initialize it as an empty list
                             if "pages" not in existing_data:

 page_break_value = get_or_create_env_var('page_break_value', '500')
 print(f'The value of page_break_value is {page_break_value}')
+max_time_value = get_or_create_env_var('max_time_value', '999999')
 print(f'The value of max_time_value is {max_time_value}')
 def sum_numbers_before_seconds(string:str):
     merged_bboxes = []
     grouped_bboxes = defaultdict(list)
+    print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
         # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:
                 json_file_path = output_folder + file_name + "_textract.json"
                 if not os.path.exists(json_file_path):
+                    text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
                     logging_file_paths.append(json_file_path)
                     request_metadata = request_metadata + "\n" + new_request_metadata
                         if not page_exists:  # If the page does not exist, analyze again
                             print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
+                            text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, handwrite_signature_checkbox)  # Analyse page with Textract
                             # Check if "pages" key exists, if not, initialize it as an empty list
                             if "pages" not in existing_data: