Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on May 15, 2024

Commit

0f18146

1 Parent(s): a63133d

Separated the file preparation and file redaction functions. Hopefully STS endpoint access now works on AWS.

Browse files

Files changed (4) hide show

app.py +14 -66
tools/aws_functions.py +3 -3
tools/file_conversion.py +58 -15
tools/file_redaction.py +60 -21

app.py CHANGED Viewed

@@ -1,12 +1,9 @@
-from tools.file_redaction import redact_text_pdf, redact_image_pdf
-from tools.helper_functions import get_file_path_end
-from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 from tools.aws_functions import load_data_from_aws
 from typing import List
 import pandas as pd
 import gradio as gr
-import time
 #file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
@@ -14,66 +11,6 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
 language = 'en'
-def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
-    tic = time.perf_counter()
-    out_message = ''
-    out_file_paths = []
-    in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
-    if file_path:
-        file_path_without_ext = get_file_path_end(file_path)
-    else:
-        out_message = "No file selected"
-        print(out_message)
-        return out_message, out_file_paths
-    if in_redact_method == "Image analysis":
-        # Analyse and redact image-based pdf or image
-        if is_pdf_or_image(file_path) == False:
-            return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
-        pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
-        out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
-        pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
-        out_file_paths.append(out_image_file_path)
-        out_message = "Image-based PDF successfully redacted and saved to file."
-    elif in_redact_method == "Text analysis":
-        if is_pdf(file_path) == False:
-            return "Please upload a PDF file for text analysis.", None
-        # Analyse text-based pdf
-        pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
-        out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
-        pdf_text.save(out_text_file_path)
-        out_file_paths.append(out_text_file_path)
-        # Convert annotated text pdf back to image to give genuine redactions
-        pdf_text_image_paths = process_file(out_text_file_path)
-        out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
-        pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
-        out_file_paths.append(out_text_image_file_path)
-        out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."
-    else:
-        out_message = "No redaction method selected"
-        print(out_message)
-        return out_message, out_file_paths
-    toc = time.perf_counter()
-    out_time = f"Time taken: {toc - tic:0.1f} seconds."
-    print(out_time)
-    out_message = out_message + "\n\n" + out_time
-    return out_message, out_file_paths
 # Create the gradio interface
@@ -81,6 +18,9 @@ block = gr.Blocks(theme = gr.themes.Base())
 with block:
     gr.Markdown(
     """
     # Document redaction
@@ -106,6 +46,9 @@ with block:
             output_summary = gr.Textbox(label="Output summary")
             output_file = gr.File(label="Output file")
     with gr.Tab(label="Advanced options"):
         with gr.Accordion(label = "AWS data access", open = True):
                 aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
@@ -118,7 +61,12 @@ with block:
     ### Loading AWS data ###
     load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
-    redact_btn.click(fn = choose_and_run_redactor, inputs=[in_file, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
                     outputs=[output_summary, output_file], api_name="redact")
 # Simple run for HF spaces or local on your computer

+from tools.file_redaction import choose_and_run_redactor
+from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
 from tools.aws_functions import load_data_from_aws
 from typing import List
 import pandas as pd
 import gradio as gr
 #file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
 language = 'en'
 # Create the gradio interface
 with block:
+    prepared_pdf_state = gr.State([])
+    output_image_files_state = gr.State([])
     gr.Markdown(
     """
     # Document redaction
             output_summary = gr.Textbox(label="Output summary")
             output_file = gr.File(label="Output file")
+        with gr.Row():
+            convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary")
     with gr.Tab(label="Advanced options"):
         with gr.Accordion(label = "AWS data access", open = True):
                 aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
     ### Loading AWS data ###
     load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
+    redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
+                    outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
+                    outputs=[output_summary, output_file], api_name="redact")
+    convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file],
                     outputs=[output_summary, output_file], api_name="redact")
 # Simple run for HF spaces or local on your computer

tools/aws_functions.py CHANGED Viewed

@@ -6,10 +6,10 @@ import os
 PandasDataFrame = Type[pd.DataFrame]
-bucket_name = 'doc-redaction-data'
 try:
-    session = boto3.Session(profile_name="default")
 except Exception as e:
     print(e)
@@ -24,7 +24,7 @@ except Exception as e:
 def get_assumed_role_info():
-    sts = boto3.client('sts')
     response = sts.get_caller_identity()
     # Extract ARN of the assumed role

 PandasDataFrame = Type[pd.DataFrame]
+bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
 try:
+    session = boto3.Session() # profile_name="default"
 except Exception as e:
     print(e)
 def get_assumed_role_info():
+    sts = boto3.client('sts', region_name='us-west-2')
     response = sts.get_caller_identity()
     # Extract ARN of the assumed role

tools/file_conversion.py CHANGED Viewed

@@ -1,7 +1,9 @@
 from pdf2image import convert_from_path, pdfinfo_from_path
 from PIL import Image
 import os
 from gradio import Progress
 def is_pdf_or_image(filename):
     """
@@ -13,7 +15,7 @@ def is_pdf_or_image(filename):
     Returns:
         bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
     """
-    if filename.lower().endswith(".pdf") or filename.lower().endswith(".jpg") or filename.lower().endswith(".png"):
         output = True
     else:
         output = False
@@ -34,7 +36,7 @@ def is_pdf(filename):
 # %%
 ## Convert pdf to image if necessary
-def convert_pdf_to_images(pdf_path, progress=Progress(track_tqdm=True)):
     # Get the number of pages in the PDF
     page_count = pdfinfo_from_path(pdf_path)['Pages']
@@ -54,25 +56,14 @@ def convert_pdf_to_images(pdf_path, progress=Progress(track_tqdm=True)):
         if not image:
             break
-        # # Convert PDF to a list of images
-        # images = convert_from_path(pdf_path)
-        # images = []
         images.extend(image)
-    # Save each image as a separate file - deprecated
-    #image_paths = []
-    # for i, image in enumerate(images):
-    #     page_path = f"processing/page_{i+1}.png"
-    #     image.save(page_path, "PNG")
-    #     image_paths.append(page_path)
     print("PDF has been converted to images.")
     return images
-# %%
 def process_file(file_path):
     # Get the file extension
     file_extension = os.path.splitext(file_path)[1].lower()
@@ -95,3 +86,55 @@ def process_file(file_path):
     return out_path

 from pdf2image import convert_from_path, pdfinfo_from_path
+from tools.helper_functions import get_file_path_end
 from PIL import Image
 import os
 from gradio import Progress
+from typing import List
 def is_pdf_or_image(filename):
     """
     Returns:
         bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
     """
+    if filename.lower().endswith(".pdf") or filename.lower().endswith(".jpg") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png"):
         output = True
     else:
         output = False
 # %%
 ## Convert pdf to image if necessary
+def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
     # Get the number of pages in the PDF
     page_count = pdfinfo_from_path(pdf_path)['Pages']
         if not image:
             break
         images.extend(image)
     print("PDF has been converted to images.")
     return images
+# %% Function to take in a file path, decide if it is an image or pdf, then process appropriately.
 def process_file(file_path):
     # Get the file extension
     file_extension = os.path.splitext(file_path)[1].lower()
     return out_path
+def prepare_image_or_text_pdf(file_path:str, language:str, in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):
+    out_message = ''
+    out_file_paths = []
+    in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
+    if file_path:
+        file_path_without_ext = get_file_path_end(file_path)
+    else:
+        out_message = "No file selected"
+        print(out_message)
+        return out_message, out_file_paths
+    if in_redact_method == "Image analysis":
+        # Analyse and redact image-based pdf or image
+        if is_pdf_or_image(file_path) == False:
+            return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
+        out_file_path = process_file(file_path)
+    elif in_redact_method == "Text analysis":
+        if is_pdf(file_path) == False:
+            return "Please upload a PDF file for text analysis.", None
+        out_file_path = file_path
+    return out_message, out_file_path
+def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
+    file_path_without_ext = get_file_path_end(in_file_path)
+    out_file_paths = []
+    # Convert annotated text pdf back to image to give genuine redactions
+    print("Creating image version of results")
+    pdf_text_image_paths = process_file(out_text_file_path)
+    out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
+    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
+    out_file_paths.append(out_text_image_file_path)
+    out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."
+    return out_message, out_file_paths

tools/file_redaction.py CHANGED Viewed

@@ -7,8 +7,68 @@ from tools.file_conversion import process_file
 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
 from pikepdf import Pdf, Dictionary, Name
 from gradio import Progress
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
@@ -42,7 +102,6 @@ def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[st
         image_analyser = ImageAnalyzerEngine(nlp_analyser)
         engine = ImageRedactorEngine(image_analyser)
         if language == 'en':
             ocr_lang = 'eng'
         else: ocr_lang = language
@@ -62,26 +121,6 @@ def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[st
             )
         images.append(redacted_image)
-        # multiple inputs (variant 2)
-        # with open("name.pdf","wb") as f:
-	    # f.write(img2pdf.convert(["test1.jpg", "test2.png"]))
-        # # Create page from image
-        # pdf.add_blank_page(page_size=(redacted_image.width, redacted_image.height))
-        # page = pdf.pages[-1]
-        # page.add_image(redacted_image, 0, 0)
-        # %%
-        # Get descriptive output of results for checks - not necessary except for debugging
-        # bboxes = image_analyser.analyze(image)
-        # # %%
-        # check_df = pd.DataFrame(bboxes)[0].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
-        # check_df.columns = ["type", "start", "end", "score", "left", "top", "width", "height"]
-        # check_df.to_csv("check_df.csv")
     return images

 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
 from pikepdf import Pdf, Dictionary, Name
 from gradio import Progress
+import time
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
+from tools.helper_functions import get_file_path_end
+from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
+import gradio as gr
+def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
+    tic = time.perf_counter()
+    out_message = ''
+    out_file_paths = []
+    in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
+    if file_path:
+         file_path_without_ext = get_file_path_end(file_path)
+    else:
+         out_message = "No file selected"
+         print(out_message)
+         return out_message, out_file_paths
+    if in_redact_method == "Image analysis":
+        # Analyse and redact image-based pdf or image
+        # if is_pdf_or_image(file_path) == False:
+        #     return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
+        pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
+        out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
+        pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
+        out_file_paths.append(out_image_file_path)
+        out_message = "Image-based PDF successfully redacted and saved to file."
+    elif in_redact_method == "Text analysis":
+        if is_pdf(file_path) == False:
+            return "Please upload a PDF file for text analysis.", None
+        # Analyse text-based pdf
+        pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
+        out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
+        pdf_text.save(out_text_file_path)
+        out_file_paths.append(out_text_file_path)
+    else:
+        out_message = "No redaction method selected"
+        print(out_message)
+        return out_message, out_file_paths
+    toc = time.perf_counter()
+    out_time = f"Time taken: {toc - tic:0.1f} seconds."
+    print(out_time)
+    out_message = out_message + "\n\n" + out_time
+    return out_message, out_file_paths
 def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
         image_analyser = ImageAnalyzerEngine(nlp_analyser)
         engine = ImageRedactorEngine(image_analyser)
         if language == 'en':
             ocr_lang = 'eng'
         else: ocr_lang = language
             )
         images.append(redacted_image)
     return images