Commit
·
0f18146
1
Parent(s):
a63133d
Separated file preparation and file redaction functions. Hopefully STS endpoint access now works on AWS.
Browse files- app.py +14 -66
- tools/aws_functions.py +3 -3
- tools/file_conversion.py +58 -15
- tools/file_redaction.py +60 -21
app.py
CHANGED
@@ -1,12 +1,9 @@
|
|
1 |
-
from tools.file_redaction import
|
2 |
-
from tools.
|
3 |
-
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
4 |
from tools.aws_functions import load_data_from_aws
|
5 |
-
|
6 |
from typing import List
|
7 |
import pandas as pd
|
8 |
import gradio as gr
|
9 |
-
import time
|
10 |
|
11 |
#file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
|
12 |
|
@@ -14,66 +11,6 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
|
|
14 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
|
15 |
language = 'en'
|
16 |
|
17 |
-
def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
|
18 |
-
|
19 |
-
tic = time.perf_counter()
|
20 |
-
|
21 |
-
out_message = ''
|
22 |
-
out_file_paths = []
|
23 |
-
|
24 |
-
in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
25 |
-
|
26 |
-
if file_path:
|
27 |
-
file_path_without_ext = get_file_path_end(file_path)
|
28 |
-
else:
|
29 |
-
out_message = "No file selected"
|
30 |
-
print(out_message)
|
31 |
-
return out_message, out_file_paths
|
32 |
-
|
33 |
-
if in_redact_method == "Image analysis":
|
34 |
-
# Analyse and redact image-based pdf or image
|
35 |
-
if is_pdf_or_image(file_path) == False:
|
36 |
-
return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
|
37 |
-
|
38 |
-
pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
39 |
-
out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
|
40 |
-
pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
|
41 |
-
|
42 |
-
out_file_paths.append(out_image_file_path)
|
43 |
-
out_message = "Image-based PDF successfully redacted and saved to file."
|
44 |
-
|
45 |
-
elif in_redact_method == "Text analysis":
|
46 |
-
if is_pdf(file_path) == False:
|
47 |
-
return "Please upload a PDF file for text analysis.", None
|
48 |
-
|
49 |
-
# Analyse text-based pdf
|
50 |
-
pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
|
51 |
-
out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
|
52 |
-
pdf_text.save(out_text_file_path)
|
53 |
-
|
54 |
-
out_file_paths.append(out_text_file_path)
|
55 |
-
|
56 |
-
# Convert annotated text pdf back to image to give genuine redactions
|
57 |
-
pdf_text_image_paths = process_file(out_text_file_path)
|
58 |
-
out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
|
59 |
-
pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
|
60 |
-
|
61 |
-
out_file_paths.append(out_text_image_file_path)
|
62 |
-
|
63 |
-
out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."
|
64 |
-
|
65 |
-
else:
|
66 |
-
out_message = "No redaction method selected"
|
67 |
-
print(out_message)
|
68 |
-
return out_message, out_file_paths
|
69 |
-
|
70 |
-
toc = time.perf_counter()
|
71 |
-
out_time = f"Time taken: {toc - tic:0.1f} seconds."
|
72 |
-
print(out_time)
|
73 |
-
|
74 |
-
out_message = out_message + "\n\n" + out_time
|
75 |
-
|
76 |
-
return out_message, out_file_paths
|
77 |
|
78 |
# Create the gradio interface
|
79 |
|
@@ -81,6 +18,9 @@ block = gr.Blocks(theme = gr.themes.Base())
|
|
81 |
|
82 |
with block:
|
83 |
|
|
|
|
|
|
|
84 |
gr.Markdown(
|
85 |
"""
|
86 |
# Document redaction
|
@@ -106,6 +46,9 @@ with block:
|
|
106 |
output_summary = gr.Textbox(label="Output summary")
|
107 |
output_file = gr.File(label="Output file")
|
108 |
|
|
|
|
|
|
|
109 |
with gr.Tab(label="Advanced options"):
|
110 |
with gr.Accordion(label = "AWS data access", open = True):
|
111 |
aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
|
@@ -118,7 +61,12 @@ with block:
|
|
118 |
### Loading AWS data ###
|
119 |
load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
|
120 |
|
121 |
-
redact_btn.click(fn =
|
|
|
|
|
|
|
|
|
|
|
122 |
outputs=[output_summary, output_file], api_name="redact")
|
123 |
|
124 |
# Simple run for HF spaces or local on your computer
|
|
|
1 |
+
from tools.file_redaction import choose_and_run_redactor
|
2 |
+
from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
|
|
|
3 |
from tools.aws_functions import load_data_from_aws
|
|
|
4 |
from typing import List
|
5 |
import pandas as pd
|
6 |
import gradio as gr
|
|
|
7 |
|
8 |
#file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
|
9 |
|
|
|
11 |
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
|
12 |
language = 'en'
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
# Create the gradio interface
|
16 |
|
|
|
18 |
|
19 |
with block:
|
20 |
|
21 |
+
prepared_pdf_state = gr.State([])
|
22 |
+
output_image_files_state = gr.State([])
|
23 |
+
|
24 |
gr.Markdown(
|
25 |
"""
|
26 |
# Document redaction
|
|
|
46 |
output_summary = gr.Textbox(label="Output summary")
|
47 |
output_file = gr.File(label="Output file")
|
48 |
|
49 |
+
with gr.Row():
|
50 |
+
convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary")
|
51 |
+
|
52 |
with gr.Tab(label="Advanced options"):
|
53 |
with gr.Accordion(label = "AWS data access", open = True):
|
54 |
aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
|
|
|
61 |
### Loading AWS data ###
|
62 |
load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
|
63 |
|
64 |
+
# Two-stage redaction: the first step validates/prepares the upload and stores the
# result in prepared_pdf_state; the chained step performs the redaction itself.
# prepare_image_or_text_pdf takes (file_path, language, in_redact_method, in_allow_list),
# so the entity selection is only passed to the redaction step. Passing it here as well
# shifted every later argument into the wrong parameter.
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redact_language, in_redaction_method, in_allow_list],
                outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
                outputs=[output_summary, output_file], api_name="redact")

# Optional follow-up: turn the current output file into an image-based pdf.
# NOTE(review): this reuses api_name="redact" from the chain above - confirm the duplicate name is intended.
convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file],
                outputs=[output_summary, output_file], api_name="redact")
|
71 |
|
72 |
# Simple run for HF spaces or local on your computer
|
tools/aws_functions.py
CHANGED
@@ -6,10 +6,10 @@ import os
|
|
6 |
|
7 |
PandasDataFrame = Type[pd.DataFrame]
|
8 |
|
9 |
-
bucket_name = '
|
10 |
|
11 |
try:
|
12 |
-
session = boto3.Session(profile_name="default"
|
13 |
except Exception as e:
|
14 |
print(e)
|
15 |
|
@@ -24,7 +24,7 @@ except Exception as e:
|
|
24 |
|
25 |
|
26 |
def get_assumed_role_info():
|
27 |
-
sts = boto3.client('sts')
|
28 |
response = sts.get_caller_identity()
|
29 |
|
30 |
# Extract ARN of the assumed role
|
|
|
6 |
|
7 |
PandasDataFrame = Type[pd.DataFrame]
|
8 |
|
9 |
+
bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
|
10 |
|
11 |
try:
|
12 |
+
session = boto3.Session() # profile_name="default"
|
13 |
except Exception as e:
|
14 |
print(e)
|
15 |
|
|
|
24 |
|
25 |
|
26 |
def get_assumed_role_info():
|
27 |
+
sts = boto3.client('sts', region_name='us-west-2')
|
28 |
response = sts.get_caller_identity()
|
29 |
|
30 |
# Extract ARN of the assumed role
|
tools/file_conversion.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
|
|
2 |
from PIL import Image
|
3 |
import os
|
4 |
from gradio import Progress
|
|
|
5 |
|
6 |
def is_pdf_or_image(filename):
|
7 |
"""
|
@@ -13,7 +15,7 @@ def is_pdf_or_image(filename):
|
|
13 |
Returns:
|
14 |
bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
|
15 |
"""
|
16 |
-
if filename.lower().endswith(".pdf") or filename.lower().endswith(".jpg") or filename.lower().endswith(".png"):
|
17 |
output = True
|
18 |
else:
|
19 |
output = False
|
@@ -34,7 +36,7 @@ def is_pdf(filename):
|
|
34 |
# %%
|
35 |
## Convert pdf to image if necessary
|
36 |
|
37 |
-
def convert_pdf_to_images(pdf_path, progress=Progress(track_tqdm=True)):
|
38 |
|
39 |
# Get the number of pages in the PDF
|
40 |
page_count = pdfinfo_from_path(pdf_path)['Pages']
|
@@ -54,25 +56,14 @@ def convert_pdf_to_images(pdf_path, progress=Progress(track_tqdm=True)):
|
|
54 |
if not image:
|
55 |
break
|
56 |
|
57 |
-
# # Convert PDF to a list of images
|
58 |
-
# images = convert_from_path(pdf_path)
|
59 |
-
|
60 |
-
# images = []
|
61 |
-
|
62 |
images.extend(image)
|
63 |
|
64 |
-
# Save each image as a separate file - deprecated
|
65 |
-
#image_paths = []
|
66 |
-
# for i, image in enumerate(images):
|
67 |
-
# page_path = f"processing/page_{i+1}.png"
|
68 |
-
# image.save(page_path, "PNG")
|
69 |
-
# image_paths.append(page_path)
|
70 |
-
|
71 |
print("PDF has been converted to images.")
|
72 |
|
73 |
return images
|
74 |
|
75 |
-
|
|
|
76 |
def process_file(file_path):
|
77 |
# Get the file extension
|
78 |
file_extension = os.path.splitext(file_path)[1].lower()
|
@@ -95,3 +86,55 @@ def process_file(file_path):
|
|
95 |
|
96 |
return out_path
|
97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
+
from tools.helper_functions import get_file_path_end
|
3 |
from PIL import Image
|
4 |
import os
|
5 |
from gradio import Progress
|
6 |
+
from typing import List
|
7 |
|
8 |
def is_pdf_or_image(filename):
|
9 |
"""
|
|
|
15 |
Returns:
|
16 |
bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
|
17 |
"""
|
18 |
+
if filename.lower().endswith(".pdf") or filename.lower().endswith(".jpg") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png"):
|
19 |
output = True
|
20 |
else:
|
21 |
output = False
|
|
|
36 |
# %%
|
37 |
## Convert pdf to image if necessary
|
38 |
|
39 |
+
def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
|
40 |
|
41 |
# Get the number of pages in the PDF
|
42 |
page_count = pdfinfo_from_path(pdf_path)['Pages']
|
|
|
56 |
if not image:
|
57 |
break
|
58 |
|
|
|
|
|
|
|
|
|
|
|
59 |
images.extend(image)
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
print("PDF has been converted to images.")
|
62 |
|
63 |
return images
|
64 |
|
65 |
+
|
66 |
+
# %% Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
67 |
def process_file(file_path):
|
68 |
# Get the file extension
|
69 |
file_extension = os.path.splitext(file_path)[1].lower()
|
|
|
86 |
|
87 |
return out_path
|
88 |
|
89 |
+
def prepare_image_or_text_pdf(file_path:str, language:str, in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):
    '''
    Check that the selected file suits the chosen redaction method and prepare it for redaction.

    Args:
        file_path: Path of the file chosen by the user. A falsy value means no file was selected.
        language: Not used by this function; kept so the signature stays unchanged.
        in_redact_method: "Image analysis" or "Text analysis".
        in_allow_list: Not used by this function; kept so the signature stays unchanged.
        progress: Gradio progress tracker.

    Returns:
        A (message, prepared) tuple:
        - "Image analysis": ('', whatever process_file returns for the file).
        - "Text analysis": ('', file_path unchanged).
        - Wrong file type for the chosen method: (explanatory message, None).
        - No file, or an unrecognised method: (explanatory message, []).
    '''
    out_message = ''
    out_file_paths = []

    if not file_path:
        out_message = "No file selected"
        print(out_message)
        return out_message, out_file_paths

    if in_redact_method == "Image analysis":
        # Image analysis accepts pdfs and images; anything else is rejected up front
        if is_pdf_or_image(file_path) == False:
            return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None

        out_file_path = process_file(file_path)

    elif in_redact_method == "Text analysis":
        if is_pdf(file_path) == False:
            return "Please upload a PDF file for text analysis.", None

        # Text analysis works on the pdf directly, so no conversion is needed
        out_file_path = file_path

    else:
        # Previously this case fell through to the final return with out_file_path
        # never assigned, which raised UnboundLocalError.
        out_message = "No redaction method selected"
        print(out_message)
        return out_message, out_file_paths

    return out_message, out_file_path
|
117 |
+
|
118 |
+
|
119 |
+
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
    '''
    Convert an annotated text-based pdf into an image-based pdf so the redactions are genuine.

    Args:
        in_file_path: Path of the originally uploaded file; used to name the output file.
        out_text_file_path: Path of the annotated text pdf, or a list of output paths
            in which case the first entry is converted.

    Returns:
        A (message, output_paths) tuple. output_paths holds the path of the new image-based
        pdf, or is empty when there was nothing to convert.
    '''
    # The redaction step hands its outputs over as a list, while process_file works
    # on a single path, so reduce a list to its first entry before converting.
    if isinstance(out_text_file_path, (list, tuple)):
        text_pdf_path = out_text_file_path[0] if out_text_file_path else None
    else:
        text_pdf_path = out_text_file_path

    if not text_pdf_path:
        out_message = "No redacted PDF available to convert."
        print(out_message)
        return out_message, []

    file_path_without_ext = get_file_path_end(in_file_path)

    out_file_paths = []

    # Convert annotated text pdf back to image to give genuine redactions
    print("Creating image version of results")
    pdf_text_image_paths = process_file(text_pdf_path)
    out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"

    # The first page image writes the pdf; the remaining pages are appended to it
    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])

    out_file_paths.append(out_text_image_file_path)

    out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."

    return out_message, out_file_paths
|
135 |
+
|
136 |
+
|
137 |
+
|
138 |
+
|
139 |
+
|
140 |
+
|
tools/file_redaction.py
CHANGED
@@ -7,8 +7,68 @@ from tools.file_conversion import process_file
|
|
7 |
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
|
8 |
from pikepdf import Pdf, Dictionary, Name
|
9 |
from gradio import Progress
|
|
|
10 |
|
11 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
14 |
'''
|
@@ -42,7 +102,6 @@ def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[st
|
|
42 |
image_analyser = ImageAnalyzerEngine(nlp_analyser)
|
43 |
engine = ImageRedactorEngine(image_analyser)
|
44 |
|
45 |
-
|
46 |
if language == 'en':
|
47 |
ocr_lang = 'eng'
|
48 |
else: ocr_lang = language
|
@@ -62,26 +121,6 @@ def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[st
|
|
62 |
)
|
63 |
|
64 |
images.append(redacted_image)
|
65 |
-
|
66 |
-
# multiple inputs (variant 2)
|
67 |
-
# with open("name.pdf","wb") as f:
|
68 |
-
# f.write(img2pdf.convert(["test1.jpg", "test2.png"]))
|
69 |
-
|
70 |
-
# # Create page from image
|
71 |
-
# pdf.add_blank_page(page_size=(redacted_image.width, redacted_image.height))
|
72 |
-
# page = pdf.pages[-1]
|
73 |
-
# page.add_image(redacted_image, 0, 0)
|
74 |
-
|
75 |
-
# %%
|
76 |
-
# Get descriptive output of results for checks - not necessary except for debugging
|
77 |
-
# bboxes = image_analyser.analyze(image)
|
78 |
-
|
79 |
-
# # %%
|
80 |
-
# check_df = pd.DataFrame(bboxes)[0].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
|
81 |
-
|
82 |
-
# check_df.columns = ["type", "start", "end", "score", "left", "top", "width", "height"]
|
83 |
-
|
84 |
-
# check_df.to_csv("check_df.csv")
|
85 |
|
86 |
return images
|
87 |
|
|
|
7 |
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
|
8 |
from pikepdf import Pdf, Dictionary, Name
|
9 |
from gradio import Progress
|
10 |
+
import time
|
11 |
|
12 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
13 |
+
from tools.helper_functions import get_file_path_end
|
14 |
+
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
15 |
+
import gradio as gr
|
16 |
+
|
17 |
+
def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
    '''
    Redact a file with the chosen method and save the result under "output/".

    Args:
        file_path: Path of the file to redact. A falsy value means no file was selected.
        image_paths: Output of the preparation step. Not referenced by this function.
        language: Language code passed on to the redaction functions.
        chosen_redact_entities: Entity types to redact.
        in_redact_method: "Image analysis" or "Text analysis".
        in_allow_list: Rows of terms that should not be redacted; flattened into a single list.
        progress: Gradio progress tracker.

    Returns:
        A (message, output_paths) tuple. output_paths lists the files written, is empty when
        no file or no valid method was given, and is None when a non-pdf file is supplied
        for text analysis.
    '''
    tic = time.perf_counter()

    out_message = ''
    out_file_paths = []

    # The allow list is optional: treat a missing list as "no allowed terms"
    # instead of failing on iteration over None.
    if in_allow_list is None:
        in_allow_list_flat = []
    else:
        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

    if not file_path:
        out_message = "No file selected"
        print(out_message)
        return out_message, out_file_paths

    file_path_without_ext = get_file_path_end(file_path)

    if in_redact_method == "Image analysis":
        # Analyse and redact image-based pdf or image
        pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
        out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"

        # The first page image writes the pdf; the remaining pages are appended to it
        pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])

        out_file_paths.append(out_image_file_path)
        out_message = "Image-based PDF successfully redacted and saved to file."

    elif in_redact_method == "Text analysis":
        if is_pdf(file_path) == False:
            return "Please upload a PDF file for text analysis.", None

        # Analyse text-based pdf
        pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
        out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
        pdf_text.save(out_text_file_path)

        out_file_paths.append(out_text_file_path)
        # Without a message here the summary would start with blank lines
        out_message = "Text-based PDF successfully redacted and saved to text-based annotated file."

    else:
        out_message = "No redaction method selected"
        print(out_message)
        return out_message, out_file_paths

    toc = time.perf_counter()
    out_time = f"Time taken: {toc - tic:0.1f} seconds."
    print(out_time)

    out_message = out_message + "\n\n" + out_time

    return out_message, out_file_paths
|
71 |
+
|
72 |
|
73 |
def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
|
74 |
'''
|
|
|
102 |
image_analyser = ImageAnalyzerEngine(nlp_analyser)
|
103 |
engine = ImageRedactorEngine(image_analyser)
|
104 |
|
|
|
105 |
if language == 'en':
|
106 |
ocr_lang = 'eng'
|
107 |
else: ocr_lang = language
|
|
|
121 |
)
|
122 |
|
123 |
images.append(redacted_image)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
return images
|
126 |
|