Commit
·
f0c28d7
1
Parent(s):
e3365ed
Updated packages. Reinstated multithreading for page loading, now with page order preserved. Switched to the smaller spaCy model for speed. Textract calls should now be faster.
Browse files- requirements.txt +6 -6
- tools/aws_textract.py +3 -3
- tools/file_conversion.py +110 -123
- tools/file_redaction.py +97 -36
- tools/helper_functions.py +8 -3
- tools/load_spacy_model_custom_recognisers.py +4 -4
requirements.txt
CHANGED
@@ -7,12 +7,12 @@ presidio_anonymizer==2.2.355
|
|
7 |
presidio-image-redactor==0.0.53
|
8 |
pikepdf==8.15.1
|
9 |
pandas==2.2.3
|
10 |
-
spacy==3.
|
11 |
-
en_core_web_lg @ https://github.com/explosion/spacy
|
12 |
-
|
13 |
-
gradio==5.
|
14 |
-
boto3==1.35.
|
15 |
-
pyarrow==
|
16 |
openpyxl==3.1.2
|
17 |
Faker==22.2.0
|
18 |
gradio_image_annotation==0.2.5
|
|
|
7 |
presidio-image-redactor==0.0.53
|
8 |
pikepdf==8.15.1
|
9 |
pandas==2.2.3
|
10 |
+
spacy==3.8.3
|
11 |
+
#en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
12 |
+
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
13 |
+
gradio==5.9.0
|
14 |
+
boto3==1.35.83
|
15 |
+
pyarrow==18.1.0
|
16 |
openpyxl==3.1.2
|
17 |
Faker==22.2.0
|
18 |
gradio_image_annotation==0.2.5
|
tools/aws_textract.py
CHANGED
@@ -46,8 +46,8 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
|
|
46 |
try:
|
47 |
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
|
48 |
except Exception as e:
|
49 |
-
print("Textract call failed due to:", e, "trying again in
|
50 |
-
time.sleep(
|
51 |
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
|
52 |
else:
|
53 |
#print("Analysing document without signature detection")
|
@@ -185,7 +185,7 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
|
|
185 |
|
186 |
if recogniser_result not in handwriting:
|
187 |
handwriting.append(recogniser_result)
|
188 |
-
print("Handwriting found:", handwriting[-1])
|
189 |
|
190 |
# If handwriting or signature, add to bounding box
|
191 |
|
|
|
46 |
try:
|
47 |
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
|
48 |
except Exception as e:
|
49 |
+
print("Textract call failed due to:", e, "trying again in 3 seconds.")
|
50 |
+
time.sleep(3)
|
51 |
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
|
52 |
else:
|
53 |
#print("Analysing document without signature detection")
|
|
|
185 |
|
186 |
if recogniser_result not in handwriting:
|
187 |
handwriting.append(recogniser_result)
|
188 |
+
#print("Handwriting found:", handwriting[-1])
|
189 |
|
190 |
# If handwriting or signature, add to bounding box
|
191 |
|
tools/file_conversion.py
CHANGED
@@ -48,122 +48,112 @@ def is_pdf(filename):
|
|
48 |
|
49 |
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
# print(f"Converting page {page_num + 1}")
|
70 |
-
# image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
|
71 |
-
# dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
|
72 |
-
# image = image_l[0]
|
73 |
-
|
74 |
-
# # Convert to greyscale
|
75 |
-
# image = image.convert("L")
|
76 |
-
# image.save(out_path, format="PNG")
|
77 |
-
|
78 |
-
# return out_path
|
79 |
-
|
80 |
-
# except Exception as e:
|
81 |
-
# print(f"Error processing page {page_num + 1}: {e}")
|
82 |
-
# return None
|
83 |
-
|
84 |
-
# def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
|
85 |
-
# """
|
86 |
-
# Convert pages of a PDF to images using multithreading.
|
87 |
-
# """
|
88 |
-
# # Get the number of pages in the PDF
|
89 |
-
# page_count = pdfinfo_from_path(pdf_path)['Pages']
|
90 |
-
# print(f"Number of pages in PDF: {page_count}")
|
91 |
|
92 |
-
|
93 |
-
|
94 |
-
#
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
# print("A page failed to process.")
|
107 |
|
108 |
-
#
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
-
def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
|
112 |
|
113 |
-
|
114 |
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
|
119 |
-
|
120 |
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
|
125 |
-
|
126 |
|
127 |
-
|
128 |
|
129 |
-
|
130 |
-
|
131 |
|
132 |
-
|
133 |
-
|
134 |
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
|
140 |
-
|
141 |
-
|
142 |
|
143 |
-
|
144 |
|
145 |
-
|
146 |
-
|
147 |
|
148 |
-
|
149 |
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
|
155 |
-
|
156 |
-
|
157 |
|
158 |
-
|
159 |
|
160 |
-
|
161 |
-
|
162 |
|
163 |
-
|
164 |
|
165 |
# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
166 |
-
def process_file(file_path:str):
|
167 |
# Get the file extension
|
168 |
file_extension = os.path.splitext(file_path)[1].lower()
|
169 |
|
@@ -178,7 +168,7 @@ def process_file(file_path:str):
|
|
178 |
elif file_extension == '.pdf':
|
179 |
print(f"{file_path} is a PDF file. Converting to image set")
|
180 |
# Run your function for processing PDF files here
|
181 |
-
img_object = convert_pdf_to_images(file_path)
|
182 |
|
183 |
else:
|
184 |
print(f"{file_path} is not an image or PDF file.")
|
@@ -195,7 +185,7 @@ def get_input_file_names(file_input):
|
|
195 |
file_name_with_extension = ""
|
196 |
full_file_name = ""
|
197 |
|
198 |
-
print("file_input in input file names:", file_input)
|
199 |
if isinstance(file_input, dict):
|
200 |
file_input = os.path.abspath(file_input["name"])
|
201 |
|
@@ -222,8 +212,6 @@ def get_input_file_names(file_input):
|
|
222 |
|
223 |
all_relevant_files_str = ", ".join(all_relevant_files)
|
224 |
|
225 |
-
print("all_relevant_files_str:", all_relevant_files_str)
|
226 |
-
|
227 |
return all_relevant_files_str, file_name_with_extension, full_file_name
|
228 |
|
229 |
def prepare_image_or_pdf(
|
@@ -253,6 +241,7 @@ def prepare_image_or_pdf(
|
|
253 |
out_message (List[str]): List to store output messages.
|
254 |
first_loop_state (bool): Flag indicating if this is the first iteration.
|
255 |
number_of_pages (int): integer indicating the number of pages in the document
|
|
|
256 |
all_annotations_object(List of annotation objects): All annotations for current document
|
257 |
prepare_for_review(bool): Is this preparation step preparing pdfs and json files to review current redactions?
|
258 |
progress (Progress): Progress tracker for the operation.
|
@@ -352,11 +341,11 @@ def prepare_image_or_pdf(
|
|
352 |
if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
|
353 |
in_redact_method = tesseract_ocr_option
|
354 |
|
355 |
-
|
356 |
# If the file name ends with redactions.json, assume it is an annotations object and overwrite the current variable
|
357 |
if file_path.endswith(".json"):
|
358 |
|
359 |
if prepare_for_review == True:
|
|
|
360 |
if isinstance(file_path, str):
|
361 |
with open(file_path, 'r') as json_file:
|
362 |
all_annotations_object = json.load(json_file)
|
@@ -372,11 +361,12 @@ def prepare_image_or_pdf(
|
|
372 |
]
|
373 |
image_file_paths_pages = [int(i) for i in image_file_paths_pages]
|
374 |
|
375 |
-
|
376 |
-
# If PDF pages have been converted to image files, replace the current image paths in the json to this
|
377 |
if image_file_paths:
|
|
|
378 |
for i, annotation in enumerate(all_annotations_object):
|
379 |
annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
|
|
|
380 |
|
381 |
# Check if the annotation page number exists in the image file paths pages
|
382 |
if annotation_page_number in image_file_paths_pages:
|
@@ -385,7 +375,7 @@ def prepare_image_or_pdf(
|
|
385 |
correct_image_page = annotation_page_number
|
386 |
annotation["image"] = image_file_paths[correct_image_page]
|
387 |
else:
|
388 |
-
print("Page not found.")
|
389 |
|
390 |
#print("all_annotations_object:", all_annotations_object)
|
391 |
|
@@ -404,30 +394,24 @@ def prepare_image_or_pdf(
|
|
404 |
json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
405 |
continue
|
406 |
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
elif in_redact_method == text_ocr_option:
|
424 |
-
if is_pdf(file_path) == False:
|
425 |
-
out_message = "Please upload a PDF file for text analysis."
|
426 |
-
print(out_message)
|
427 |
-
return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
|
428 |
-
|
429 |
converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
|
430 |
-
image_file_path = process_file(file_path)
|
431 |
|
432 |
converted_file_paths.append(converted_file_path)
|
433 |
image_file_paths.extend(image_file_path)
|
@@ -453,7 +437,10 @@ def prepare_image_or_pdf(
|
|
453 |
out_message.append(out_time)
|
454 |
out_message_out = '\n'.join(out_message)
|
455 |
|
|
|
456 |
number_of_pages = len(image_file_paths)
|
|
|
|
|
457 |
|
458 |
|
459 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
|
|
|
48 |
|
49 |
|
50 |
|
51 |
+
def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> tuple[int, str]:
|
52 |
+
try:
|
53 |
+
out_path = f"{pdf_path}_{page_num}.png"
|
54 |
+
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
55 |
+
if os.path.exists(out_path):
|
56 |
+
print(f"Loading existing image for page {page_num + 1}")
|
57 |
+
image = Image.open(out_path)
|
58 |
+
else:
|
59 |
+
print(f"Converting page {page_num + 1}")
|
60 |
+
image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
|
61 |
+
dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
|
62 |
+
image = image_l[0]
|
63 |
+
image = image.convert("L")
|
64 |
+
image.save(out_path, format="PNG")
|
65 |
+
return page_num, out_path
|
66 |
+
except Exception as e:
|
67 |
+
print(f"Error processing page {page_num + 1}: {e}")
|
68 |
+
return page_num, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
+
def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
|
71 |
+
|
72 |
+
# NOTE: the review-only shortcut (page_count = 1) is currently disabled, so both branches convert every page
|
73 |
+
if prepare_for_review == True:
|
74 |
+
page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
|
75 |
+
else:
|
76 |
+
page_count = pdfinfo_from_path(pdf_path)['Pages']
|
77 |
+
|
78 |
+
print(f"Number of pages in PDF: {page_count}")
|
79 |
+
|
80 |
+
results = []
|
81 |
+
with ThreadPoolExecutor(max_workers=num_threads) as executor:
|
82 |
+
futures = []
|
83 |
+
for page_num in range(page_min, page_count):
|
84 |
+
futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
|
85 |
|
86 |
+
for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
|
87 |
+
page_num, result = future.result()
|
88 |
+
if result:
|
89 |
+
results.append((page_num, result))
|
90 |
+
else:
|
91 |
+
print(f"Page {page_num + 1} failed to process.")
|
|
|
92 |
|
93 |
+
# Sort results by page number
|
94 |
+
results.sort(key=lambda x: x[0])
|
95 |
+
images = [result[1] for result in results]
|
96 |
+
|
97 |
+
print("PDF has been converted to images.")
|
98 |
+
return images
|
99 |
+
|
100 |
|
101 |
+
# def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
|
102 |
|
103 |
+
# print("pdf_path in convert_pdf_to_images:", pdf_path)
|
104 |
|
105 |
+
# # Get the number of pages in the PDF
|
106 |
+
# page_count = pdfinfo_from_path(pdf_path)['Pages']
|
107 |
+
# print("Number of pages in PDF: ", str(page_count))
|
108 |
|
109 |
+
# images = []
|
110 |
|
111 |
+
# # Open the PDF file
|
112 |
+
# #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
|
113 |
+
# for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
|
114 |
|
115 |
+
# #print("page_num in convert_pdf_to_images:", page_num)
|
116 |
|
117 |
+
# print("Converting page: ", str(page_num + 1))
|
118 |
|
119 |
+
# # Convert one page to image
|
120 |
+
# out_path = pdf_path + "_" + str(page_num) + ".png"
|
121 |
|
122 |
+
# # Ensure the directory exists
|
123 |
+
# os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
124 |
|
125 |
+
# # Check if the image already exists
|
126 |
+
# if os.path.exists(out_path):
|
127 |
+
# #print(f"Loading existing image from {out_path}.")
|
128 |
+
# image = Image.open(out_path) # Load the existing image
|
129 |
|
130 |
+
# else:
|
131 |
+
# image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
|
132 |
|
133 |
+
# image = image_l[0]
|
134 |
|
135 |
+
# # Convert to greyscale
|
136 |
+
# image = image.convert("L")
|
137 |
|
138 |
+
# image.save(out_path, format="PNG") # Save the new image
|
139 |
|
140 |
+
# # If no images are returned, break the loop
|
141 |
+
# if not image:
|
142 |
+
# print("Conversion of page", str(page_num), "to file failed.")
|
143 |
+
# break
|
144 |
|
145 |
+
# # print("Conversion of page", str(page_num), "to file succeeded.")
|
146 |
+
# # print("image:", image)
|
147 |
|
148 |
+
# images.append(out_path)
|
149 |
|
150 |
+
# print("PDF has been converted to images.")
|
151 |
+
# # print("Images:", images)
|
152 |
|
153 |
+
# return images
|
154 |
|
155 |
# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
156 |
+
def process_file(file_path:str, prepare_for_review:bool=False):
|
157 |
# Get the file extension
|
158 |
file_extension = os.path.splitext(file_path)[1].lower()
|
159 |
|
|
|
168 |
elif file_extension == '.pdf':
|
169 |
print(f"{file_path} is a PDF file. Converting to image set")
|
170 |
# Run your function for processing PDF files here
|
171 |
+
img_object = convert_pdf_to_images(file_path, prepare_for_review)
|
172 |
|
173 |
else:
|
174 |
print(f"{file_path} is not an image or PDF file.")
|
|
|
185 |
file_name_with_extension = ""
|
186 |
full_file_name = ""
|
187 |
|
188 |
+
#print("file_input in input file names:", file_input)
|
189 |
if isinstance(file_input, dict):
|
190 |
file_input = os.path.abspath(file_input["name"])
|
191 |
|
|
|
212 |
|
213 |
all_relevant_files_str = ", ".join(all_relevant_files)
|
214 |
|
|
|
|
|
215 |
return all_relevant_files_str, file_name_with_extension, full_file_name
|
216 |
|
217 |
def prepare_image_or_pdf(
|
|
|
241 |
out_message (List[str]): List to store output messages.
|
242 |
first_loop_state (bool): Flag indicating if this is the first iteration.
|
243 |
number_of_pages (int): integer indicating the number of pages in the document
|
244 |
+
current_loop_page_number (int): Current number of loop
|
245 |
all_annotations_object(List of annotation objects): All annotations for current document
|
246 |
prepare_for_review(bool): Is this preparation step preparing pdfs and json files to review current redactions?
|
247 |
progress (Progress): Progress tracker for the operation.
|
|
|
341 |
if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
|
342 |
in_redact_method = tesseract_ocr_option
|
343 |
|
|
|
344 |
# If the file name ends with redactions.json, assume it is an annotations object and overwrite the current variable
|
345 |
if file_path.endswith(".json"):
|
346 |
|
347 |
if prepare_for_review == True:
|
348 |
+
print("Preparing file for review")
|
349 |
if isinstance(file_path, str):
|
350 |
with open(file_path, 'r') as json_file:
|
351 |
all_annotations_object = json.load(json_file)
|
|
|
361 |
]
|
362 |
image_file_paths_pages = [int(i) for i in image_file_paths_pages]
|
363 |
|
364 |
+
# If PDF pages have been converted to image files, replace the current image paths in the json to this.
|
|
|
365 |
if image_file_paths:
|
366 |
+
|
367 |
for i, annotation in enumerate(all_annotations_object):
|
368 |
annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
|
369 |
+
#print("Annotation page number:", annotation_page_number)
|
370 |
|
371 |
# Check if the annotation page number exists in the image file paths pages
|
372 |
if annotation_page_number in image_file_paths_pages:
|
|
|
375 |
correct_image_page = annotation_page_number
|
376 |
annotation["image"] = image_file_paths[correct_image_page]
|
377 |
else:
|
378 |
+
print("Page", annotation_page_number, "image file not found.")
|
379 |
|
380 |
#print("all_annotations_object:", all_annotations_object)
|
381 |
|
|
|
394 |
json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
395 |
continue
|
396 |
|
397 |
+
# Must be a pdf or image at this point
|
398 |
+
else:
|
399 |
+
|
400 |
+
# Convert pdf/image file to correct format for redaction
|
401 |
+
if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
|
402 |
+
if is_pdf_or_image(file_path) == False:
|
403 |
+
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
404 |
+
print(out_message)
|
405 |
+
return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
|
406 |
+
|
407 |
+
elif in_redact_method == text_ocr_option:
|
408 |
+
if is_pdf(file_path) == False:
|
409 |
+
out_message = "Please upload a PDF file for text analysis."
|
410 |
+
print(out_message)
|
411 |
+
return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
|
412 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
413 |
converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
|
414 |
+
image_file_path = process_file(file_path, prepare_for_review)
|
415 |
|
416 |
converted_file_paths.append(converted_file_path)
|
417 |
image_file_paths.extend(image_file_path)
|
|
|
437 |
out_message.append(out_time)
|
438 |
out_message_out = '\n'.join(out_message)
|
439 |
|
440 |
+
if prepare_for_review == False:
|
441 |
number_of_pages = len(image_file_paths)
|
442 |
+
else:
|
443 |
+
number_of_pages = len(all_annotations_object)
|
444 |
|
445 |
|
446 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
|
tools/file_redaction.py
CHANGED
@@ -689,8 +689,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
689 |
merged_bboxes = []
|
690 |
grouped_bboxes = defaultdict(list)
|
691 |
|
692 |
-
print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
|
693 |
-
|
694 |
# Process signature and handwriting results
|
695 |
if signature_recogniser_results or handwriting_recogniser_results:
|
696 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
@@ -906,6 +904,30 @@ def redact_image_pdf(file_path:str,
|
|
906 |
if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
|
907 |
elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
|
908 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
909 |
if current_loop_page == 0: page_loop_start = 0
|
910 |
else: page_loop_start = current_loop_page
|
911 |
|
@@ -919,7 +941,7 @@ def redact_image_pdf(file_path:str,
|
|
919 |
page_break_return = False
|
920 |
|
921 |
reported_page_number = str(page_no + 1)
|
922 |
-
print("Redacting page:", reported_page_number)
|
923 |
|
924 |
# Assuming prepared_pdf_file_paths[page_no] is a PIL image object
|
925 |
try:
|
@@ -962,49 +984,72 @@ def redact_image_pdf(file_path:str,
|
|
962 |
image_buffer = io.BytesIO()
|
963 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
964 |
pdf_page_as_bytes = image_buffer.getvalue()
|
965 |
-
|
966 |
-
|
967 |
-
json_file_path = output_folder + file_name + "_textract.json"
|
968 |
-
|
969 |
-
if not os.path.exists(json_file_path):
|
970 |
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
971 |
logging_file_paths.append(json_file_path)
|
972 |
request_metadata = request_metadata + "\n" + new_request_metadata
|
973 |
|
974 |
-
|
975 |
|
976 |
-
# Write the updated existing_data back to the JSON file
|
977 |
-
with open(json_file_path, 'w') as json_file:
|
978 |
-
json.dump(wrapped_text_blocks, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
979 |
else:
|
980 |
-
#
|
981 |
-
|
982 |
-
with open(json_file_path, 'r') as json_file:
|
983 |
-
existing_data = json.load(json_file)
|
984 |
|
985 |
-
|
986 |
-
|
|
|
987 |
|
988 |
-
|
989 |
-
|
990 |
-
|
991 |
|
992 |
-
|
993 |
-
|
994 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
995 |
|
996 |
-
|
997 |
-
existing_data["pages"].append(text_blocks)
|
998 |
|
999 |
-
# Write the updated existing_data back to the JSON file
|
1000 |
-
with open(json_file_path, 'w') as json_file:
|
1001 |
-
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1002 |
|
1003 |
-
|
1004 |
-
|
1005 |
-
|
1006 |
-
|
1007 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1008 |
|
1009 |
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1010 |
|
@@ -1124,6 +1169,11 @@ def redact_image_pdf(file_path:str,
|
|
1124 |
|
1125 |
annotations_all_pages.append(image_annotations)
|
1126 |
|
|
|
|
|
|
|
|
|
|
|
1127 |
current_loop_page += 1
|
1128 |
|
1129 |
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
@@ -1142,7 +1192,18 @@ def redact_image_pdf(file_path:str,
|
|
1142 |
progress.close(_tqdm=progress_bar)
|
1143 |
tqdm._instances.clear()
|
1144 |
|
|
|
|
|
|
|
|
|
|
|
1145 |
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
|
|
|
|
|
|
|
|
|
|
|
|
1146 |
|
1147 |
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1148 |
|
@@ -1675,8 +1736,8 @@ def redact_text_pdf(
|
|
1675 |
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)
|
1676 |
|
1677 |
#print("Did redact_page_with_pymupdf function")
|
1678 |
-
|
1679 |
-
print("For page number:",
|
1680 |
|
1681 |
# Write logs
|
1682 |
# Create decision process table
|
|
|
689 |
merged_bboxes = []
|
690 |
grouped_bboxes = defaultdict(list)
|
691 |
|
|
|
|
|
692 |
# Process signature and handwriting results
|
693 |
if signature_recogniser_results or handwriting_recogniser_results:
|
694 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
|
|
904 |
if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
|
905 |
elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
|
906 |
|
907 |
+
# If running Textract, check if file already exists. If it does, load in existing data
|
908 |
+
# Import results from json and convert
|
909 |
+
if analysis_type == textract_option:
|
910 |
+
|
911 |
+
json_file_path = output_folder + file_name + "_textract.json"
|
912 |
+
logging_file_paths.append(json_file_path)
|
913 |
+
|
914 |
+
if not os.path.exists(json_file_path):
|
915 |
+
no_textract_file = True
|
916 |
+
print("No existing Textract results file found.")
|
917 |
+
existing_data = {}
|
918 |
+
#text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
919 |
+
#logging_file_paths.append(json_file_path)
|
920 |
+
#request_metadata = request_metadata + "\n" + new_request_metadata
|
921 |
+
#wrapped_text_blocks = {"pages":[text_blocks]}
|
922 |
+
else:
|
923 |
+
# Open the file and load the JSON data
|
924 |
+
no_textract_file = False
|
925 |
+
print("Found existing Textract json results file.")
|
926 |
+
with open(json_file_path, 'r') as json_file:
|
927 |
+
existing_data = json.load(json_file)
|
928 |
+
|
929 |
+
###
|
930 |
+
|
931 |
if current_loop_page == 0: page_loop_start = 0
|
932 |
else: page_loop_start = current_loop_page
|
933 |
|
|
|
941 |
page_break_return = False
|
942 |
|
943 |
reported_page_number = str(page_no + 1)
|
944 |
+
#print("Redacting page:", reported_page_number)
|
945 |
|
946 |
# Assuming prepared_pdf_file_paths[page_no] is a PIL image object
|
947 |
try:
|
|
|
984 |
image_buffer = io.BytesIO()
|
985 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
986 |
pdf_page_as_bytes = image_buffer.getvalue()
|
987 |
+
|
988 |
+
if not existing_data:
|
|
|
|
|
|
|
989 |
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
990 |
logging_file_paths.append(json_file_path)
|
991 |
request_metadata = request_metadata + "\n" + new_request_metadata
|
992 |
|
993 |
+
existing_data = {"pages":[text_blocks]}
|
994 |
|
|
|
|
|
|
|
995 |
else:
|
996 |
+
# Check if the current reported_page_number exists in the loaded JSON
|
997 |
+
page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
|
|
|
|
|
998 |
|
999 |
+
if not page_exists: # If the page does not exist, analyze again
|
1000 |
+
print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
|
1001 |
+
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1002 |
|
1003 |
+
# Check if "pages" key exists, if not, initialize it as an empty list
|
1004 |
+
if "pages" not in existing_data:
|
1005 |
+
existing_data["pages"] = []
|
1006 |
|
1007 |
+
# Append the new page data
|
1008 |
+
existing_data["pages"].append(text_blocks)
|
1009 |
+
|
1010 |
+
request_metadata = request_metadata + "\n" + new_request_metadata
|
1011 |
+
else:
|
1012 |
+
# If the page exists, retrieve the data
|
1013 |
+
text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
|
1014 |
+
|
1015 |
+
|
1016 |
+
# if not os.path.exists(json_file_path):
|
1017 |
+
# text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1018 |
+
# logging_file_paths.append(json_file_path)
|
1019 |
+
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1020 |
|
1021 |
+
# existing_data = {"pages":[text_blocks]}
|
|
|
1022 |
|
|
|
|
|
|
|
1023 |
|
1024 |
+
# else:
|
1025 |
+
# # Open the file and load the JSON data
|
1026 |
+
# print("Found existing Textract json results file.")
|
1027 |
+
# with open(json_file_path, 'r') as json_file:
|
1028 |
+
# existing_data = json.load(json_file)
|
1029 |
+
|
1030 |
+
# # Check if the current reported_page_number exists in the loaded JSON
|
1031 |
+
# page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
|
1032 |
+
|
1033 |
+
# if not page_exists: # If the page does not exist, analyze again
|
1034 |
+
# print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
|
1035 |
+
# text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1036 |
+
|
1037 |
+
# # Check if "pages" key exists, if not, initialize it as an empty list
|
1038 |
+
# if "pages" not in existing_data:
|
1039 |
+
# existing_data["pages"] = []
|
1040 |
+
|
1041 |
+
# # Append the new page data
|
1042 |
+
# existing_data["pages"].append(text_blocks)
|
1043 |
+
|
1044 |
+
# # Write the updated existing_data back to the JSON file
|
1045 |
+
# with open(json_file_path, 'w') as json_file:
|
1046 |
+
# json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1047 |
+
|
1048 |
+
# logging_file_paths.append(json_file_path)
|
1049 |
+
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1050 |
+
# else:
|
1051 |
+
# # If the page exists, retrieve the data
|
1052 |
+
# text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
|
1053 |
|
1054 |
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1055 |
|
|
|
1169 |
|
1170 |
annotations_all_pages.append(image_annotations)
|
1171 |
|
1172 |
+
if analysis_type == textract_option:
|
1173 |
+
# Write the updated existing textract data back to the JSON file
|
1174 |
+
with open(json_file_path, 'w') as json_file:
|
1175 |
+
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1176 |
+
|
1177 |
current_loop_page += 1
|
1178 |
|
1179 |
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
|
|
1192 |
progress.close(_tqdm=progress_bar)
|
1193 |
tqdm._instances.clear()
|
1194 |
|
1195 |
+
if analysis_type == textract_option:
|
1196 |
+
# Write the updated existing textract data back to the JSON file
|
1197 |
+
with open(json_file_path, 'w') as json_file:
|
1198 |
+
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1199 |
+
|
1200 |
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1201 |
+
|
1202 |
+
if analysis_type == textract_option:
|
1203 |
+
# Write the updated existing textract data back to the JSON file
|
1204 |
+
|
1205 |
+
with open(json_file_path, 'w') as json_file:
|
1206 |
+
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1207 |
|
1208 |
return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1209 |
|
|
|
1736 |
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)
|
1737 |
|
1738 |
#print("Did redact_page_with_pymupdf function")
|
1739 |
+
reported_page_no = page_no + 1
|
1740 |
+
print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
|
1741 |
|
1742 |
# Write logs
|
1743 |
# Create decision process table
|
tools/helper_functions.py
CHANGED
@@ -31,9 +31,9 @@ def get_or_create_env_var(var_name, default_value):
|
|
31 |
|
32 |
|
33 |
# Names for options labels
|
34 |
-
text_ocr_option = "
|
35 |
-
tesseract_ocr_option = "OCR
|
36 |
-
textract_option = "
|
37 |
|
38 |
local_pii_detector = "Local"
|
39 |
aws_pii_detector = "AWS Comprehend"
|
@@ -263,6 +263,11 @@ async def get_connection_params(request: gr.Request):
|
|
263 |
base_folder = "user-files/"
|
264 |
print("Cognito ID found:", out_session_hash)
|
265 |
|
|
|
|
|
|
|
|
|
|
|
266 |
else:
|
267 |
out_session_hash = request.session_hash
|
268 |
base_folder = "temp-files/"
|
|
|
31 |
|
32 |
|
33 |
# Names for options labels
|
34 |
+
text_ocr_option = "Local model - selectable text"
|
35 |
+
tesseract_ocr_option = "Local OCR model - PDFs without selectable text"
|
36 |
+
textract_option = "AWS Textract service - all PDF types"
|
37 |
|
38 |
local_pii_detector = "Local"
|
39 |
aws_pii_detector = "AWS Comprehend"
|
|
|
263 |
base_folder = "user-files/"
|
264 |
print("Cognito ID found:", out_session_hash)
|
265 |
|
266 |
+
elif 'x-amzn-oidc-identity' in request.headers:
|
267 |
+
out_session_hash = request.headers['x-amzn-oidc-identity']
|
268 |
+
base_folder = "user-files/"
|
269 |
+
print("Cognito ID found:", out_session_hash)
|
270 |
+
|
271 |
else:
|
272 |
out_session_hash = request.session_hash
|
273 |
base_folder = "temp-files/"
|
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -8,20 +8,20 @@ from spacy.cli.download import download
|
|
8 |
import re
|
9 |
|
10 |
# %%
|
11 |
-
model_name = "
|
12 |
score_threshold = 0.001
|
13 |
custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
|
14 |
|
15 |
#Load spacy model
|
16 |
try:
|
17 |
-
import
|
18 |
-
nlp =
|
19 |
print("Successfully imported spaCy model")
|
20 |
|
21 |
except:
|
22 |
download(model_name)
|
23 |
nlp = spacy.load(model_name)
|
24 |
-
print("Successfully downloaded and imported spaCy model")
|
25 |
|
26 |
# #### Custom recognisers
|
27 |
# Allow user to create their own recogniser
|
|
|
8 |
import re
|
9 |
|
10 |
# %%
|
11 |
+
model_name = "en_core_web_sm" #"en_core_web_trf"
|
12 |
score_threshold = 0.001
|
13 |
custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
|
14 |
|
15 |
#Load spacy model
|
16 |
try:
|
17 |
+
import en_core_web_sm
|
18 |
+
nlp = en_core_web_sm.load()
|
19 |
print("Successfully imported spaCy model")
|
20 |
|
21 |
except:
|
22 |
download(model_name)
|
23 |
nlp = spacy.load(model_name)
|
24 |
+
print("Successfully downloaded and imported spaCy model", model_name)
|
25 |
|
26 |
# #### Custom recognisers
|
27 |
# Allow user to create their own recogniser
|