Commit
·
9504619
1
Parent(s):
b8e245f
Multithreaded file preparation. Can call Textract without signature detection
Browse files
- app.py +1 -1
- tools/aws_textract.py +13 -5
- tools/file_conversion.py +103 -35
- tools/file_redaction.py +5 -4
app.py
CHANGED
@@ -263,7 +263,7 @@ with app:
|
|
263 |
|
264 |
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
|
265 |
|
266 |
-
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting"
|
267 |
#with gr.Row():
|
268 |
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
|
269 |
|
|
|
263 |
|
264 |
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
|
265 |
|
266 |
+
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting"])
|
267 |
#with gr.Row():
|
268 |
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
|
269 |
|
tools/aws_textract.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
import boto3
|
2 |
-
from PIL import Image
|
|
|
3 |
import io
|
4 |
-
import json
|
5 |
import pikepdf
|
6 |
# Example: converting this single page to an image
|
7 |
-
from pdf2image import convert_from_bytes
|
8 |
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
|
9 |
|
10 |
def extract_textract_metadata(response):
|
@@ -23,7 +24,7 @@ def extract_textract_metadata(response):
|
|
23 |
#'NumberOfPages': number_of_pages
|
24 |
})
|
25 |
|
26 |
-
def analyse_page_with_textract(pdf_page_bytes, page_no, client=""):
|
27 |
'''
|
28 |
Analyse page with AWS Textract
|
29 |
'''
|
@@ -36,7 +37,14 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client=""):
|
|
36 |
|
37 |
print("Analysing page with AWS Textract")
|
38 |
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
# Wrap the response with the page number in the desired format
|
42 |
wrapped_response = {
|
|
|
1 |
import boto3
|
2 |
+
#from PIL import Image
|
3 |
+
from typing import List
|
4 |
import io
|
5 |
+
#import json
|
6 |
import pikepdf
|
7 |
# Example: converting this single page to an image
|
8 |
+
#from pdf2image import convert_from_bytes
|
9 |
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
|
10 |
|
11 |
def extract_textract_metadata(response):
|
|
|
24 |
#'NumberOfPages': number_of_pages
|
25 |
})
|
26 |
|
27 |
+
def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"]):
|
28 |
'''
|
29 |
Analyse page with AWS Textract
|
30 |
'''
|
|
|
37 |
|
38 |
print("Analysing page with AWS Textract")
|
39 |
|
40 |
+
# Redact signatures if specified
|
41 |
+
if "Redact all identified signatures" in handwrite_signature_checkbox:
|
42 |
+
print("Analysing document with signature detection")
|
43 |
+
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
|
44 |
+
else:
|
45 |
+
print("Analysing document without signature detection")
|
46 |
+
# Call detect_document_text to extract plain text
|
47 |
+
response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
|
48 |
|
49 |
# Wrap the response with the page number in the desired format
|
50 |
wrapped_response = {
|
tools/file_conversion.py
CHANGED
@@ -2,7 +2,6 @@ from pdf2image import convert_from_path, pdfinfo_from_path
|
|
2 |
from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
3 |
from PIL import Image, ImageFile
|
4 |
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
5 |
-
|
6 |
import os
|
7 |
import re
|
8 |
import gradio as gr
|
@@ -12,6 +11,7 @@ import pymupdf
|
|
12 |
from tqdm import tqdm
|
13 |
from gradio import Progress
|
14 |
from typing import List, Optional
|
|
|
15 |
|
16 |
image_dpi = 300.0
|
17 |
|
@@ -46,61 +46,129 @@ def is_pdf(filename):
|
|
46 |
# %%
|
47 |
## Convert pdf to image if necessary
|
48 |
|
49 |
-
def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
|
50 |
|
51 |
-
print("pdf_path in convert_pdf_to_images:", pdf_path)
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
# Get the number of pages in the PDF
|
54 |
page_count = pdfinfo_from_path(pdf_path)['Pages']
|
55 |
-
print("Number of pages in PDF: ", str(page_count))
|
56 |
|
57 |
images = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
62 |
|
63 |
-
print("page_num in convert_pdf_to_images:", page_num)
|
64 |
-
|
65 |
-
print("Converting page: ", str(page_num + 1))
|
66 |
|
67 |
-
|
68 |
-
out_path = pdf_path + "_" + str(page_num) + ".png"
|
69 |
-
|
70 |
-
# Ensure the directory exists
|
71 |
-
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
72 |
|
73 |
-
|
74 |
-
if os.path.exists(out_path):
|
75 |
-
#print(f"Loading existing image from {out_path}.")
|
76 |
-
image = Image.open(out_path) # Load the existing image
|
77 |
|
|
|
|
|
|
|
78 |
|
|
|
79 |
|
80 |
-
|
81 |
-
|
|
|
82 |
|
83 |
-
|
|
|
|
|
84 |
|
85 |
-
|
86 |
-
|
|
|
|
|
|
|
87 |
|
88 |
-
|
|
|
|
|
|
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
print("Conversion of page", str(page_num), "to file failed.")
|
93 |
-
break
|
94 |
|
95 |
-
|
96 |
-
# print("image:", image)
|
97 |
|
98 |
-
|
|
|
99 |
|
100 |
-
|
101 |
-
# print("Images:", images)
|
102 |
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
106 |
def process_file(file_path:str):
|
|
|
2 |
from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
3 |
from PIL import Image, ImageFile
|
4 |
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
|
|
5 |
import os
|
6 |
import re
|
7 |
import gradio as gr
|
|
|
11 |
from tqdm import tqdm
|
12 |
from gradio import Progress
|
13 |
from typing import List, Optional
|
14 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
15 |
|
16 |
image_dpi = 300.0
|
17 |
|
|
|
46 |
# %%
|
47 |
## Convert pdf to image if necessary
|
48 |
|
|
|
49 |
|
|
|
50 |
|
51 |
+
def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> str:
|
52 |
+
"""
|
53 |
+
Convert a single page of a PDF to an image and save it as a PNG.
|
54 |
+
Returns the path to the saved image.
|
55 |
+
"""
|
56 |
+
try:
|
57 |
+
out_path = f"{pdf_path}_{page_num}.png"
|
58 |
+
|
59 |
+
# Ensure the directory exists
|
60 |
+
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
61 |
+
|
62 |
+
# Check if the image already exists
|
63 |
+
if os.path.exists(out_path):
|
64 |
+
# Load the existing image
|
65 |
+
print(f"Loading existing image for page {page_num + 1}")
|
66 |
+
image = Image.open(out_path)
|
67 |
+
else:
|
68 |
+
# Convert the page to an image
|
69 |
+
print(f"Converting page {page_num + 1}")
|
70 |
+
image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
|
71 |
+
dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
|
72 |
+
image = image_l[0]
|
73 |
+
|
74 |
+
# Convert to greyscale
|
75 |
+
image = image.convert("L")
|
76 |
+
image.save(out_path, format="PNG")
|
77 |
+
|
78 |
+
return out_path
|
79 |
+
|
80 |
+
except Exception as e:
|
81 |
+
print(f"Error processing page {page_num + 1}: {e}")
|
82 |
+
return None
|
83 |
+
|
84 |
+
def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
|
85 |
+
"""
|
86 |
+
Convert pages of a PDF to images using multithreading.
|
87 |
+
"""
|
88 |
# Get the number of pages in the PDF
|
89 |
page_count = pdfinfo_from_path(pdf_path)['Pages']
|
90 |
+
print(f"Number of pages in PDF: {page_count}")
|
91 |
|
92 |
images = []
|
93 |
+
|
94 |
+
# Use ThreadPoolExecutor to process pages in parallel
|
95 |
+
with ThreadPoolExecutor(max_workers=num_threads) as executor:
|
96 |
+
futures = []
|
97 |
+
for page_num in range(page_min, page_count):
|
98 |
+
futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
|
99 |
+
|
100 |
+
# Display progress using tqdm
|
101 |
+
for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
|
102 |
+
result = future.result()
|
103 |
+
if result:
|
104 |
+
images.append(result)
|
105 |
+
else:
|
106 |
+
print("A page failed to process.")
|
107 |
+
|
108 |
+
print("PDF has been converted to images.")
|
109 |
+
return images
|
110 |
|
111 |
+
# Example usage
|
112 |
+
if __name__ == "__main__":
|
113 |
+
pdf_path = "example.pdf"
|
114 |
+
image_dpi = 200
|
115 |
+
output_images = convert_pdf_to_images(pdf_path, image_dpi=image_dpi, num_threads=8)
|
116 |
+
print("Images saved:", output_images)
|
117 |
|
|
|
|
|
|
|
118 |
|
119 |
+
# def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
|
|
|
|
|
|
|
|
|
120 |
|
121 |
+
# print("pdf_path in convert_pdf_to_images:", pdf_path)
|
|
|
|
|
|
|
122 |
|
123 |
+
# # Get the number of pages in the PDF
|
124 |
+
# page_count = pdfinfo_from_path(pdf_path)['Pages']
|
125 |
+
# print("Number of pages in PDF: ", str(page_count))
|
126 |
|
127 |
+
# images = []
|
128 |
|
129 |
+
# # Open the PDF file
|
130 |
+
# #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
|
131 |
+
# for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
|
132 |
|
133 |
+
# #print("page_num in convert_pdf_to_images:", page_num)
|
134 |
+
|
135 |
+
# print("Converting page: ", str(page_num + 1))
|
136 |
|
137 |
+
# # Convert one page to image
|
138 |
+
# out_path = pdf_path + "_" + str(page_num) + ".png"
|
139 |
+
|
140 |
+
# # Ensure the directory exists
|
141 |
+
# os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
142 |
|
143 |
+
# # Check if the image already exists
|
144 |
+
# if os.path.exists(out_path):
|
145 |
+
# #print(f"Loading existing image from {out_path}.")
|
146 |
+
# image = Image.open(out_path) # Load the existing image
|
147 |
|
148 |
+
# else:
|
149 |
+
# image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
|
|
|
|
|
150 |
|
151 |
+
# image = image_l[0]
|
|
|
152 |
|
153 |
+
# # Convert to greyscale
|
154 |
+
# image = image.convert("L")
|
155 |
|
156 |
+
# image.save(out_path, format="PNG") # Save the new image
|
|
|
157 |
|
158 |
+
# # If no images are returned, break the loop
|
159 |
+
# if not image:
|
160 |
+
# print("Conversion of page", str(page_num), "to file failed.")
|
161 |
+
# break
|
162 |
+
|
163 |
+
# # print("Conversion of page", str(page_num), "to file succeeded.")
|
164 |
+
# # print("image:", image)
|
165 |
+
|
166 |
+
# images.append(out_path)
|
167 |
+
|
168 |
+
# print("PDF has been converted to images.")
|
169 |
+
# # print("Images:", images)
|
170 |
+
|
171 |
+
# return images
|
172 |
|
173 |
# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
174 |
def process_file(file_path:str):
|
tools/file_redaction.py
CHANGED
@@ -36,7 +36,7 @@ from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
|
36 |
page_break_value = get_or_create_env_var('page_break_value', '500')
|
37 |
print(f'The value of page_break_value is {page_break_value}')
|
38 |
|
39 |
-
max_time_value = get_or_create_env_var('max_time_value', '
|
40 |
print(f'The value of max_time_value is {max_time_value}')
|
41 |
|
42 |
def sum_numbers_before_seconds(string:str):
|
@@ -689,7 +689,8 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
689 |
merged_bboxes = []
|
690 |
grouped_bboxes = defaultdict(list)
|
691 |
|
692 |
-
|
|
|
693 |
# Process signature and handwriting results
|
694 |
if signature_recogniser_results or handwriting_recogniser_results:
|
695 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
@@ -954,7 +955,7 @@ def redact_image_pdf(file_path:str,
|
|
954 |
json_file_path = output_folder + file_name + "_textract.json"
|
955 |
|
956 |
if not os.path.exists(json_file_path):
|
957 |
-
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client) # Analyse page with Textract
|
958 |
logging_file_paths.append(json_file_path)
|
959 |
request_metadata = request_metadata + "\n" + new_request_metadata
|
960 |
|
@@ -974,7 +975,7 @@ def redact_image_pdf(file_path:str,
|
|
974 |
|
975 |
if not page_exists: # If the page does not exist, analyze again
|
976 |
print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
|
977 |
-
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number) # Analyse page with Textract
|
978 |
|
979 |
# Check if "pages" key exists, if not, initialize it as an empty list
|
980 |
if "pages" not in existing_data:
|
|
|
36 |
page_break_value = get_or_create_env_var('page_break_value', '500')
|
37 |
print(f'The value of page_break_value is {page_break_value}')
|
38 |
|
39 |
+
max_time_value = get_or_create_env_var('max_time_value', '999999')
|
40 |
print(f'The value of max_time_value is {max_time_value}')
|
41 |
|
42 |
def sum_numbers_before_seconds(string:str):
|
|
|
689 |
merged_bboxes = []
|
690 |
grouped_bboxes = defaultdict(list)
|
691 |
|
692 |
+
print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
|
693 |
+
|
694 |
# Process signature and handwriting results
|
695 |
if signature_recogniser_results or handwriting_recogniser_results:
|
696 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
|
|
955 |
json_file_path = output_folder + file_name + "_textract.json"
|
956 |
|
957 |
if not os.path.exists(json_file_path):
|
958 |
+
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
959 |
logging_file_paths.append(json_file_path)
|
960 |
request_metadata = request_metadata + "\n" + new_request_metadata
|
961 |
|
|
|
975 |
|
976 |
if not page_exists: # If the page does not exist, analyze again
|
977 |
print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
|
978 |
+
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, handwrite_signature_checkbox) # Analyse page with Textract
|
979 |
|
980 |
# Check if "pages" key exists, if not, initialize it as an empty list
|
981 |
if "pages" not in existing_data:
|