seanpedrickcase committed on
Commit
9504619
·
1 Parent(s): b8e245f

Multithreaded file preparation. Can call Textract without signature detection

Browse files
app.py CHANGED
@@ -263,7 +263,7 @@ with app:
263
 
264
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
265
 
266
- handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
267
  #with gr.Row():
268
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
269
 
 
263
 
264
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
265
 
266
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting"])
267
  #with gr.Row():
268
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
269
 
tools/aws_textract.py CHANGED
@@ -1,10 +1,11 @@
1
  import boto3
2
- from PIL import Image
 
3
  import io
4
- import json
5
  import pikepdf
6
  # Example: converting this single page to an image
7
- from pdf2image import convert_from_bytes
8
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
9
 
10
  def extract_textract_metadata(response):
@@ -23,7 +24,7 @@ def extract_textract_metadata(response):
23
  #'NumberOfPages': number_of_pages
24
  })
25
 
26
- def analyse_page_with_textract(pdf_page_bytes, page_no, client=""):
27
  '''
28
  Analyse page with AWS Textract
29
  '''
@@ -36,7 +37,14 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client=""):
36
 
37
  print("Analysing page with AWS Textract")
38
 
39
- response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
 
 
 
 
 
 
 
40
 
41
  # Wrap the response with the page number in the desired format
42
  wrapped_response = {
 
1
  import boto3
2
+ #from PIL import Image
3
+ from typing import List
4
  import io
5
+ #import json
6
  import pikepdf
7
  # Example: converting this single page to an image
8
+ #from pdf2image import convert_from_bytes
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
 
11
  def extract_textract_metadata(response):
 
24
  #'NumberOfPages': number_of_pages
25
  })
26
 
27
+ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"]):
28
  '''
29
  Analyse page with AWS Textract
30
  '''
 
37
 
38
  print("Analysing page with AWS Textract")
39
 
40
+ # Redact signatures if specified
41
+ if "Redact all identified signatures" in handwrite_signature_checkbox:
42
+ print("Analysing document with signature detection")
43
+ response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
44
+ else:
45
+ print("Analysing document without signature detection")
46
+ # Call detect_document_text to extract plain text
47
+ response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
48
 
49
  # Wrap the response with the page number in the desired format
50
  wrapped_response = {
tools/file_conversion.py CHANGED
@@ -2,7 +2,6 @@ from pdf2image import convert_from_path, pdfinfo_from_path
2
  from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
3
  from PIL import Image, ImageFile
4
  ImageFile.LOAD_TRUNCATED_IMAGES = True
5
-
6
  import os
7
  import re
8
  import gradio as gr
@@ -12,6 +11,7 @@ import pymupdf
12
  from tqdm import tqdm
13
  from gradio import Progress
14
  from typing import List, Optional
 
15
 
16
  image_dpi = 300.0
17
 
@@ -46,61 +46,129 @@ def is_pdf(filename):
46
  # %%
47
  ## Convert pdf to image if necessary
48
 
49
- def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
50
 
51
- print("pdf_path in convert_pdf_to_images:", pdf_path)
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  # Get the number of pages in the PDF
54
  page_count = pdfinfo_from_path(pdf_path)['Pages']
55
- print("Number of pages in PDF: ", str(page_count))
56
 
57
  images = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- # Open the PDF file
60
- #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
61
- for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
 
 
 
62
 
63
- print("page_num in convert_pdf_to_images:", page_num)
64
-
65
- print("Converting page: ", str(page_num + 1))
66
 
67
- # Convert one page to image
68
- out_path = pdf_path + "_" + str(page_num) + ".png"
69
-
70
- # Ensure the directory exists
71
- os.makedirs(os.path.dirname(out_path), exist_ok=True)
72
 
73
- # Check if the image already exists
74
- if os.path.exists(out_path):
75
- #print(f"Loading existing image from {out_path}.")
76
- image = Image.open(out_path) # Load the existing image
77
 
 
 
 
78
 
 
79
 
80
- else:
81
- image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
 
82
 
83
- image = image_l[0]
 
 
84
 
85
- # Convert to greyscale
86
- image = image.convert("L")
 
 
 
87
 
88
- image.save(out_path, format="PNG") # Save the new image
 
 
 
89
 
90
- # If no images are returned, break the loop
91
- if not image:
92
- print("Conversion of page", str(page_num), "to file failed.")
93
- break
94
 
95
- # print("Conversion of page", str(page_num), "to file succeeded.")
96
- # print("image:", image)
97
 
98
- images.append(out_path)
 
99
 
100
- print("PDF has been converted to images.")
101
- # print("Images:", images)
102
 
103
- return images
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
106
  def process_file(file_path:str):
 
2
  from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
3
  from PIL import Image, ImageFile
4
  ImageFile.LOAD_TRUNCATED_IMAGES = True
 
5
  import os
6
  import re
7
  import gradio as gr
 
11
  from tqdm import tqdm
12
  from gradio import Progress
13
  from typing import List, Optional
14
+ from concurrent.futures import ThreadPoolExecutor, as_completed
15
 
16
  image_dpi = 300.0
17
 
 
46
  # %%
47
  ## Convert pdf to image if necessary
48
 
 
49
 
 
50
 
51
def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> Optional[str]:
    """
    Convert a single page of a PDF to a greyscale PNG and return its path.

    The image is written to "<pdf_path>_<page_num>.png". If that file
    already exists it is reused instead of being rendered again.

    Args:
        pdf_path: Path to the source PDF.
        page_num: Zero-based page index (pdf2image is called with page_num + 1).
        image_dpi: Resolution passed to pdf2image's convert_from_path.

    Returns:
        The path of the PNG file, or None if anything went wrong. Errors are
        printed rather than raised so that one bad page does not abort a
        whole batch running in a thread pool.
    """
    out_path = f"{pdf_path}_{page_num}.png"

    try:
        # os.path.dirname() is "" for a bare filename such as "example.pdf",
        # and os.makedirs("") raises, so only create a directory when the
        # path actually has one.
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        if os.path.exists(out_path):
            print(f"Loading existing image for page {page_num + 1}")
            # Open the cached file only to confirm it is a readable image,
            # and release the file handle straight away.
            with Image.open(out_path):
                pass
        else:
            print(f"Converting page {page_num + 1}")
            rendered_pages = convert_from_path(
                pdf_path,
                first_page=page_num + 1,
                last_page=page_num + 1,
                dpi=image_dpi,
                use_cropbox=True,
                use_pdftocairo=False,
            )

            # Convert to greyscale before saving
            greyscale_page = rendered_pages[0].convert("L")
            greyscale_page.save(out_path, format="PNG")

        return out_path

    except Exception as e:
        # Deliberately broad: report the failure and let the caller decide
        # what to do with a missing page.
        print(f"Error processing page {page_num + 1}: {e}")
        return None
83
+
84
def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
    """
    Convert pages of a PDF to images using multithreading.

    Each page from page_min up to the last page is handed to
    process_single_page on a thread pool.

    Args:
        pdf_path: Path to the source PDF.
        page_min: Zero-based index of the first page to convert.
        image_dpi: Resolution passed through to process_single_page.
        num_threads: Maximum number of worker threads.

    Returns:
        A list of image file paths in ascending page order. Pages that
        failed to convert are left out of the list.
    """
    # Get the number of pages in the PDF
    page_count = pdfinfo_from_path(pdf_path)['Pages']
    print(f"Number of pages in PDF: {page_count}")

    page_numbers = range(page_min, page_count)
    path_by_page = {}

    # Use ThreadPoolExecutor to process pages in parallel
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Remember which page each future belongs to: as_completed() yields
        # futures in completion order, not submission order.
        page_by_future = {
            executor.submit(process_single_page, pdf_path, page_num, image_dpi): page_num
            for page_num in page_numbers
        }

        # Display progress using tqdm as pages finish
        for future in tqdm(as_completed(page_by_future), total=len(page_by_future), unit="pages", desc="Converting pages"):
            result = future.result()
            if result:
                path_by_page[page_by_future[future]] = result
            else:
                print("A page failed to process.")

    # Rebuild the list in page order so callers can rely on document order
    # regardless of which thread finished first.
    images = [path_by_page[page_num] for page_num in page_numbers if page_num in path_by_page]

    print("PDF has been converted to images.")
    return images
110
 
111
+ # Example usage
112
+ if __name__ == "__main__":
113
+ pdf_path = "example.pdf"
114
+ image_dpi = 200
115
+ output_images = convert_pdf_to_images(pdf_path, image_dpi=image_dpi, num_threads=8)
116
+ print("Images saved:", output_images)
117
 
 
 
 
118
 
119
+ # def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
 
 
 
 
120
 
121
+ # print("pdf_path in convert_pdf_to_images:", pdf_path)
 
 
 
122
 
123
+ # # Get the number of pages in the PDF
124
+ # page_count = pdfinfo_from_path(pdf_path)['Pages']
125
+ # print("Number of pages in PDF: ", str(page_count))
126
 
127
+ # images = []
128
 
129
+ # # Open the PDF file
130
+ # #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
131
+ # for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
132
 
133
+ # #print("page_num in convert_pdf_to_images:", page_num)
134
+
135
+ # print("Converting page: ", str(page_num + 1))
136
 
137
+ # # Convert one page to image
138
+ # out_path = pdf_path + "_" + str(page_num) + ".png"
139
+
140
+ # # Ensure the directory exists
141
+ # os.makedirs(os.path.dirname(out_path), exist_ok=True)
142
 
143
+ # # Check if the image already exists
144
+ # if os.path.exists(out_path):
145
+ # #print(f"Loading existing image from {out_path}.")
146
+ # image = Image.open(out_path) # Load the existing image
147
 
148
+ # else:
149
+ # image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
 
 
150
 
151
+ # image = image_l[0]
 
152
 
153
+ # # Convert to greyscale
154
+ # image = image.convert("L")
155
 
156
+ # image.save(out_path, format="PNG") # Save the new image
 
157
 
158
+ # # If no images are returned, break the loop
159
+ # if not image:
160
+ # print("Conversion of page", str(page_num), "to file failed.")
161
+ # break
162
+
163
+ # # print("Conversion of page", str(page_num), "to file succeeded.")
164
+ # # print("image:", image)
165
+
166
+ # images.append(out_path)
167
+
168
+ # print("PDF has been converted to images.")
169
+ # # print("Images:", images)
170
+
171
+ # return images
172
 
173
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
174
  def process_file(file_path:str):
tools/file_redaction.py CHANGED
@@ -36,7 +36,7 @@ from tools.presidio_analyzer_custom import recognizer_result_from_dict
36
  page_break_value = get_or_create_env_var('page_break_value', '500')
37
  print(f'The value of page_break_value is {page_break_value}')
38
 
39
- max_time_value = get_or_create_env_var('max_time_value', '105')
40
  print(f'The value of max_time_value is {max_time_value}')
41
 
42
  def sum_numbers_before_seconds(string:str):
@@ -689,7 +689,8 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
689
  merged_bboxes = []
690
  grouped_bboxes = defaultdict(list)
691
 
692
-
 
693
  # Process signature and handwriting results
694
  if signature_recogniser_results or handwriting_recogniser_results:
695
  if "Redact all identified handwriting" in handwrite_signature_checkbox:
@@ -954,7 +955,7 @@ def redact_image_pdf(file_path:str,
954
  json_file_path = output_folder + file_name + "_textract.json"
955
 
956
  if not os.path.exists(json_file_path):
957
- text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client) # Analyse page with Textract
958
  logging_file_paths.append(json_file_path)
959
  request_metadata = request_metadata + "\n" + new_request_metadata
960
 
@@ -974,7 +975,7 @@ def redact_image_pdf(file_path:str,
974
 
975
  if not page_exists: # If the page does not exist, analyze again
976
  print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
977
- text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number) # Analyse page with Textract
978
 
979
  # Check if "pages" key exists, if not, initialize it as an empty list
980
  if "pages" not in existing_data:
 
36
  page_break_value = get_or_create_env_var('page_break_value', '500')
37
  print(f'The value of page_break_value is {page_break_value}')
38
 
39
+ max_time_value = get_or_create_env_var('max_time_value', '999999')
40
  print(f'The value of max_time_value is {max_time_value}')
41
 
42
  def sum_numbers_before_seconds(string:str):
 
689
  merged_bboxes = []
690
  grouped_bboxes = defaultdict(list)
691
 
692
+ print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
693
+
694
  # Process signature and handwriting results
695
  if signature_recogniser_results or handwriting_recogniser_results:
696
  if "Redact all identified handwriting" in handwrite_signature_checkbox:
 
955
  json_file_path = output_folder + file_name + "_textract.json"
956
 
957
  if not os.path.exists(json_file_path):
958
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
959
  logging_file_paths.append(json_file_path)
960
  request_metadata = request_metadata + "\n" + new_request_metadata
961
 
 
975
 
976
  if not page_exists: # If the page does not exist, analyze again
977
  print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
978
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, handwrite_signature_checkbox) # Analyse page with Textract
979
 
980
  # Check if "pages" key exists, if not, initialize it as an empty list
981
  if "pages" not in existing_data: