seanpedrickcase committed on
Commit
f0c28d7
·
1 Parent(s): e3365ed

Updated packages. Reinstated multithreading for page loading, now with page order preserved. Switched to the smaller spaCy model for speed. Textract calls should now be faster.

Browse files
requirements.txt CHANGED
@@ -7,12 +7,12 @@ presidio_anonymizer==2.2.355
7
  presidio-image-redactor==0.0.53
8
  pikepdf==8.15.1
9
  pandas==2.2.3
10
- spacy==3.7.5
11
- en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
12
- #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-#3.8.0.tar.gz
13
- gradio==5.4.0
14
- boto3==1.35.54
15
- pyarrow==17.0.0
16
  openpyxl==3.1.2
17
  Faker==22.2.0
18
  gradio_image_annotation==0.2.5
 
7
  presidio-image-redactor==0.0.53
8
  pikepdf==8.15.1
9
  pandas==2.2.3
10
+ spacy==3.8.3
11
+ #en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
12
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
13
+ gradio==5.9.0
14
+ boto3==1.35.83
15
+ pyarrow==18.1.0
16
  openpyxl==3.1.2
17
  Faker==22.2.0
18
  gradio_image_annotation==0.2.5
tools/aws_textract.py CHANGED
@@ -46,8 +46,8 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
46
  try:
47
  response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
48
  except Exception as e:
49
- print("Textract call failed due to:", e, "trying again in 5 seconds.")
50
- time.sleep(5)
51
  response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
52
  else:
53
  #print("Analysing document without signature detection")
@@ -185,7 +185,7 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
185
 
186
  if recogniser_result not in handwriting:
187
  handwriting.append(recogniser_result)
188
- print("Handwriting found:", handwriting[-1])
189
 
190
  # If handwriting or signature, add to bounding box
191
 
 
46
  try:
47
  response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
48
  except Exception as e:
49
+ print("Textract call failed due to:", e, "trying again in 3 seconds.")
50
+ time.sleep(3)
51
  response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
52
  else:
53
  #print("Analysing document without signature detection")
 
185
 
186
  if recogniser_result not in handwriting:
187
  handwriting.append(recogniser_result)
188
+ #print("Handwriting found:", handwriting[-1])
189
 
190
  # If handwriting or signature, add to bounding box
191
 
tools/file_conversion.py CHANGED
@@ -48,122 +48,112 @@ def is_pdf(filename):
48
 
49
 
50
 
51
- # def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> str:
52
- # """
53
- # Convert a single page of a PDF to an image and save it as a PNG.
54
- # Returns the path to the saved image.
55
- # """
56
- # try:
57
- # out_path = f"{pdf_path}_{page_num}.png"
58
-
59
- # # Ensure the directory exists
60
- # os.makedirs(os.path.dirname(out_path), exist_ok=True)
61
-
62
- # # Check if the image already exists
63
- # if os.path.exists(out_path):
64
- # # Load the existing image
65
- # print(f"Loading existing image for page {page_num + 1}")
66
- # image = Image.open(out_path)
67
- # else:
68
- # # Convert the page to an image
69
- # print(f"Converting page {page_num + 1}")
70
- # image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
71
- # dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
72
- # image = image_l[0]
73
-
74
- # # Convert to greyscale
75
- # image = image.convert("L")
76
- # image.save(out_path, format="PNG")
77
-
78
- # return out_path
79
-
80
- # except Exception as e:
81
- # print(f"Error processing page {page_num + 1}: {e}")
82
- # return None
83
-
84
- # def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
85
- # """
86
- # Convert pages of a PDF to images using multithreading.
87
- # """
88
- # # Get the number of pages in the PDF
89
- # page_count = pdfinfo_from_path(pdf_path)['Pages']
90
- # print(f"Number of pages in PDF: {page_count}")
91
 
92
- # images = []
93
-
94
- # # Use ThreadPoolExecutor to process pages in parallel
95
- # with ThreadPoolExecutor(max_workers=num_threads) as executor:
96
- # futures = []
97
- # for page_num in range(page_min, page_count):
98
- # futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
 
 
 
 
 
 
 
 
99
 
100
- # # Display progress using tqdm
101
- # for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
102
- # result = future.result()
103
- # if result:
104
- # images.append(result)
105
- # else:
106
- # print("A page failed to process.")
107
 
108
- # print("PDF has been converted to images.")
109
- # return images
 
 
 
 
 
110
 
111
- def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
112
 
113
- print("pdf_path in convert_pdf_to_images:", pdf_path)
114
 
115
- # Get the number of pages in the PDF
116
- page_count = pdfinfo_from_path(pdf_path)['Pages']
117
- print("Number of pages in PDF: ", str(page_count))
118
 
119
- images = []
120
 
121
- # Open the PDF file
122
- #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
123
- for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
124
 
125
- #print("page_num in convert_pdf_to_images:", page_num)
126
 
127
- print("Converting page: ", str(page_num + 1))
128
 
129
- # Convert one page to image
130
- out_path = pdf_path + "_" + str(page_num) + ".png"
131
 
132
- # Ensure the directory exists
133
- os.makedirs(os.path.dirname(out_path), exist_ok=True)
134
 
135
- # Check if the image already exists
136
- if os.path.exists(out_path):
137
- #print(f"Loading existing image from {out_path}.")
138
- image = Image.open(out_path) # Load the existing image
139
 
140
- else:
141
- image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
142
 
143
- image = image_l[0]
144
 
145
- # Convert to greyscale
146
- image = image.convert("L")
147
 
148
- image.save(out_path, format="PNG") # Save the new image
149
 
150
- # If no images are returned, break the loop
151
- if not image:
152
- print("Conversion of page", str(page_num), "to file failed.")
153
- break
154
 
155
- # print("Conversion of page", str(page_num), "to file succeeded.")
156
- # print("image:", image)
157
 
158
- images.append(out_path)
159
 
160
- print("PDF has been converted to images.")
161
- # print("Images:", images)
162
 
163
- return images
164
 
165
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
166
- def process_file(file_path:str):
167
  # Get the file extension
168
  file_extension = os.path.splitext(file_path)[1].lower()
169
 
@@ -178,7 +168,7 @@ def process_file(file_path:str):
178
  elif file_extension == '.pdf':
179
  print(f"{file_path} is a PDF file. Converting to image set")
180
  # Run your function for processing PDF files here
181
- img_object = convert_pdf_to_images(file_path)
182
 
183
  else:
184
  print(f"{file_path} is not an image or PDF file.")
@@ -195,7 +185,7 @@ def get_input_file_names(file_input):
195
  file_name_with_extension = ""
196
  full_file_name = ""
197
 
198
- print("file_input in input file names:", file_input)
199
  if isinstance(file_input, dict):
200
  file_input = os.path.abspath(file_input["name"])
201
 
@@ -222,8 +212,6 @@ def get_input_file_names(file_input):
222
 
223
  all_relevant_files_str = ", ".join(all_relevant_files)
224
 
225
- print("all_relevant_files_str:", all_relevant_files_str)
226
-
227
  return all_relevant_files_str, file_name_with_extension, full_file_name
228
 
229
  def prepare_image_or_pdf(
@@ -253,6 +241,7 @@ def prepare_image_or_pdf(
253
  out_message (List[str]): List to store output messages.
254
  first_loop_state (bool): Flag indicating if this is the first iteration.
255
  number_of_pages (int): integer indicating the number of pages in the document
 
256
  all_annotations_object(List of annotation objects): All annotations for current document
257
  prepare_for_review(bool): Is this preparation step preparing pdfs and json files to review current redactions?
258
  progress (Progress): Progress tracker for the operation.
@@ -352,11 +341,11 @@ def prepare_image_or_pdf(
352
  if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
353
  in_redact_method = tesseract_ocr_option
354
 
355
-
356
  # If the file name ends with redactions.json, assume it is an annoations object, overwrite the current variable
357
  if file_path.endswith(".json"):
358
 
359
  if prepare_for_review == True:
 
360
  if isinstance(file_path, str):
361
  with open(file_path, 'r') as json_file:
362
  all_annotations_object = json.load(json_file)
@@ -372,11 +361,12 @@ def prepare_image_or_pdf(
372
  ]
373
  image_file_paths_pages = [int(i) for i in image_file_paths_pages]
374
 
375
-
376
- # If PDF pages have been converted to image files, replace the current image paths in the json to this
377
  if image_file_paths:
 
378
  for i, annotation in enumerate(all_annotations_object):
379
  annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
 
380
 
381
  # Check if the annotation page number exists in the image file paths pages
382
  if annotation_page_number in image_file_paths_pages:
@@ -385,7 +375,7 @@ def prepare_image_or_pdf(
385
  correct_image_page = annotation_page_number
386
  annotation["image"] = image_file_paths[correct_image_page]
387
  else:
388
- print("Page not found.")
389
 
390
  #print("all_annotations_object:", all_annotations_object)
391
 
@@ -404,30 +394,24 @@ def prepare_image_or_pdf(
404
  json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
405
  continue
406
 
407
-
408
- print("in_redact_method:", in_redact_method)
409
-
410
- # Convert pdf/image file to correct format for redaction
411
- if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
412
- if is_pdf_or_image(file_path) == False:
413
- out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
414
- print(out_message)
415
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
416
-
417
- print("In correct preparation area.")
418
-
419
- print("file_path at process_file:", file_path)
420
- converted_file_path = process_file(file_path)
421
- image_file_path = converted_file_path
422
-
423
- elif in_redact_method == text_ocr_option:
424
- if is_pdf(file_path) == False:
425
- out_message = "Please upload a PDF file for text analysis."
426
- print(out_message)
427
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
428
-
429
  converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
430
- image_file_path = process_file(file_path)
431
 
432
  converted_file_paths.append(converted_file_path)
433
  image_file_paths.extend(image_file_path)
@@ -453,7 +437,10 @@ def prepare_image_or_pdf(
453
  out_message.append(out_time)
454
  out_message_out = '\n'.join(out_message)
455
 
 
456
  number_of_pages = len(image_file_paths)
 
 
457
 
458
 
459
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
 
48
 
49
 
50
 
51
def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> "tuple[int, str | None]":
    """
    Render a single PDF page to a greyscale PNG and return where it was saved.

    The image is written to ``f"{pdf_path}_{page_num}.png"``. If a file already
    exists at that path it is reused and no conversion is run.

    Args:
        pdf_path: Path to the source PDF.
        page_num: Zero-based page index; the converter is called with
            ``page_num + 1`` for both ``first_page`` and ``last_page``.
        image_dpi: Resolution passed to ``convert_from_path``.

    Returns:
        ``(page_num, out_path)`` on success, or ``(page_num, None)`` if any
        exception was raised while preparing the page. The page number is
        always returned so that callers running pages concurrently can restore
        page order afterwards.
    """
    out_path = f"{pdf_path}_{page_num}.png"
    try:
        # os.makedirs("") raises FileNotFoundError, so only create the folder
        # when the path actually has a directory component (e.g. not for a
        # bare "doc.pdf" in the current working directory).
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        if os.path.exists(out_path):
            # Only the path is returned, so the cached file is not opened here;
            # this avoids leaving an unused, unclosed image handle behind.
            # NOTE(review): the cached file's contents are not validated.
            print(f"Loading existing image for page {page_num + 1}")
        else:
            print(f"Converting page {page_num + 1}")
            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
                                        dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
            # Store the page as greyscale ("L" mode) PNG
            image = image_l[0].convert("L")
            image.save(out_path, format="PNG")

        return page_num, out_path

    except Exception as e:
        # Best effort: report the failure and let the caller decide what to do
        # with the missing page rather than aborting the whole document.
        print(f"Error processing page {page_num + 1}: {e}")
        return page_num, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
    """
    Convert the pages of a PDF to PNG images using a thread pool.

    Pages are processed concurrently by ``process_single_page`` and therefore
    finish in arbitrary order; the results are sorted by page number before
    being returned so the output list follows document order.

    Args:
        pdf_path: Path to the source PDF.
        prepare_for_review: Accepted for caller compatibility. It currently has
            no effect: the full page count is read whether or not the document
            is being prepared for review.
        page_min: Zero-based index of the first page to convert.
        image_dpi: Resolution forwarded to ``process_single_page``.
        num_threads: Maximum number of worker threads.

    Returns:
        List of image file paths ordered by page number. Pages that failed to
        process are reported on stdout and omitted from the list.
    """
    page_count = pdfinfo_from_path(pdf_path)['Pages']
    print(f"Number of pages in PDF: {page_count}")

    results = []
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [
            executor.submit(process_single_page, pdf_path, page_num, image_dpi)
            for page_num in range(page_min, page_count)
        ]

        # as_completed yields futures as they finish, so the progress bar
        # advances in completion order rather than page order.
        for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
            page_num, result = future.result()
            if result:
                results.append((page_num, result))
            else:
                print(f"Page {page_num + 1} failed to process.")

    # Restore document order, which concurrent completion does not guarantee
    results.sort(key=lambda x: x[0])
    images = [image_path for _, image_path in results]

    print("PDF has been converted to images.")
    return images
99
+
100
 
101
+ # def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
102
 
103
+ # print("pdf_path in convert_pdf_to_images:", pdf_path)
104
 
105
+ # # Get the number of pages in the PDF
106
+ # page_count = pdfinfo_from_path(pdf_path)['Pages']
107
+ # print("Number of pages in PDF: ", str(page_count))
108
 
109
+ # images = []
110
 
111
+ # # Open the PDF file
112
+ # #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
113
+ # for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
114
 
115
+ # #print("page_num in convert_pdf_to_images:", page_num)
116
 
117
+ # print("Converting page: ", str(page_num + 1))
118
 
119
+ # # Convert one page to image
120
+ # out_path = pdf_path + "_" + str(page_num) + ".png"
121
 
122
+ # # Ensure the directory exists
123
+ # os.makedirs(os.path.dirname(out_path), exist_ok=True)
124
 
125
+ # # Check if the image already exists
126
+ # if os.path.exists(out_path):
127
+ # #print(f"Loading existing image from {out_path}.")
128
+ # image = Image.open(out_path) # Load the existing image
129
 
130
+ # else:
131
+ # image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
132
 
133
+ # image = image_l[0]
134
 
135
+ # # Convert to greyscale
136
+ # image = image.convert("L")
137
 
138
+ # image.save(out_path, format="PNG") # Save the new image
139
 
140
+ # # If no images are returned, break the loop
141
+ # if not image:
142
+ # print("Conversion of page", str(page_num), "to file failed.")
143
+ # break
144
 
145
+ # # print("Conversion of page", str(page_num), "to file succeeded.")
146
+ # # print("image:", image)
147
 
148
+ # images.append(out_path)
149
 
150
+ # print("PDF has been converted to images.")
151
+ # # print("Images:", images)
152
 
153
+ # return images
154
 
155
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
156
+ def process_file(file_path:str, prepare_for_review:bool=False):
157
  # Get the file extension
158
  file_extension = os.path.splitext(file_path)[1].lower()
159
 
 
168
  elif file_extension == '.pdf':
169
  print(f"{file_path} is a PDF file. Converting to image set")
170
  # Run your function for processing PDF files here
171
+ img_object = convert_pdf_to_images(file_path, prepare_for_review)
172
 
173
  else:
174
  print(f"{file_path} is not an image or PDF file.")
 
185
  file_name_with_extension = ""
186
  full_file_name = ""
187
 
188
+ #print("file_input in input file names:", file_input)
189
  if isinstance(file_input, dict):
190
  file_input = os.path.abspath(file_input["name"])
191
 
 
212
 
213
  all_relevant_files_str = ", ".join(all_relevant_files)
214
 
 
 
215
  return all_relevant_files_str, file_name_with_extension, full_file_name
216
 
217
  def prepare_image_or_pdf(
 
241
  out_message (List[str]): List to store output messages.
242
  first_loop_state (bool): Flag indicating if this is the first iteration.
243
  number_of_pages (int): integer indicating the number of pages in the document
244
+ current_loop_page_number (int): Current number of loop
245
  all_annotations_object(List of annotation objects): All annotations for current document
246
  prepare_for_review(bool): Is this preparation step preparing pdfs and json files to review current redactions?
247
  progress (Progress): Progress tracker for the operation.
 
341
  if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
342
  in_redact_method = tesseract_ocr_option
343
 
 
344
  # If the file name ends with redactions.json, assume it is an annoations object, overwrite the current variable
345
  if file_path.endswith(".json"):
346
 
347
  if prepare_for_review == True:
348
+ print("Preparing file for review")
349
  if isinstance(file_path, str):
350
  with open(file_path, 'r') as json_file:
351
  all_annotations_object = json.load(json_file)
 
361
  ]
362
  image_file_paths_pages = [int(i) for i in image_file_paths_pages]
363
 
364
+ # If PDF pages have been converted to image files, replace the current image paths in the json to this.
 
365
  if image_file_paths:
366
+
367
  for i, annotation in enumerate(all_annotations_object):
368
  annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
369
+ #print("Annotation page number:", annotation_page_number)
370
 
371
  # Check if the annotation page number exists in the image file paths pages
372
  if annotation_page_number in image_file_paths_pages:
 
375
  correct_image_page = annotation_page_number
376
  annotation["image"] = image_file_paths[correct_image_page]
377
  else:
378
+ print("Page", annotation_page_number, "image file not found.")
379
 
380
  #print("all_annotations_object:", all_annotations_object)
381
 
 
394
  json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
395
  continue
396
 
397
+ # Must be a pdf or image at this point
398
+ else:
399
+
400
+ # Convert pdf/image file to correct format for redaction
401
+ if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
402
+ if is_pdf_or_image(file_path) == False:
403
+ out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
404
+ print(out_message)
405
+ return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
406
+
407
+ elif in_redact_method == text_ocr_option:
408
+ if is_pdf(file_path) == False:
409
+ out_message = "Please upload a PDF file for text analysis."
410
+ print(out_message)
411
+ return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
412
+
 
 
 
 
 
 
413
  converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
414
+ image_file_path = process_file(file_path, prepare_for_review)
415
 
416
  converted_file_paths.append(converted_file_path)
417
  image_file_paths.extend(image_file_path)
 
437
  out_message.append(out_time)
438
  out_message_out = '\n'.join(out_message)
439
 
440
+ if prepare_for_review == False:
441
  number_of_pages = len(image_file_paths)
442
+ else:
443
+ number_of_pages = len(all_annotations_object)
444
 
445
 
446
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
tools/file_redaction.py CHANGED
@@ -689,8 +689,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
689
  merged_bboxes = []
690
  grouped_bboxes = defaultdict(list)
691
 
692
- print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
693
-
694
  # Process signature and handwriting results
695
  if signature_recogniser_results or handwriting_recogniser_results:
696
  if "Redact all identified handwriting" in handwrite_signature_checkbox:
@@ -906,6 +904,30 @@ def redact_image_pdf(file_path:str,
906
  if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
907
  elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
909
  if current_loop_page == 0: page_loop_start = 0
910
  else: page_loop_start = current_loop_page
911
 
@@ -919,7 +941,7 @@ def redact_image_pdf(file_path:str,
919
  page_break_return = False
920
 
921
  reported_page_number = str(page_no + 1)
922
- print("Redacting page:", reported_page_number)
923
 
924
  # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
925
  try:
@@ -962,49 +984,72 @@ def redact_image_pdf(file_path:str,
962
  image_buffer = io.BytesIO()
963
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
964
  pdf_page_as_bytes = image_buffer.getvalue()
965
-
966
- #json_file_path = output_folder + file_name + "_page_" + reported_page_number + "_textract.json"
967
- json_file_path = output_folder + file_name + "_textract.json"
968
-
969
- if not os.path.exists(json_file_path):
970
  text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
971
  logging_file_paths.append(json_file_path)
972
  request_metadata = request_metadata + "\n" + new_request_metadata
973
 
974
- wrapped_text_blocks = {"pages":[text_blocks]}
975
 
976
- # Write the updated existing_data back to the JSON file
977
- with open(json_file_path, 'w') as json_file:
978
- json.dump(wrapped_text_blocks, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
979
  else:
980
- # Open the file and load the JSON data
981
- print("Found existing Textract json results file.")
982
- with open(json_file_path, 'r') as json_file:
983
- existing_data = json.load(json_file)
984
 
985
- # Check if the current reported_page_number exists in the loaded JSON
986
- page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
 
987
 
988
- if not page_exists: # If the page does not exist, analyze again
989
- print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
990
- text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
991
 
992
- # Check if "pages" key exists, if not, initialize it as an empty list
993
- if "pages" not in existing_data:
994
- existing_data["pages"] = []
 
 
 
 
 
 
 
 
 
 
995
 
996
- # Append the new page data
997
- existing_data["pages"].append(text_blocks)
998
 
999
- # Write the updated existing_data back to the JSON file
1000
- with open(json_file_path, 'w') as json_file:
1001
- json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1002
 
1003
- logging_file_paths.append(json_file_path)
1004
- request_metadata = request_metadata + "\n" + new_request_metadata
1005
- else:
1006
- # If the page exists, retrieve the data
1007
- text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1008
 
1009
  line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
1010
 
@@ -1124,6 +1169,11 @@ def redact_image_pdf(file_path:str,
1124
 
1125
  annotations_all_pages.append(image_annotations)
1126
 
 
 
 
 
 
1127
  current_loop_page += 1
1128
 
1129
  return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
@@ -1142,7 +1192,18 @@ def redact_image_pdf(file_path:str,
1142
  progress.close(_tqdm=progress_bar)
1143
  tqdm._instances.clear()
1144
 
 
 
 
 
 
1145
  return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
 
 
 
 
 
 
1146
 
1147
  return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1148
 
@@ -1675,8 +1736,8 @@ def redact_text_pdf(
1675
  pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)
1676
 
1677
  #print("Did redact_page_with_pymupdf function")
1678
-
1679
- print("For page number:", page_no, "there are", len(image_annotations["boxes"]), "annotations")
1680
 
1681
  # Write logs
1682
  # Create decision process table
 
689
  merged_bboxes = []
690
  grouped_bboxes = defaultdict(list)
691
 
 
 
692
  # Process signature and handwriting results
693
  if signature_recogniser_results or handwriting_recogniser_results:
694
  if "Redact all identified handwriting" in handwrite_signature_checkbox:
 
904
  if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
905
  elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
906
 
907
+ # If running Textract, check if file already exists. If it does, load in existing data
908
+ # Import results from json and convert
909
+ if analysis_type == textract_option:
910
+
911
+ json_file_path = output_folder + file_name + "_textract.json"
912
+ logging_file_paths.append(json_file_path)
913
+
914
+ if not os.path.exists(json_file_path):
915
+ no_textract_file = True
916
+ print("No existing Textract results file found.")
917
+ existing_data = {}
918
+ #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
919
+ #logging_file_paths.append(json_file_path)
920
+ #request_metadata = request_metadata + "\n" + new_request_metadata
921
+ #wrapped_text_blocks = {"pages":[text_blocks]}
922
+ else:
923
+ # Open the file and load the JSON data
924
+ no_textract_file = False
925
+ print("Found existing Textract json results file.")
926
+ with open(json_file_path, 'r') as json_file:
927
+ existing_data = json.load(json_file)
928
+
929
+ ###
930
+
931
  if current_loop_page == 0: page_loop_start = 0
932
  else: page_loop_start = current_loop_page
933
 
 
941
  page_break_return = False
942
 
943
  reported_page_number = str(page_no + 1)
944
+ #print("Redacting page:", reported_page_number)
945
 
946
  # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
947
  try:
 
984
  image_buffer = io.BytesIO()
985
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
986
  pdf_page_as_bytes = image_buffer.getvalue()
987
+
988
+ if not existing_data:
 
 
 
989
  text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
990
  logging_file_paths.append(json_file_path)
991
  request_metadata = request_metadata + "\n" + new_request_metadata
992
 
993
+ existing_data = {"pages":[text_blocks]}
994
 
 
 
 
995
  else:
996
+ # Check if the current reported_page_number exists in the loaded JSON
997
+ page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
 
 
998
 
999
+ if not page_exists: # If the page does not exist, analyze again
1000
+ print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
1001
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1002
 
1003
+ # Check if "pages" key exists, if not, initialize it as an empty list
1004
+ if "pages" not in existing_data:
1005
+ existing_data["pages"] = []
1006
 
1007
+ # Append the new page data
1008
+ existing_data["pages"].append(text_blocks)
1009
+
1010
+ request_metadata = request_metadata + "\n" + new_request_metadata
1011
+ else:
1012
+ # If the page exists, retrieve the data
1013
+ text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
1014
+
1015
+
1016
+ # if not os.path.exists(json_file_path):
1017
+ # text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1018
+ # logging_file_paths.append(json_file_path)
1019
+ # request_metadata = request_metadata + "\n" + new_request_metadata
1020
 
1021
+ # existing_data = {"pages":[text_blocks]}
 
1022
 
 
 
 
1023
 
1024
+ # else:
1025
+ # # Open the file and load the JSON data
1026
+ # print("Found existing Textract json results file.")
1027
+ # with open(json_file_path, 'r') as json_file:
1028
+ # existing_data = json.load(json_file)
1029
+
1030
+ # # Check if the current reported_page_number exists in the loaded JSON
1031
+ # page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
1032
+
1033
+ # if not page_exists: # If the page does not exist, analyze again
1034
+ # print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
1035
+ # text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1036
+
1037
+ # # Check if "pages" key exists, if not, initialize it as an empty list
1038
+ # if "pages" not in existing_data:
1039
+ # existing_data["pages"] = []
1040
+
1041
+ # # Append the new page data
1042
+ # existing_data["pages"].append(text_blocks)
1043
+
1044
+ # # Write the updated existing_data back to the JSON file
1045
+ # with open(json_file_path, 'w') as json_file:
1046
+ # json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1047
+
1048
+ # logging_file_paths.append(json_file_path)
1049
+ # request_metadata = request_metadata + "\n" + new_request_metadata
1050
+ # else:
1051
+ # # If the page exists, retrieve the data
1052
+ # text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
1053
 
1054
  line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
1055
 
 
1169
 
1170
  annotations_all_pages.append(image_annotations)
1171
 
1172
+ if analysis_type == textract_option:
1173
+ # Write the updated existing textract data back to the JSON file
1174
+ with open(json_file_path, 'w') as json_file:
1175
+ json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1176
+
1177
  current_loop_page += 1
1178
 
1179
  return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
 
1192
  progress.close(_tqdm=progress_bar)
1193
  tqdm._instances.clear()
1194
 
1195
+ if analysis_type == textract_option:
1196
+ # Write the updated existing textract data back to the JSON file
1197
+ with open(json_file_path, 'w') as json_file:
1198
+ json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1199
+
1200
  return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1201
+
1202
+ if analysis_type == textract_option:
1203
+ # Write the updated existing textract data back to the JSON file
1204
+
1205
+ with open(json_file_path, 'w') as json_file:
1206
+ json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1207
 
1208
  return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1209
 
 
1736
  pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)
1737
 
1738
  #print("Did redact_page_with_pymupdf function")
1739
+ reported_page_no = page_no + 1
1740
+ print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
1741
 
1742
  # Write logs
1743
  # Create decision process table
tools/helper_functions.py CHANGED
@@ -31,9 +31,9 @@ def get_or_create_env_var(var_name, default_value):
31
 
32
 
33
  # Names for options labels
34
- text_ocr_option = "Simple text analysis - docs with selectable text"
35
- tesseract_ocr_option = "OCR analysis for documents without selectable text - best for typed text"
36
- textract_option = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"
37
 
38
  local_pii_detector = "Local"
39
  aws_pii_detector = "AWS Comprehend"
@@ -263,6 +263,11 @@ async def get_connection_params(request: gr.Request):
263
  base_folder = "user-files/"
264
  print("Cognito ID found:", out_session_hash)
265
 
 
 
 
 
 
266
  else:
267
  out_session_hash = request.session_hash
268
  base_folder = "temp-files/"
 
31
 
32
 
33
  # Names for options labels
34
+ text_ocr_option = "Local model - selectable text"
35
+ tesseract_ocr_option = "Local OCR model - PDFs without selectable text"
36
+ textract_option = "AWS Textract service - all PDF types"
37
 
38
  local_pii_detector = "Local"
39
  aws_pii_detector = "AWS Comprehend"
 
263
  base_folder = "user-files/"
264
  print("Cognito ID found:", out_session_hash)
265
 
266
+ elif 'x-amzn-oidc-identity' in request.headers:
267
+ out_session_hash = request.headers['x-amzn-oidc-identity']
268
+ base_folder = "user-files/"
269
+ print("Cognito ID found:", out_session_hash)
270
+
271
  else:
272
  out_session_hash = request.session_hash
273
  base_folder = "temp-files/"
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -8,20 +8,20 @@ from spacy.cli.download import download
8
  import re
9
 
10
  # %%
11
- model_name = "en_core_web_lg" #"en_core_web_trf"
12
  score_threshold = 0.001
13
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
14
 
15
  #Load spacy model
16
  try:
17
- import en_core_web_lg
18
- nlp = en_core_web_lg.load()
19
  print("Successfully imported spaCy model")
20
 
21
  except:
22
  download(model_name)
23
  nlp = spacy.load(model_name)
24
- print("Successfully downloaded and imported spaCy model")
25
 
26
  # #### Custom recognisers
27
  # Allow user to create their own recogniser
 
8
  import re
9
 
10
  # %%
11
+ model_name = "en_core_web_sm" #"en_core_web_trf"
12
  score_threshold = 0.001
13
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
14
 
15
  #Load spacy model
16
  try:
17
+ import en_core_web_sm
18
+ nlp = en_core_web_sm.load()
19
  print("Successfully imported spaCy model")
20
 
21
  except:
22
  download(model_name)
23
  nlp = spacy.load(model_name)
24
+ print("Successfully downloaded and imported spaCy model", model_name)
25
 
26
  # #### Custom recognisers
27
  # Allow user to create their own recogniser