seanpedrickcase committed on
Commit
c3a8cd7
·
1 Parent(s): 6ac4be4

Output redaction boxes can now be drawn in grey by setting the CUSTOM_BOX_COLOUR environment variable to "grey". Review files are now saved every time the page is changed.

Browse files
Files changed (4) hide show
  1. app.py +21 -8
  2. tools/auth.py +3 -3
  3. tools/file_conversion.py +7 -3
  4. tools/redaction_review.py +52 -57
app.py CHANGED
@@ -13,7 +13,7 @@ from gradio_image_annotation.image_annotator import AnnotatedImageData
13
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
- from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
17
  from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
18
  from tools.data_anonymise import anonymise_data_files
19
  from tools.auth import authenticate_user
@@ -41,6 +41,8 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET
41
 
42
  language = 'en'
43
 
 
 
44
  host_name = socket.gethostname()
45
  feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
46
  access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
@@ -84,6 +86,8 @@ with app:
84
  log_files_output_list_state = gr.State([])
85
 
86
  review_file_state = gr.State(pd.DataFrame())
 
 
87
 
88
  # Logging state
89
  log_file_name = 'log.csv'
@@ -197,7 +201,7 @@ with app:
197
  # Object annotation
198
  with gr.Tab("Review redactions", id="tab_object_annotation"):
199
 
200
- with gr.Accordion(label = "Review redaction file", open=False):
201
  output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
202
  upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
203
 
@@ -344,14 +348,18 @@ with app:
344
  # Page controls at top
345
  annotate_current_page.submit(
346
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
347
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
348
 
349
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
350
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
351
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 
352
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
353
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
354
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
355
 
356
  # Zoom in and out on annotator
357
  annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
@@ -368,18 +376,23 @@ with app:
368
  #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
369
  annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
370
 
 
 
371
  # Page controls at bottom
372
  annotate_current_page_bottom.submit(
373
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
374
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
375
 
376
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
377
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
378
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
379
 
380
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
381
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
382
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
383
 
384
  # Review side bar controls
385
  recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
 
13
  from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
+ from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
17
  from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
18
  from tools.data_anonymise import anonymise_data_files
19
  from tools.auth import authenticate_user
 
41
 
42
  language = 'en'
43
 
44
+
45
+
46
  host_name = socket.gethostname()
47
  feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
48
  access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
 
86
  log_files_output_list_state = gr.State([])
87
 
88
  review_file_state = gr.State(pd.DataFrame())
89
+
90
+ do_not_save_pdf_state = gr.State(False)
91
 
92
  # Logging state
93
  log_file_name = 'log.csv'
 
201
  # Object annotation
202
  with gr.Tab("Review redactions", id="tab_object_annotation"):
203
 
204
+ with gr.Accordion(label = "Review redaction file", open=True):
205
  output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
206
  upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
207
 
 
348
  # Page controls at top
349
  annotate_current_page.submit(
350
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
351
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
352
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
353
 
354
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
355
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
356
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
357
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
358
+
359
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
360
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
361
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
362
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
363
 
364
  # Zoom in and out on annotator
365
  annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 
376
  #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
377
  annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
378
 
379
+ do_not_save_pdf_state
380
+
381
  # Page controls at bottom
382
  annotate_current_page_bottom.submit(
383
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
384
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
385
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
386
 
387
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
388
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
389
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
390
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
391
 
392
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
393
  then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
394
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
395
+ then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
396
 
397
  # Review side bar controls
398
  recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
tools/auth.py CHANGED
@@ -7,13 +7,13 @@ import base64
7
  from tools.helper_functions import get_or_create_env_var
8
 
9
  client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
10
- print(f'The value of AWS_CLIENT_ID is {client_id}')
11
 
12
  client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
13
- print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
14
 
15
  user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
16
- print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
17
 
18
  def calculate_secret_hash(client_id, client_secret, username):
19
  message = username + client_id
 
7
  from tools.helper_functions import get_or_create_env_var
8
 
9
  client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
10
+ #print(f'The value of AWS_CLIENT_ID is {client_id}')
11
 
12
  client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
13
+ #print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
14
 
15
  user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
16
+ #print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
17
 
18
  def calculate_secret_hash(client_id, client_secret, username):
19
  message = username + client_id
tools/file_conversion.py CHANGED
@@ -1,5 +1,5 @@
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
- from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, read_file
3
  from PIL import Image, ImageFile
4
  ImageFile.LOAD_TRUNCATED_IMAGES = True
5
  import os
@@ -48,7 +48,8 @@ def is_pdf(filename):
48
  # %%
49
  ## Convert pdf to image if necessary
50
 
51
-
 
52
 
53
  def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
54
  try:
@@ -261,7 +262,10 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
261
  else:
262
  out_colour = img_annotation_box["color"]
263
  else:
264
- out_colour = (0,0,0)
 
 
 
265
 
266
  shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
267
  #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
 
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
+ from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
3
  from PIL import Image, ImageFile
4
  ImageFile.LOAD_TRUNCATED_IMAGES = True
5
  import os
 
48
  # %%
49
  ## Convert pdf to image if necessary
50
 
51
+ CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
52
+ print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
53
 
54
  def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
55
  try:
 
262
  else:
263
  out_colour = img_annotation_box["color"]
264
  else:
265
+ if CUSTOM_BOX_COLOUR == "grey":
266
+ out_colour = (0.5, 0.5, 0.5)
267
+ else:
268
+ out_colour = (0,0,0)
269
 
270
  shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
271
  #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
tools/redaction_review.py CHANGED
@@ -55,12 +55,6 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
55
  '''
56
  recogniser_entities = []
57
  recogniser_dataframe = pd.DataFrame()
58
- #recogniser_entities_drop = gr.Dropdown(value="ALL", allow_custom_value=True)
59
- #recogniser_dataframe_gr = gr.Dataframe(pd.DataFrame(data={"page":[""], "label":[""]}))
60
-
61
- #print("recogniser_dataframe_gr", recogniser_dataframe_gr)
62
- #print("recogniser_dataframe_gr shape", recogniser_dataframe_gr.shape)
63
- #print("recogniser_dataframe_gr.iloc[0,0]:", recogniser_dataframe_gr.iloc[0,0])
64
 
65
  if recogniser_dataframe_gr.iloc[0,0] == "":
66
  try:
@@ -228,7 +222,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
228
 
229
  return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
230
 
231
- def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, progress=gr.Progress(track_tqdm=True)):
232
  '''
233
  Apply modified redactions to a pymupdf and export review files
234
  '''
@@ -251,75 +245,76 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
251
  file_paths = [file_paths]
252
 
253
  for file_path in file_paths:
254
- print("file_path:", file_path)
255
  file_base = get_file_path_end(file_path)
256
 
257
  file_extension = os.path.splitext(file_path)[1].lower()
258
 
259
- # If working with image docs
260
- if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
261
- image = Image.open(file_paths[-1])
 
262
 
263
- #image = pdf_doc
264
 
265
- draw = ImageDraw.Draw(image)
266
 
267
- for img_annotation_box in image_annotated['boxes']:
268
- coords = [img_annotation_box["xmin"],
269
- img_annotation_box["ymin"],
270
- img_annotation_box["xmax"],
271
- img_annotation_box["ymax"]]
272
 
273
- fill = img_annotation_box["color"]
274
 
275
- draw.rectangle(coords, fill=fill)
276
 
277
- image.save(output_folder + file_base + "_redacted.png")
278
 
279
- doc = [image]
280
 
281
- elif file_extension in '.csv':
282
- print("This is a csv")
283
- pdf_doc = []
284
 
285
- # If working with pdfs
286
- elif is_pdf(file_path) == True:
287
- pdf_doc = pymupdf.open(file_path)
288
 
289
- number_of_pages = pdf_doc.page_count
290
 
291
- print("Saving pages to file.")
292
 
293
- for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
294
 
295
- #print("Saving page", str(i))
296
-
297
- image_loc = all_image_annotations[i]['image']
298
- #print("Image location:", image_loc)
299
 
300
- # Load in image object
301
- if isinstance(image_loc, np.ndarray):
302
- image = Image.fromarray(image_loc.astype('uint8'))
303
- #all_image_annotations[i]['image'] = image_loc.tolist()
304
- elif isinstance(image_loc, Image.Image):
305
- image = image_loc
306
- #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
307
- #image_loc.save(image_out_folder)
308
- #all_image_annotations[i]['image'] = image_out_folder
309
- elif isinstance(image_loc, str):
310
- image = Image.open(image_loc)
311
 
312
- pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
313
- pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
314
 
315
- else:
316
- print("File type not recognised.")
317
-
318
- #try:
319
- if pdf_doc:
320
- out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
321
- pdf_doc.save(out_pdf_file_path)
322
- output_files.append(out_pdf_file_path)
323
 
324
  try:
325
  print("Saving annotations to JSON")
@@ -331,7 +326,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
331
 
332
  print("Saving annotations to CSV review file")
333
 
334
- print("review_file_state:", review_file_state)
335
 
336
  # Convert json to csv and also save this
337
  review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
 
55
  '''
56
  recogniser_entities = []
57
  recogniser_dataframe = pd.DataFrame()
 
 
 
 
 
 
58
 
59
  if recogniser_dataframe_gr.iloc[0,0] == "":
60
  try:
 
222
 
223
  return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
224
 
225
+ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
226
  '''
227
  Apply modified redactions to a pymupdf and export review files
228
  '''
 
245
  file_paths = [file_paths]
246
 
247
  for file_path in file_paths:
248
+ #print("file_path:", file_path)
249
  file_base = get_file_path_end(file_path)
250
 
251
  file_extension = os.path.splitext(file_path)[1].lower()
252
 
253
+ if save_pdf == True:
254
+ # If working with image docs
255
+ if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
256
+ image = Image.open(file_paths[-1])
257
 
258
+ #image = pdf_doc
259
 
260
+ draw = ImageDraw.Draw(image)
261
 
262
+ for img_annotation_box in image_annotated['boxes']:
263
+ coords = [img_annotation_box["xmin"],
264
+ img_annotation_box["ymin"],
265
+ img_annotation_box["xmax"],
266
+ img_annotation_box["ymax"]]
267
 
268
+ fill = img_annotation_box["color"]
269
 
270
+ draw.rectangle(coords, fill=fill)
271
 
272
+ image.save(output_folder + file_base + "_redacted.png")
273
 
274
+ doc = [image]
275
 
276
+ elif file_extension in '.csv':
277
+ print("This is a csv")
278
+ pdf_doc = []
279
 
280
+ # If working with pdfs
281
+ elif is_pdf(file_path) == True:
282
+ pdf_doc = pymupdf.open(file_path)
283
 
284
+ number_of_pages = pdf_doc.page_count
285
 
286
+ print("Saving pages to file.")
287
 
288
+ for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
289
 
290
+ #print("Saving page", str(i))
291
+
292
+ image_loc = all_image_annotations[i]['image']
293
+ #print("Image location:", image_loc)
294
 
295
+ # Load in image object
296
+ if isinstance(image_loc, np.ndarray):
297
+ image = Image.fromarray(image_loc.astype('uint8'))
298
+ #all_image_annotations[i]['image'] = image_loc.tolist()
299
+ elif isinstance(image_loc, Image.Image):
300
+ image = image_loc
301
+ #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
302
+ #image_loc.save(image_out_folder)
303
+ #all_image_annotations[i]['image'] = image_out_folder
304
+ elif isinstance(image_loc, str):
305
+ image = Image.open(image_loc)
306
 
307
+ pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
308
+ pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
309
 
310
+ else:
311
+ print("File type not recognised.")
312
+
313
+ #try:
314
+ if pdf_doc:
315
+ out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
316
+ pdf_doc.save(out_pdf_file_path)
317
+ output_files.append(out_pdf_file_path)
318
 
319
  try:
320
  print("Saving annotations to JSON")
 
326
 
327
  print("Saving annotations to CSV review file")
328
 
329
+ #print("review_file_state:", review_file_state)
330
 
331
  # Convert json to csv and also save this
332
  review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)