seanpedrickcase committed on
Commit ec98119 · 1 parent: c71d0c1

Comprehend now uses custom spaCy recognisers on top of the defaults. Added zoom functionality to the annotator. Fixed some PDF mediabox issues and redacted-image output issues.

app.py CHANGED
@@ -13,9 +13,10 @@ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_pa
13
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
14
  from tools.file_redaction import choose_and_run_redactor
15
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
16
- from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
17
  from tools.data_anonymise import anonymise_data_files
18
  from tools.auth import authenticate_user
 
19
 
20
 
21
  today_rev = datetime.now().strftime("%Y%m%d")
@@ -29,6 +30,10 @@ chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT
29
 
30
  full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
31
 
32
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
33
 
34
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
@@ -117,6 +122,12 @@ with app:
117
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
118
 
119
 
120
  ###
121
  # UI DESIGN
122
  ###
@@ -164,6 +175,9 @@ with app:
164
  annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
165
  annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
166
  annotation_next_page_button = gr.Button("Next page", scale = 3)
167
 
168
  annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
169
 
@@ -238,9 +252,9 @@ with app:
238
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
239
 
240
  with gr.Accordion("Add or remove entity types to redact", open = False):
241
- in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
242
-
243
  in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
244
 
245
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
246
  #with gr.Row():
@@ -260,18 +274,19 @@ with app:
260
  ###
261
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
262
 
263
- document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox]).\
264
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
265
  then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
266
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc")#.\
267
- #then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])
268
 
269
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
270
  current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
271
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number])
 
272
 
273
  # If a file has been completed, the function will continue onto the next document
274
- latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
275
  then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
276
  # latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
277
  # then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
@@ -284,12 +299,20 @@ with app:
284
  # Page controls at top
285
  annotate_current_page.submit(
286
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
287
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
288
 
289
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
290
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
291
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
292
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
293
 
294
  #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
295
  annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
@@ -297,12 +320,12 @@ with app:
297
  # Page controls at bottom
298
  annotate_current_page_bottom.submit(
299
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
300
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
301
 
302
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
303
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
304
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
305
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
306
 
307
  ###
308
  # TABULAR DATA REDACTION
 
13
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
14
  from tools.file_redaction import choose_and_run_redactor
15
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
16
+ from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
17
  from tools.data_anonymise import anonymise_data_files
18
  from tools.auth import authenticate_user
19
+ from tools.load_spacy_model_custom_recognisers import custom_entities
20
 
21
 
22
  today_rev = datetime.now().strftime("%Y%m%d")
 
30
 
31
  full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
32
 
33
+ # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
34
+ chosen_comprehend_entities.extend(custom_entities)
35
+ full_comprehend_entity_list.extend(custom_entities)
36
+
37
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
38
 
39
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
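
The extend() calls shown in the new version of this hunk append the custom spaCy labels to both Comprehend lists. A minimal standalone sketch of the same merge (entity lists trimmed for brevity; the duplicate guard is an illustrative addition, not part of the commit):

    # Sketch: merge custom spaCy entity labels into the Comprehend lists.
    custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
    chosen_comprehend_entities = ["NAME", "ADDRESS", "EMAIL"]            # trimmed
    full_comprehend_entity_list = ["NAME", "ADDRESS", "EMAIL", "PHONE"]  # trimmed

    for entity in custom_entities:
        if entity not in chosen_comprehend_entities:
            chosen_comprehend_entities.append(entity)
        if entity not in full_comprehend_entity_list:
            full_comprehend_entity_list.append(entity)

    print(chosen_comprehend_entities)
    # ['NAME', 'ADDRESS', 'EMAIL', 'TITLES', 'UKPOSTCODE', 'STREETNAME']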
 
122
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
123
 
124
 
125
+ ## Annotator zoom value
126
+ annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=100, precision=0, visible=False)
127
+ zoom_true_bool = gr.State(True)
128
+ zoom_false_bool = gr.State(False)
129
+
130
+
131
  ###
132
  # UI DESIGN
133
  ###
 
175
  annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
176
  annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
177
  annotation_next_page_button = gr.Button("Next page", scale = 3)
178
+ with gr.Row():
179
+ annotate_zoom_in = gr.Button("Zoom in")
180
+ annotate_zoom_out = gr.Button("Zoom out")
181
 
182
  annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
183
 
 
252
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
253
 
254
  with gr.Accordion("Add or remove entity types to redact", open = False):
255
  in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
256
+
257
+ in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
258
 
259
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
260
  #with gr.Row():
 
274
  ###
275
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
276
 
277
+ document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
278
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
279
  then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
280
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
281
+ then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
282
 
283
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
284
  current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
285
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
286
+ then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
287
 
288
  # If a file has been completed, the function will continue onto the next document
289
+ latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
290
  then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
291
  # latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
292
  # then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
 
299
  # Page controls at top
300
  annotate_current_page.submit(
301
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
302
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
303
 
304
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
305
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
306
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
307
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
308
+
309
+ # Zoom in and out on annotator
310
+ annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
311
+ then(update_zoom, inputs=[annotator_zoom_number, zoom_true_bool], outputs=[annotator_zoom_number]).\
312
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
313
+ annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
314
+ then(update_zoom, inputs=[annotator_zoom_number, zoom_false_bool], outputs=[annotator_zoom_number]).\
315
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
316
 
317
  #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
318
  annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
 
320
  # Page controls at bottom
321
  annotate_current_page_bottom.submit(
322
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
323
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
324
 
325
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
326
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
327
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
328
+ then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
329
 
330
  ###
331
  # TABULAR DATA REDACTION
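
The new zoom buttons follow Gradio's event-chaining pattern: each click first saves any in-progress box edits, then updates the zoom state, then rebuilds the annotator. A minimal self-contained sketch of that three-step chain, with placeholder functions standing in for the app's modify_existing_page_redactions, update_zoom and update_annotator:

    # Sketch of the click -> save state -> update zoom -> re-render chain.
    # All function bodies are placeholders, not the app's real implementations.
    import gradio as gr

    def save_boxes(state):          # stands in for modify_existing_page_redactions
        return state

    def zoom_in(level):             # stands in for update_zoom
        return min(level + 10, 100)

    def rerender(state, level):     # stands in for update_annotator
        return f"rendered at {level}%"

    with gr.Blocks() as demo:
        state = gr.State([])
        zoom_level = gr.Number(value=100, visible=False)
        out = gr.Textbox()
        btn = gr.Button("Zoom in")
        btn.click(save_boxes, inputs=state, outputs=state).then(
            zoom_in, inputs=zoom_level, outputs=zoom_level).then(
            rerender, inputs=[state, zoom_level], outputs=out)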
tools/aws_functions.py CHANGED
@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
10
  # Get AWS credentials
11
  bucket_name=""
12
 
13
- RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
14
  print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
15
 
16
  AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
 
10
  # Get AWS credentials
11
  bucket_name=""
12
 
13
+ RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
14
  print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
15
 
16
  AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
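
With the default flipped from "0" to "1", AWS calls now run unless the environment says otherwise. A sketch of the shape such a get_or_create_env_var helper usually takes (the real one lives in tools/helper_functions.py; this body is an assumption, not a copy):

    import os

    def get_or_create_env_var(var_name: str, default_value: str) -> str:
        # Return the existing value, or set the default and return it.
        value = os.environ.get(var_name)
        if value is None:
            os.environ[var_name] = default_value
            value = default_value
        return value

    RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
    # Opt out per environment with: export RUN_AWS_FUNCTIONS=0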
tools/custom_image_analyser_engine.py CHANGED
@@ -13,6 +13,7 @@ from copy import deepcopy
13
  from tools.helper_functions import clean_unicode_text
14
  from tools.aws_functions import comprehend_client
15
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
 
16
  #import string # Import string to get a list of common punctuation characters
17
 
18
  @dataclass
@@ -491,6 +492,14 @@ class CustomImageAnalyzerEngine:
491
  analyzer_results_by_line[i] = analyzer_result
492
 
493
  elif pii_identification_method == "AWS Comprehend":
494
  if len(line_level_ocr_result.text) >= 3:
495
  # Add line to current batch with a separator
496
  if current_batch:
@@ -509,6 +518,7 @@ class CustomImageAnalyzerEngine:
509
  Text=current_batch,
510
  LanguageCode=text_analyzer_kwargs["language"]
511
  )
 
512
  except Exception as e:
513
  print(e)
514
  time.sleep(3)
 
13
  from tools.helper_functions import clean_unicode_text
14
  from tools.aws_functions import comprehend_client
15
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
16
+ from tools.load_spacy_model_custom_recognisers import custom_entities
17
  #import string # Import string to get a list of common punctuation characters
18
 
19
  @dataclass
 
492
  analyzer_results_by_line[i] = analyzer_result
493
 
494
  elif pii_identification_method == "AWS Comprehend":
495
+
496
+ # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. Comprehend can't pick up Titles, Streetnames, and UKPostcodes specifically
497
+ text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
498
+
499
+ spacy_analyzer_result = self.analyzer_engine.analyze(
500
+ text=line_level_ocr_result.text, **text_analyzer_kwargs)
501
+ analyzer_results_by_line[i].extend(spacy_analyzer_result)
502
+
503
  if len(line_level_ocr_result.text) >= 3:
504
  # Add line to current batch with a separator
505
  if current_batch:
 
518
  Text=current_batch,
519
  LanguageCode=text_analyzer_kwargs["language"]
520
  )
521
+
522
  except Exception as e:
523
  print(e)
524
  time.sleep(3)
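
The added block runs the local Presidio/spaCy analyzer alongside Comprehend, but only for the custom labels Comprehend cannot detect. The filtering step in isolation (list contents assumed for illustration):

    # Sketch: run the local analyzer only for labels Comprehend cannot detect.
    custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
    chosen_redact_comprehend_entities = ["NAME", "ADDRESS", "TITLES", "UKPOSTCODE"]

    # Keep just the custom labels the user actually selected.
    local_only_entities = [e for e in chosen_redact_comprehend_entities
                           if e in custom_entities]
    # -> ["TITLES", "UKPOSTCODE"]; everything else goes to Comprehend.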
tools/file_conversion.py CHANGED
@@ -11,6 +11,8 @@ import pymupdf
11
  from gradio import Progress
12
  from typing import List, Optional
13
 
14
  def is_pdf_or_image(filename):
15
  """
16
  Check if a file name is a PDF or an image file.
@@ -42,7 +44,7 @@ def is_pdf(filename):
42
  # %%
43
  ## Convert pdf to image if necessary
44
 
45
- def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(track_tqdm=True)):
46
 
47
  # Get the number of pages in the PDF
48
  page_count = pdfinfo_from_path(pdf_path)['Pages']
@@ -70,7 +72,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
70
 
71
 
72
  else:
73
- image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
74
 
75
  image = image_l[0]
76
 
@@ -334,7 +336,7 @@ def prepare_image_or_pdf(
334
 
335
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
336
 
337
- def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
338
  file_path_without_ext = get_file_path_end(in_file_path)
339
 
340
  out_file_paths = out_text_file_path
@@ -344,7 +346,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
344
 
345
  pdf_text_image_paths = process_file(out_text_file_path[0])
346
  out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
347
- pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=300.0, save_all=True, append_images=pdf_text_image_paths[1:])
348
 
349
  # out_file_paths.append(out_text_image_file_path)
350
 
 
11
  from gradio import Progress
12
  from typing import List, Optional
13
 
14
+ image_dpi = 300.0
15
+
16
  def is_pdf_or_image(filename):
17
  """
18
  Check if a file name is a PDF or an image file.
 
44
  # %%
45
  ## Convert pdf to image if necessary
46
 
47
+ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
48
 
49
  # Get the number of pages in the PDF
50
  page_count = pdfinfo_from_path(pdf_path)['Pages']
 
72
 
73
 
74
  else:
75
+ image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
76
 
77
  image = image_l[0]
78
 
 
336
 
337
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
338
 
339
+ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
340
  file_path_without_ext = get_file_path_end(in_file_path)
341
 
342
  out_file_paths = out_text_file_path
 
346
 
347
  pdf_text_image_paths = process_file(out_text_file_path[0])
348
  out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
349
+ pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])
350
 
351
  # out_file_paths.append(out_text_image_file_path)
352
 
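Threading a single module-level image_dpi through convert_pdf_to_images and the PIL re-save keeps rasterisation and output resolution in step. A sketch assuming pdf2image and Pillow are installed (paths and page numbers are placeholders):

    # Sketch: one DPI constant shared by rasterisation and re-save.
    from pdf2image import convert_from_path

    image_dpi = 300.0

    def pdf_page_to_image(pdf_path: str, page_num: int, dpi: float = image_dpi):
        # pdf2image pages are 1-indexed; render a single page at the shared DPI.
        pages = convert_from_path(pdf_path, first_page=page_num + 1,
                                  last_page=page_num + 1, dpi=dpi,
                                  use_cropbox=True, use_pdftocairo=False)
        return pages[0]

    # images[0].save("out.pdf", "PDF", resolution=image_dpi,
    #                save_all=True, append_images=images[1:])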
tools/file_redaction.py CHANGED
@@ -27,8 +27,8 @@ from collections import defaultdict # For efficient grouping
27
  from presidio_analyzer import RecognizerResult
28
 
29
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
30
- from tools.file_conversion import process_file
31
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
32
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
33
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
34
  # from tools.data_anonymise import generate_decision_process_output
@@ -314,8 +314,8 @@ def choose_and_run_redactor(file_paths:List[str],
314
 
315
  # Save file
316
  if is_pdf(file_path) == False:
317
- out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
318
- pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pymupdf_doc[1:])
319
 
320
  else:
321
  out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
@@ -413,35 +413,40 @@ def choose_and_run_redactor(file_paths:List[str],
413
 
414
  return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
415
 
416
- def convert_pikepdf_coords_to_pymudf(pymupdf_page, annot):
417
  '''
418
- Convert annotations from pikepdf to pymupdf format
419
  '''
 
 
421
- mediabox_height = pymupdf_page.mediabox[3] - pymupdf_page.mediabox[1]
422
- mediabox_width = pymupdf_page.mediabox[2] - pymupdf_page.mediabox[0]
423
- rect_height = pymupdf_page.rect.height
424
- rect_width = pymupdf_page.rect.width
425
 
426
- # Adjust coordinates based on scaling factors
427
- page_x_adjust = (rect_width - mediabox_width) / 2 # Center adjustment
428
- page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
429
 
430
- #print("In the pikepdf conversion function")
431
- # Extract the /Rect field
432
- rect_field = annot["/Rect"]
433
-
434
- # Convert the extracted /Rect field to a list of floats (since pikepdf uses Decimal objects)
435
- rect_coordinates = [float(coord) for coord in rect_field]
436
 
437
- # Convert the Y-coordinates (flip using the page height)
438
  x1, y1, x2, y2 = rect_coordinates
439
- x1 = x1 + page_x_adjust
440
- new_y1 = (rect_height - y2) - page_y_adjust
441
- x2 = x2 + page_x_adjust
442
- new_y2 = (rect_height - y1) - page_y_adjust
443
-
444
- return x1, new_y1, x2, new_y2
 
445
 
446
  def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
447
  '''
@@ -496,6 +501,64 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
496
 
497
  return x1, new_y1, x2, new_y2
498
 
499
  def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
500
  '''
501
  Converts an image with redaction coordinates from a gradio annotation component to pymupdf coordinates.
@@ -587,25 +650,25 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
587
 
588
  # Else it should be a pikepdf annotation object
589
  else:
590
- x1, pymupdf_y1, x2, pymupdf_y2 = convert_pikepdf_coords_to_pymudf(page, annot)
591
 
592
  rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
593
 
594
  img_annotation_box = {}
595
 
596
  if image:
597
- image_x1, image_y1, image_x2, image_y2 = convert_pikepdf_to_image_coords(page, annot, image)
598
-
599
- img_annotation_box["xmin"] = image_x1
600
- img_annotation_box["ymin"] = image_y1
601
- img_annotation_box["xmax"] = image_x2
602
- img_annotation_box["ymax"] = image_y2
603
- img_annotation_box["color"] = (0,0,0)
604
 
605
  if isinstance(annot, Dictionary):
606
- #print("Trying to get label out of annotation", annot["/T"])
607
  img_annotation_box["label"] = str(annot["/T"])
608
- #print("Label is:", img_annotation_box["label"])
609
  else:
610
  img_annotation_box["label"] = "REDACTION"
611
 
@@ -646,6 +709,18 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
646
  merged_bboxes = []
647
  grouped_bboxes = defaultdict(list)
648
 
649
  # Reconstruct bounding boxes for substrings of interest
650
  reconstructed_bboxes = []
651
  for bbox in bboxes:
@@ -735,16 +810,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
735
 
736
  merged_bboxes.append(merged_box)
737
 
738
- # Process signature and handwriting results
739
- if signature_recogniser_results or handwriting_recogniser_results:
740
- if "Redact all identified handwriting" in handwrite_signature_checkbox:
741
- #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
742
- merged_bboxes.extend(handwriting_recogniser_results)
743
-
744
- if "Redact all identified signatures" in handwrite_signature_checkbox:
745
- #print("Signature boxes exist at merge:", signature_recogniser_results)
746
- merged_bboxes.extend(signature_recogniser_results)
747
-
748
  #print("bboxes:", bboxes)
749
 
750
  return merged_bboxes
@@ -1483,6 +1548,21 @@ def redact_text_pdf(
1483
  all_text_line_results.append((i, text_line_analyser_result))
1484
 
1485
  elif pii_identification_method == "AWS Comprehend":
1486
  if len(text_line.text) >= 3:
1487
  # Add separator between lines
1488
  if current_batch:
 
27
  from presidio_analyzer import RecognizerResult
28
 
29
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
30
+ from tools.file_conversion import process_file, image_dpi
31
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
32
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
33
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
34
  # from tools.data_anonymise import generate_decision_process_output
 
314
 
315
  # Save file
316
  if is_pdf(file_path) == False:
317
+ out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_pdf.pdf"
318
+ pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
319
 
320
  else:
321
  out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
 
413
 
414
  return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
415
 
416
+ def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
417
  '''
418
+ Convert annotations from pikepdf to pymupdf format, handling the mediabox larger than rect.
419
  '''
420
+ # Use cropbox if available, otherwise use mediabox
421
+ reference_box = pymupdf_page.rect
422
+ mediabox = pymupdf_page.mediabox
423
 
424
+ reference_box_height = reference_box.height
425
+ reference_box_width = reference_box.width
426
+
427
+ # Convert PyMuPDF coordinates back to PDF coordinates (bottom-left origin)
428
+ media_height = mediabox.height
429
+ media_width = mediabox.width
430
 
431
+ media_reference_y_diff = media_height - reference_box_height
432
+ media_reference_x_diff = media_width - reference_box_width
 
433
 
434
+ y_diff_ratio = media_reference_y_diff / reference_box_height
435
+ x_diff_ratio = media_reference_x_diff / reference_box_width
436
+
437
+ # Extract the annotation rectangle field
438
+ rect_field = pikepdf_bbox["/Rect"]
439
+ rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
440
 
441
+ # Unpack coordinates
442
  x1, y1, x2, y2 = rect_coordinates
443
+
444
+ new_x1 = x1 - (media_reference_x_diff * x_diff_ratio)
445
+ new_y1 = media_height - y2 - (media_reference_y_diff * y_diff_ratio)
446
+ new_x2 = x2 - (media_reference_x_diff * x_diff_ratio)
447
+ new_y2 = media_height - y1 - (media_reference_y_diff * y_diff_ratio)
448
+
449
+ return new_x1, new_y1, new_x2, new_y2
450
 
451
  def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
452
  '''
 
501
 
502
  return x1, new_y1, x2, new_y2
503
 
504
+ # def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
505
+ # '''
506
+ # Converts coordinates from pymupdf format to image coordinates.
507
+ # '''
508
+
509
+ # rect_height = pymupdf_page.rect.height
510
+ # rect_width = pymupdf_page.rect.width
511
+
512
+ # image_page_width, image_page_height = image.size
513
+
514
+ # # Calculate scaling factors between pymupdf and PIL image
515
+ # scale_width = image_page_width / rect_width
516
+ # scale_height = image_page_height / rect_height
517
+
518
+ # x1_image = x1 * scale_width
519
+ # y1_image = ((rect_height - y2) * scale_height)
520
+ # x2_image = x2 * scale_width
521
+ # y2_image = ((rect_height - y1) * scale_height)
522
+
523
+ # return x1_image, y1_image, x2_image, y2_image
524
+
525
+ def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
526
+ '''
527
+ Converts coordinates from pymupdf format to image coordinates,
528
+ accounting for mediabox dimensions.
529
+ '''
530
+
531
+ rect_height = pymupdf_page.rect.height
532
+ rect_width = pymupdf_page.rect.width
533
+
534
+ # Get mediabox dimensions
535
+ mediabox = pymupdf_page.mediabox
536
+ mediabox_width = mediabox.width
537
+ mediabox_height = mediabox.height
538
+
539
+ image_page_width, image_page_height = image.size
540
+
541
+ # Calculate scaling factors using mediabox dimensions
542
+ scale_width = image_page_width / mediabox_width
543
+ scale_height = image_page_height / mediabox_height
544
+
545
+ print("scale_width:", scale_width)
546
+ print("scale_height:", scale_height)
547
+
548
+ rect_to_mediabox_x_scale = mediabox_width / rect_width
549
+ rect_to_mediabox_y_scale = mediabox_height / rect_height
550
+
551
+ print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
552
+ print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
553
+
554
+ # Adjust coordinates based on scaling factors
555
+ x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
556
+ y1_image = (y1 * scale_height) * rect_to_mediabox_y_scale
557
+ x2_image = (x2 * scale_width) * rect_to_mediabox_x_scale
558
+ y2_image = (y2 * scale_height) * rect_to_mediabox_y_scale
559
+
560
+ return x1_image, y1_image, x2_image, y2_image
561
+
562
  def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
563
  '''
564
  Converts an image with redaction coordinates from a gradio annotation component to pymupdf coordinates.
 
650
 
651
  # Else it should be a pikepdf annotation object
652
  else:
653
+ x1, pymupdf_y1, x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
654
 
655
  rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
656
 
657
  img_annotation_box = {}
658
 
659
  if image:
660
+ img_width, img_height = image.size
661
+
662
+ x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)
663
+
664
+ img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1
665
+ img_annotation_box["ymin"] = image_y1 #* (img_width / rect_width) # Use adjusted y1
666
+ img_annotation_box["xmax"] = x2# * (img_height / rect_height) # Use adjusted x2
667
+ img_annotation_box["ymax"] = image_y2 #* (img_height / rect_height) # Use adjusted y2
668
+ img_annotation_box["color"] = (0, 0, 0)
669
 
670
  if isinstance(annot, Dictionary):
 
671
  img_annotation_box["label"] = str(annot["/T"])
 
672
  else:
673
  img_annotation_box["label"] = "REDACTION"
674
 
 
709
  merged_bboxes = []
710
  grouped_bboxes = defaultdict(list)
711
 
712
+
713
+ # Process signature and handwriting results
714
+ if signature_recogniser_results or handwriting_recogniser_results:
715
+ if "Redact all identified handwriting" in handwrite_signature_checkbox:
716
+ #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
717
+ merged_bboxes.extend(handwriting_recogniser_results)
718
+
719
+ if "Redact all identified signatures" in handwrite_signature_checkbox:
720
+ #print("Signature boxes exist at merge:", signature_recogniser_results)
721
+ merged_bboxes.extend(signature_recogniser_results)
722
+
723
+
724
  # Reconstruct bounding boxes for substrings of interest
725
  reconstructed_bboxes = []
726
  for bbox in bboxes:
 
810
 
811
  merged_bboxes.append(merged_box)
812
 
813
  #print("bboxes:", bboxes)
814
 
815
  return merged_bboxes
 
1548
  all_text_line_results.append((i, text_line_analyser_result))
1549
 
1550
  elif pii_identification_method == "AWS Comprehend":
1551
+
1552
+ # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
1553
+ custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
1554
+
1555
+ text_line_analyser_result = nlp_analyser.analyze(
1556
+ text=text_line.text,
1557
+ language=language,
1558
+ entities=custom_redact_entities,
1559
+ score_threshold=score_threshold,
1560
+ return_decision_process=True,
1561
+ allow_list=allow_list
1562
+ )
1563
+ all_text_line_results.append((i, text_line_analyser_result))
1564
+
1565
+
1566
  if len(text_line.text) >= 3:
1567
  # Add separator between lines
1568
  if current_batch:
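
The renamed convert_pikepdf_coords_to_pymupdf handles pages whose mediabox is larger than the visible rect. The core of any such conversion is the y-axis flip, since pikepdf /Rect values use a bottom-left origin while PyMuPDF uses top-left. The flip in isolation, without the diff's mediabox offset terms (a simplification, not the committed formula):

    # Sketch: flip a bottom-left-origin PDF rect to top-left-origin coordinates.
    # page_height would come from pymupdf_page.mediabox.height in the real code.
    def flip_rect(x1, y1, x2, y2, page_height):
        return x1, page_height - y2, x2, page_height - y1

    # A box 100pt tall sitting 50pt above the bottom of an A4 page (842pt high):
    print(flip_rect(72, 50, 300, 150, 842))   # -> (72, 692, 300, 792)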
tools/helper_functions.py CHANGED
@@ -3,9 +3,20 @@ import re
3
  import gradio as gr
4
  import pandas as pd
5
  import unicodedata
 
6
 
7
  def reset_state_vars():
8
- return [], [], pd.DataFrame(), pd.DataFrame(), 0, ""
9
 
10
  def get_or_create_env_var(var_name, default_value):
11
  # Get the environment variable if it exists
 
3
  import gradio as gr
4
  import pandas as pd
5
  import unicodedata
6
+ from gradio_image_annotation import image_annotator
7
 
8
  def reset_state_vars():
9
+ return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
10
+ label="Modify redaction boxes",
11
+ label_list=["Redaction"],
12
+ label_colors=[(0, 0, 0)],
13
+ show_label=False,
14
+ sources=None,#["upload"],
15
+ show_clear_button=False,
16
+ show_share_button=False,
17
+ show_remove_button=False,
18
+ interactive=False
19
+ )
20
 
21
  def get_or_create_env_var(var_name, default_value):
22
  # Get the environment variable if it exists
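
reset_state_vars now also returns a freshly constructed, non-interactive image_annotator, which is the usual Gradio way to reset a component: the returned instance's properties replace the live one's. The same pattern with a plain Textbox (a sketch, assuming Gradio 4-style component updates):

    # Sketch: reset a component by returning a new instance from a callback.
    import gradio as gr

    def reset():
        return gr.Textbox(value="", interactive=False)

    with gr.Blocks() as demo:
        box = gr.Textbox(value="something stale")
        gr.Button("Reset").click(reset, outputs=box)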
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -10,6 +10,7 @@ import re
10
  # %%
11
  model_name = "en_core_web_lg" #"en_core_web_trf"
12
  score_threshold = 0.001
 
13
 
14
  # %% [markdown]
15
  # #### Custom recognisers
 
10
  # %%
11
  model_name = "en_core_web_lg" #"en_core_web_trf"
12
  score_threshold = 0.001
13
+ custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
14
 
15
  # %% [markdown]
16
  # #### Custom recognisers
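
The exported custom_entities labels correspond to custom recognisers defined later in this module. For illustration, the general shape of a Presidio pattern recogniser for one of them (the regex is a simplified stand-in, not the module's actual pattern):

    # Sketch: a pattern-based Presidio recogniser for one custom entity.
    from presidio_analyzer import Pattern, PatternRecognizer

    ukpostcode_pattern = Pattern(
        name="ukpostcode_simple",
        regex=r"\b[A-Z]{1,2}[0-9][A-Z0-9]? ?[0-9][A-Z]{2}\b",  # simplified
        score=0.5,
    )
    ukpostcode_recogniser = PatternRecognizer(
        supported_entity="UKPOSTCODE", patterns=[ukpostcode_pattern]
    )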
tools/redaction_review.py CHANGED
@@ -37,9 +37,22 @@ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
37
  else:
38
  return max_pages, max_pages
39
 
40
- def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
41
  # print("\nImage annotator object:", image_annotator_object)
42
 
43
  if not image_annotator_object:
44
  return image_annotator(
45
  label="Modify redaction boxes",
@@ -76,8 +89,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
76
  #label_list=["Redaction"],
77
  #label_colors=[(0, 0, 0)],
78
  show_label=False,
79
- height='100%',
80
- width='100%',
81
  box_min_size=1,
82
  box_selected_thickness=2,
83
  handle_size=4,
 
37
  else:
38
  return max_pages, max_pages
39
 
40
+ def update_zoom(current_zoom_level:int, decrease:bool=True):
41
+ if decrease == False:
42
+ if current_zoom_level >= 50:
43
+ current_zoom_level -= 10
44
+ else:
45
+ if current_zoom_level < 100:
46
+ current_zoom_level += 10
47
+
48
+ return current_zoom_level
49
+
50
+
51
+ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
52
  # print("\nImage annotator object:", image_annotator_object)
53
 
54
+ zoom_str = str(zoom) + '%'
55
+
56
  if not image_annotator_object:
57
  return image_annotator(
58
  label="Modify redaction boxes",
 
89
  #label_list=["Redaction"],
90
  #label_colors=[(0, 0, 0)],
91
  show_label=False,
92
+ height=zoom_str,
93
+ width=zoom_str,
94
  box_min_size=1,
95
  box_selected_thickness=2,
96
  handle_size=4,
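
update_zoom steps the stored percentage by 10, and update_annotator then sizes the component with height/width as a CSS-style percentage string. A small usage sketch of the clamped-step idea (parameter semantics simplified relative to the committed function):

    # Sketch: step a zoom percentage by 10 within [50, 100] and build the
    # size string passed to the annotator component.
    def step_zoom(level: int, zoom_in: bool) -> int:
        return min(level + 10, 100) if zoom_in else max(level - 10, 50)

    level = 100
    level = step_zoom(level, zoom_in=False)   # 90
    zoom_str = f"{level}%"                    # passed as height=/width=
    print(level, zoom_str)                    # 90 90%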