Commit ec98119
Parent(s): c71d0c1

Comprehend now uses custom spaCy recognisers on top of the defaults. Added zoom functionality to the annotator. Fixed some PDF mediabox issues and redacted image output issues.

Files changed:
- app.py +37 -14
- tools/aws_functions.py +1 -1
- tools/custom_image_analyser_engine.py +10 -0
- tools/file_conversion.py +6 -4
- tools/file_redaction.py +126 -46
- tools/helper_functions.py +12 -1
- tools/load_spacy_model_custom_recognisers.py +1 -0
- tools/redaction_review.py +16 -3
app.py
CHANGED
@@ -13,9 +13,10 @@ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_pa
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
-from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
+from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
+from tools.load_spacy_model_custom_recognisers import custom_entities
 
 
 today_rev = datetime.now().strftime("%Y%m%d")
@@ -29,6 +30,10 @@ chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT
 
 full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
 
+# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by Comprehend
+chosen_comprehend_entities.extend(custom_entities)
+full_comprehend_entity_list.extend(custom_entities)
+
 chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
 
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
@@ -117,6 +122,12 @@ with app:
 default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
 
 
+## Annotator zoom value
+annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=100, precision=0, visible=False)
+zoom_true_bool = gr.State(True)
+zoom_false_bool = gr.State(False)
+
+
 ###
 # UI DESIGN
 ###
@@ -164,6 +175,9 @@ with app:
 annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
 annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
 annotation_next_page_button = gr.Button("Next page", scale = 3)
+with gr.Row():
+    annotate_zoom_in = gr.Button("Zoom in")
+    annotate_zoom_out = gr.Button("Zoom out")
 
 annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
 
@@ -238,9 +252,9 @@ with app:
 in_allow_list_text = gr.Textbox(label="Custom allow list load status")
 
 with gr.Accordion("Add or remove entity types to redact", open = False):
-    in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
-
     in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
+
+    in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
 
 handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
 #with gr.Row():
@@ -260,18 +274,19 @@ with app:
 ###
 in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
 
-document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox]).\
+document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
     then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
     then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc")
-
+    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
+    then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
 
 # If the app has completed a batch of pages, it will run this until the end of all pages in the document
 current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number])
+    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
+    then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
 
 # If a file has been completed, the function will continue onto the next document
-latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
+latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
     then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
 # latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
 # then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
@@ -284,12 +299,20 @@ with app:
 # Page controls at top
 annotate_current_page.submit(
     modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
-    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
 annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+
+# Zoom in and out on annotator
+annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+    then(update_zoom, inputs=[annotator_zoom_number, zoom_true_bool], outputs=[annotator_zoom_number]).\
+    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+    then(update_zoom, inputs=[annotator_zoom_number, zoom_false_bool], outputs=[annotator_zoom_number]).\
+    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
 #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
 annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
@@ -297,12 +320,12 @@ with app:
 # Page controls at bottom
 annotate_current_page_bottom.submit(
     modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
-    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
 annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
 ###
 # TABULAR DATA REDACTION
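Note on the zoom wiring above: each zoom button first commits any in-progress box edits (modify_existing_page_redactions), then steps the zoom value (update_zoom), then redraws the annotator (update_annotator). A minimal, self-contained sketch of this Gradio click/then chaining pattern; all component and function names here are illustrative, not the ones from app.py:

    import gradio as gr

    def step_zoom(zoom_level: int, step_up: bool) -> int:
        # Illustrative stand-in for update_zoom: move in steps of 10
        return zoom_level + 10 if step_up else zoom_level - 10

    def redraw(zoom_level: int) -> str:
        # Illustrative stand-in for update_annotator's redraw step
        return f"annotator redrawn at {zoom_level}%"

    with gr.Blocks() as demo:
        zoom_level = gr.Number(value=100, precision=0, label="Zoom level (%)")
        status = gr.Textbox(label="Status")
        up_flag, down_flag = gr.State(True), gr.State(False)
        zoom_in_btn = gr.Button("Zoom in")
        zoom_out_btn = gr.Button("Zoom out")

        # Each .then() step runs after the previous one completes
        zoom_in_btn.click(step_zoom, inputs=[zoom_level, up_flag], outputs=[zoom_level]).\
            then(redraw, inputs=[zoom_level], outputs=[status])
        zoom_out_btn.click(step_zoom, inputs=[zoom_level, down_flag], outputs=[zoom_level]).\
            then(redraw, inputs=[zoom_level], outputs=[status])

    demo.launch()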
tools/aws_functions.py
CHANGED
@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
 # Get AWS credentials
 bucket_name=""
 
-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
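With "1" as the fallback, AWS-backed functionality is presumably enabled by default unless the RUN_AWS_FUNCTIONS environment variable overrides it. The helper's body is not shown in this diff; a plausible sketch of its behaviour, inferred only from its name and the "Get the environment variable if it exists" comment in tools/helper_functions.py:

    import os

    def get_or_create_env_var(var_name: str, default_value: str) -> str:
        # Get the environment variable if it exists; otherwise set it to the default
        value = os.environ.get(var_name)
        if value is None:
            os.environ[var_name] = default_value
            value = default_value
        return value

    print(get_or_create_env_var("RUN_AWS_FUNCTIONS", "1"))  # "1" unless overridden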
tools/custom_image_analyser_engine.py
CHANGED
@@ -13,6 +13,7 @@ from copy import deepcopy
 from tools.helper_functions import clean_unicode_text
 from tools.aws_functions import comprehend_client
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
+from tools.load_spacy_model_custom_recognisers import custom_entities
 #import string # Import string to get a list of common punctuation characters
 
 @dataclass
@@ -491,6 +492,14 @@ class CustomImageAnalyzerEngine:
 analyzer_results_by_line[i] = analyzer_result
 
 elif pii_identification_method == "AWS Comprehend":
+
+    # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. Comprehend can't pick up Titles, Streetnames, and UKPostcodes specifically
+    text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
+
+    spacy_analyzer_result = self.analyzer_engine.analyze(
+        text=line_level_ocr_result.text, **text_analyzer_kwargs)
+    analyzer_results_by_line[i].extend(spacy_analyzer_result)
+
     if len(line_level_ocr_result.text) >= 3:
         # Add line to current batch with a separator
         if current_batch:
@@ -509,6 +518,7 @@ class CustomImageAnalyzerEngine:
 Text=current_batch,
 LanguageCode=text_analyzer_kwargs["language"]
 )
+
 except Exception as e:
     print(e)
     time.sleep(3)
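The new block turns the Comprehend path into a two-pass analysis: the local spaCy/Presidio engine handles only the custom entity types (which Comprehend has no equivalents for), and the text then continues into the existing Comprehend batching. A small sketch of the filtering expression, with a hypothetical user selection; only custom_entities matches the source:

    custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]  # as in tools/load_spacy_model_custom_recognisers.py
    chosen_redact_comprehend_entities = ["NAME", "EMAIL", "TITLES", "UKPOSTCODE"]  # hypothetical selection

    # Only the custom entity types are routed to the local spaCy pass:
    spacy_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
    print(spacy_entities)  # ['TITLES', 'UKPOSTCODE']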
tools/file_conversion.py
CHANGED
@@ -11,6 +11,8 @@ import pymupdf
 from gradio import Progress
 from typing import List, Optional
 
+image_dpi = 300.0
+
 def is_pdf_or_image(filename):
     """
     Check if a file name is a PDF or an image file.
@@ -42,7 +44,7 @@ def is_pdf(filename):
 # %%
 ## Convert pdf to image if necessary
 
-def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(track_tqdm=True)):
+def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
 
     # Get the number of pages in the PDF
     page_count = pdfinfo_from_path(pdf_path)['Pages']
@@ -70,7 +72,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
 
 
 else:
-    image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=
+    image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
 
     image = image_l[0]
 
@@ -334,7 +336,7 @@ def prepare_image_or_pdf(
 
 return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
-def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
+def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
     file_path_without_ext = get_file_path_end(in_file_path)
 
     out_file_paths = out_text_file_path
@@ -344,7 +346,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
 
 pdf_text_image_paths = process_file(out_text_file_path[0])
 out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
-pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=
+pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])
 
 # out_file_paths.append(out_text_image_file_path)
 
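One detail worth noting in the signatures above: the image_dpi parameter defaults to the module-level image_dpi constant, and Python evaluates that default once at function-definition time, so the parameter can safely shadow the global. A tiny sketch of that behaviour (toy function, not the real converter):

    image_dpi = 300.0

    def convert(path: str, image_dpi: float = image_dpi) -> str:
        # The default binds the module-level 300.0 at definition time;
        # callers can still override it per call.
        return f"{path} rendered at {image_dpi} dpi"

    print(convert("doc.pdf"))         # doc.pdf rendered at 300.0 dpi
    print(convert("doc.pdf", 150.0))  # doc.pdf rendered at 150.0 dpi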
tools/file_redaction.py
CHANGED
@@ -27,8 +27,8 @@ from collections import defaultdict # For efficient grouping
 from presidio_analyzer import RecognizerResult
 
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
-from tools.file_conversion import process_file
-from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
+from tools.file_conversion import process_file, image_dpi
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
 from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 # from tools.data_anonymise import generate_decision_process_output
@@ -314,8 +314,8 @@ def choose_and_run_redactor(file_paths:List[str],
 
 # Save file
 if is_pdf(file_path) == False:
-    out_image_file_path = output_folder + file_path_without_ext + "
-    pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=
+    out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_pdf.pdf"
+    pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
 
 else:
     out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
@@ -413,35 +413,40 @@ def choose_and_run_redactor(file_paths:List[str],
 
 return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
 
-def
-'''
-Convert annotations from pikepdf to pymupdf format
-'''
-page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
-rect_coordinates = [float(coord) for coord in rect_field]
-#
-x1, y1, x2, y2 = rect_coordinates
+def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
+    '''
+    Convert annotations from pikepdf to pymupdf format, handling the mediabox larger than rect.
+    '''
+    # Use cropbox if available, otherwise use mediabox
+    reference_box = pymupdf_page.rect
+    mediabox = pymupdf_page.mediabox
+
+    reference_box_height = reference_box.height
+    reference_box_width = reference_box.width
+
+    # Convert PyMuPDF coordinates back to PDF coordinates (bottom-left origin)
+    media_height = mediabox.height
+    media_width = mediabox.width
+
+    media_reference_y_diff = media_height - reference_box_height
+    media_reference_x_diff = media_width - reference_box_width
+
+    y_diff_ratio = media_reference_y_diff / reference_box_height
+    x_diff_ratio = media_reference_x_diff / reference_box_width
+
+    # Extract the annotation rectangle field
+    rect_field = pikepdf_bbox["/Rect"]
+    rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
+
+    # Unpack coordinates
+    x1, y1, x2, y2 = rect_coordinates
+
+    new_x1 = x1 - (media_reference_x_diff * x_diff_ratio)
+    new_y1 = media_height - y2 - (media_reference_y_diff * y_diff_ratio)
+    new_x2 = x2 - (media_reference_x_diff * x_diff_ratio)
+    new_y2 = media_height - y1 - (media_reference_y_diff * y_diff_ratio)
+
+    return new_x1, new_y1, new_x2, new_y2
 
 def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
 '''
@@ -496,6 +501,64 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
 
 return x1, new_y1, x2, new_y2
 
+# def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
+#     '''
+#     Converts coordinates from pymupdf format to image coordinates.
+#     '''
+
+#     rect_height = pymupdf_page.rect.height
+#     rect_width = pymupdf_page.rect.width
+
+#     image_page_width, image_page_height = image.size
+
+#     # Calculate scaling factors between pymupdf and PIL image
+#     scale_width = image_page_width / rect_width
+#     scale_height = image_page_height / rect_height
+
+#     x1_image = x1 * scale_width
+#     y1_image = ((rect_height - y2) * scale_height)
+#     x2_image = x2 * scale_width
+#     y2_image = ((rect_height - y1) * scale_height)
+
+#     return x1_image, y1_image, x2_image, y2_image
+
+def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
+    '''
+    Converts coordinates from pymupdf format to image coordinates,
+    accounting for mediabox dimensions.
+    '''
+
+    rect_height = pymupdf_page.rect.height
+    rect_width = pymupdf_page.rect.width
+
+    # Get mediabox dimensions
+    mediabox = pymupdf_page.mediabox
+    mediabox_width = mediabox.width
+    mediabox_height = mediabox.height
+
+    image_page_width, image_page_height = image.size
+
+    # Calculate scaling factors using mediabox dimensions
+    scale_width = image_page_width / mediabox_width
+    scale_height = image_page_height / mediabox_height
+
+    print("scale_width:", scale_width)
+    print("scale_height:", scale_height)
+
+    rect_to_mediabox_x_scale = mediabox_width / rect_width
+    rect_to_mediabox_y_scale = mediabox_height / rect_height
+
+    print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
+    print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
+
+    # Adjust coordinates based on scaling factors
+    x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
+    y1_image = (y1 * scale_height) * rect_to_mediabox_y_scale
+    x2_image = (x2 * scale_width) * rect_to_mediabox_x_scale
+    y2_image = (y2 * scale_height) * rect_to_mediabox_y_scale
+
+    return x1_image, y1_image, x2_image, y2_image
+
 def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
 '''
 Converts an image with redaction coordinates from a gradio annotation component to pymupdf coordinates.
@@ -587,25 +650,25 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
 
 # Else it should be a pikepdf annotation object
 else:
-    x1, pymupdf_y1, x2, pymupdf_y2 =
+    x1, pymupdf_y1, x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
 
     rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
 
 img_annotation_box = {}
 
 if image:
-    img_annotation_box["
-    img_annotation_box["
-    img_annotation_box["
+    img_width, img_height = image.size
+
+    x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)
+
+    img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1
+    img_annotation_box["ymin"] = image_y1 #* (img_width / rect_width) # Use adjusted y1
+    img_annotation_box["xmax"] = x2 # * (img_height / rect_height) # Use adjusted x2
+    img_annotation_box["ymax"] = image_y2 #* (img_height / rect_height) # Use adjusted y2
+    img_annotation_box["color"] = (0, 0, 0)
 
 if isinstance(annot, Dictionary):
-    #print("Trying to get label out of annotation", annot["/T"])
     img_annotation_box["label"] = str(annot["/T"])
-    #print("Label is:", img_annotation_box["label"])
 else:
     img_annotation_box["label"] = "REDACTION"
 
@@ -646,6 +709,18 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
 merged_bboxes = []
 grouped_bboxes = defaultdict(list)
 
+
+# Process signature and handwriting results
+if signature_recogniser_results or handwriting_recogniser_results:
+    if "Redact all identified handwriting" in handwrite_signature_checkbox:
+        #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
+        merged_bboxes.extend(handwriting_recogniser_results)
+
+    if "Redact all identified signatures" in handwrite_signature_checkbox:
+        #print("Signature boxes exist at merge:", signature_recogniser_results)
+        merged_bboxes.extend(signature_recogniser_results)
+
+
 # Reconstruct bounding boxes for substrings of interest
 reconstructed_bboxes = []
 for bbox in bboxes:
@@ -735,16 +810,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
 
 merged_bboxes.append(merged_box)
 
-# Process signature and handwriting results
-if signature_recogniser_results or handwriting_recogniser_results:
-    if "Redact all identified handwriting" in handwrite_signature_checkbox:
-        #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
-        merged_bboxes.extend(handwriting_recogniser_results)
-
-    if "Redact all identified signatures" in handwrite_signature_checkbox:
-        #print("Signature boxes exist at merge:", signature_recogniser_results)
-        merged_bboxes.extend(signature_recogniser_results)
-
 #print("bboxes:", bboxes)
 
 return merged_bboxes
@@ -1483,6 +1548,21 @@ def redact_text_pdf(
 all_text_line_results.append((i, text_line_analyser_result))
 
 elif pii_identification_method == "AWS Comprehend":
+
+    # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
+    custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
+
+    text_line_analyser_result = nlp_analyser.analyze(
+        text=text_line.text,
+        language=language,
+        entities=custom_redact_entities,
+        score_threshold=score_threshold,
+        return_decision_process=True,
+        allow_list=allow_list
+    )
+    all_text_line_results.append((i, text_line_analyser_result))
+
+
     if len(text_line.text) >= 3:
         # Add separator between lines
         if current_batch:
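The coordinate changes above all deal with pages whose mediabox is larger than the visible rect (cropbox). A worked numeric example of the vertical correction performed by convert_pikepdf_coords_to_pymupdf, using illustrative page sizes rather than values from the repository:

    # Illustrative sizes: visible rect (cropbox) 612x792pt inside a 612x842pt mediabox
    media_height, rect_height = 842.0, 792.0
    media_reference_y_diff = media_height - rect_height          # 50.0
    y_diff_ratio = media_reference_y_diff / rect_height          # ~0.0631

    # A redaction box in pikepdf /Rect coordinates (bottom-left origin)
    y1, y2 = 700.0, 720.0

    # Flip to top-left origin against the media height, then compensate
    # for the mediabox overhang, exactly as in the function above:
    new_y1 = media_height - y2 - (media_reference_y_diff * y_diff_ratio)
    new_y2 = media_height - y1 - (media_reference_y_diff * y_diff_ratio)
    print(round(new_y1, 2), round(new_y2, 2))  # 118.84 138.84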
tools/helper_functions.py
CHANGED
@@ -3,9 +3,20 @@ import re
 import gradio as gr
 import pandas as pd
 import unicodedata
+from gradio_image_annotation import image_annotator
 
 def reset_state_vars():
-    return [], [], pd.DataFrame(), pd.DataFrame(), 0, ""
+    return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
+        label="Modify redaction boxes",
+        label_list=["Redaction"],
+        label_colors=[(0, 0, 0)],
+        show_label=False,
+        sources=None,#["upload"],
+        show_clear_button=False,
+        show_share_button=False,
+        show_remove_button=False,
+        interactive=False
+    )
 
 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists
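reset_state_vars now also returns a fresh, non-interactive image_annotator, which is how the annotator output listed in document_redact_btn.click gets wiped at the start of a run: in Gradio, returning a component instance from an event handler replaces the target component's value and configuration. A minimal sketch of that reset pattern using a plain Textbox stand-in (names here are illustrative):

    import gradio as gr

    def reset_box():
        # Returning a fresh component replaces the existing one's value/config
        return gr.Textbox(value="", interactive=False, label="Cleared")

    with gr.Blocks() as demo:
        box = gr.Textbox(value="stale state", label="Some state")
        gr.Button("Reset").click(reset_box, outputs=[box])

    demo.launch()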
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -10,6 +10,7 @@ import re
 # %%
 model_name = "en_core_web_lg" #"en_core_web_trf"
 score_threshold = 0.001
+custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
 
 # %% [markdown]
 # #### Custom recognisers
tools/redaction_review.py
CHANGED
@@ -37,9 +37,22 @@ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
 else:
     return max_pages, max_pages
 
-def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
+def update_zoom(current_zoom_level:int, decrease:bool=True):
+    if decrease == False:
+        if current_zoom_level >= 50:
+            current_zoom_level -= 10
+    else:
+        if current_zoom_level < 100:
+            current_zoom_level += 10
+
+    return current_zoom_level
+
+
+def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
     # print("\nImage annotator object:", image_annotator_object)
 
+    zoom_str = str(zoom) + '%'
+
     if not image_annotator_object:
         return image_annotator(
             label="Modify redaction boxes",
@@ -76,8 +89,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
 #label_list=["Redaction"],
 #label_colors=[(0, 0, 0)],
 show_label=False,
-height=
-width=
+height=zoom_str,
+width=zoom_str,
 box_min_size=1,
 box_selected_thickness=2,
 handle_size=4,
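A quick sanity check of update_zoom as written above. Note the flag reads inverted relative to its name: decrease=True steps the percentage up (it is wired to zoom_true_bool on the "Zoom in" button in app.py), and the value stays within 40-100%, which update_annotator then applies as the annotator's height/width percentage string:

    def update_zoom(current_zoom_level: int, decrease: bool = True):
        # Copy of the function above, for a standalone check
        if decrease == False:
            if current_zoom_level >= 50:
                current_zoom_level -= 10
        else:
            if current_zoom_level < 100:
                current_zoom_level += 10
        return current_zoom_level

    print(update_zoom(90, True))    # 100 ("zoom in" raises the %)
    print(update_zoom(100, True))   # 100 (capped at 100%)
    print(update_zoom(50, False))   # 40  ("zoom out" lowers the %)
    print(update_zoom(40, False))   # 40  (floor: no step below 40%)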