Commit
·
c3a8cd7
1
Parent(s):
6ac4be4
Output redaction boxes can now be drawn in grey, controlled by the CUSTOM_BOX_COLOUR environment variable. Review files are now saved every time the page is changed.
Browse files- app.py +21 -8
- tools/auth.py +3 -3
- tools/file_conversion.py +7 -3
- tools/redaction_review.py +52 -57
app.py
CHANGED
@@ -13,7 +13,7 @@ from gradio_image_annotation.image_annotator import AnnotatedImageData
|
|
13 |
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
15 |
from tools.file_redaction import choose_and_run_redactor
|
16 |
-
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
17 |
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
|
18 |
from tools.data_anonymise import anonymise_data_files
|
19 |
from tools.auth import authenticate_user
|
@@ -41,6 +41,8 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET
|
|
41 |
|
42 |
language = 'en'
|
43 |
|
|
|
|
|
44 |
host_name = socket.gethostname()
|
45 |
feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
|
46 |
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
@@ -84,6 +86,8 @@ with app:
|
|
84 |
log_files_output_list_state = gr.State([])
|
85 |
|
86 |
review_file_state = gr.State(pd.DataFrame())
|
|
|
|
|
87 |
|
88 |
# Logging state
|
89 |
log_file_name = 'log.csv'
|
@@ -197,7 +201,7 @@ with app:
|
|
197 |
# Object annotation
|
198 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
199 |
|
200 |
-
with gr.Accordion(label = "Review redaction file", open=
|
201 |
output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
|
202 |
upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
|
203 |
|
@@ -344,14 +348,18 @@ with app:
|
|
344 |
# Page controls at top
|
345 |
annotate_current_page.submit(
|
346 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
347 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
|
|
348 |
|
349 |
annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
350 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
351 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
|
|
|
|
352 |
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
353 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
354 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
|
|
355 |
|
356 |
# Zoom in and out on annotator
|
357 |
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
@@ -368,18 +376,23 @@ with app:
|
|
368 |
#annotation_button_get.click(get_boxes_json, annotator, json_boxes)
|
369 |
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
370 |
|
|
|
|
|
371 |
# Page controls at bottom
|
372 |
annotate_current_page_bottom.submit(
|
373 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
374 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
|
|
375 |
|
376 |
annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
377 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
378 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
|
|
379 |
|
380 |
annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
381 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
382 |
-
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
|
|
383 |
|
384 |
# Review side bar controls
|
385 |
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
|
|
|
13 |
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
15 |
from tools.file_redaction import choose_and_run_redactor
|
16 |
+
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
|
17 |
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
|
18 |
from tools.data_anonymise import anonymise_data_files
|
19 |
from tools.auth import authenticate_user
|
|
|
41 |
|
42 |
language = 'en'
|
43 |
|
44 |
+
|
45 |
+
|
46 |
host_name = socket.gethostname()
|
47 |
feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
|
48 |
access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
|
|
|
86 |
log_files_output_list_state = gr.State([])
|
87 |
|
88 |
review_file_state = gr.State(pd.DataFrame())
|
89 |
+
|
90 |
+
do_not_save_pdf_state = gr.State(False)
|
91 |
|
92 |
# Logging state
|
93 |
log_file_name = 'log.csv'
|
|
|
201 |
# Object annotation
|
202 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
203 |
|
204 |
+
with gr.Accordion(label = "Review redaction file", open=True):
|
205 |
output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
|
206 |
upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
|
207 |
|
|
|
348 |
# Page controls at top
|
349 |
annotate_current_page.submit(
|
350 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
351 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
352 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
353 |
|
354 |
annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
355 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
356 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
357 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
358 |
+
|
359 |
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
360 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
361 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
362 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
363 |
|
364 |
# Zoom in and out on annotator
|
365 |
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
|
|
376 |
#annotation_button_get.click(get_boxes_json, annotator, json_boxes)
|
377 |
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
378 |
|
379 |
+
do_not_save_pdf_state
|
380 |
+
|
381 |
# Page controls at bottom
|
382 |
annotate_current_page_bottom.submit(
|
383 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
384 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
385 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
386 |
|
387 |
annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
388 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
389 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
390 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
391 |
|
392 |
annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
393 |
then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
394 |
+
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
395 |
+
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
396 |
|
397 |
# Review side bar controls
|
398 |
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
|
tools/auth.py
CHANGED
@@ -7,13 +7,13 @@ import base64
|
|
7 |
from tools.helper_functions import get_or_create_env_var
|
8 |
|
9 |
client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
|
10 |
-
print(f'The value of AWS_CLIENT_ID is {client_id}')
|
11 |
|
12 |
client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
|
13 |
-
print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
|
14 |
|
15 |
user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
|
16 |
-
print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
|
17 |
|
18 |
def calculate_secret_hash(client_id, client_secret, username):
|
19 |
message = username + client_id
|
|
|
7 |
from tools.helper_functions import get_or_create_env_var
|
8 |
|
9 |
client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
|
10 |
+
#print(f'The value of AWS_CLIENT_ID is {client_id}')
|
11 |
|
12 |
client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
|
13 |
+
#print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
|
14 |
|
15 |
user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
|
16 |
+
#print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
|
17 |
|
18 |
def calculate_secret_hash(client_id, client_secret, username):
|
19 |
message = username + client_id
|
tools/file_conversion.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
-
from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option,
|
3 |
from PIL import Image, ImageFile
|
4 |
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
5 |
import os
|
@@ -48,7 +48,8 @@ def is_pdf(filename):
|
|
48 |
# %%
|
49 |
## Convert pdf to image if necessary
|
50 |
|
51 |
-
|
|
|
52 |
|
53 |
def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
|
54 |
try:
|
@@ -261,7 +262,10 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
|
|
261 |
else:
|
262 |
out_colour = img_annotation_box["color"]
|
263 |
else:
|
264 |
-
|
|
|
|
|
|
|
265 |
|
266 |
shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
|
267 |
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
+
from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
|
3 |
from PIL import Image, ImageFile
|
4 |
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
5 |
import os
|
|
|
48 |
# %%
|
49 |
## Convert pdf to image if necessary
|
50 |
|
51 |
+
CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
|
52 |
+
print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
|
53 |
|
54 |
def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
|
55 |
try:
|
|
|
262 |
else:
|
263 |
out_colour = img_annotation_box["color"]
|
264 |
else:
|
265 |
+
if CUSTOM_BOX_COLOUR == "grey":
|
266 |
+
out_colour = (0.5, 0.5, 0.5)
|
267 |
+
else:
|
268 |
+
out_colour = (0,0,0)
|
269 |
|
270 |
shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
|
271 |
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
tools/redaction_review.py
CHANGED
@@ -55,12 +55,6 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
|
|
55 |
'''
|
56 |
recogniser_entities = []
|
57 |
recogniser_dataframe = pd.DataFrame()
|
58 |
-
#recogniser_entities_drop = gr.Dropdown(value="ALL", allow_custom_value=True)
|
59 |
-
#recogniser_dataframe_gr = gr.Dataframe(pd.DataFrame(data={"page":[""], "label":[""]}))
|
60 |
-
|
61 |
-
#print("recogniser_dataframe_gr", recogniser_dataframe_gr)
|
62 |
-
#print("recogniser_dataframe_gr shape", recogniser_dataframe_gr.shape)
|
63 |
-
#print("recogniser_dataframe_gr.iloc[0,0]:", recogniser_dataframe_gr.iloc[0,0])
|
64 |
|
65 |
if recogniser_dataframe_gr.iloc[0,0] == "":
|
66 |
try:
|
@@ -228,7 +222,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
|
|
228 |
|
229 |
return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
|
230 |
|
231 |
-
def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, progress=gr.Progress(track_tqdm=True)):
|
232 |
'''
|
233 |
Apply modified redactions to a pymupdf and export review files
|
234 |
'''
|
@@ -251,75 +245,76 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
|
|
251 |
file_paths = [file_paths]
|
252 |
|
253 |
for file_path in file_paths:
|
254 |
-
print("file_path:", file_path)
|
255 |
file_base = get_file_path_end(file_path)
|
256 |
|
257 |
file_extension = os.path.splitext(file_path)[1].lower()
|
258 |
|
259 |
-
|
260 |
-
|
261 |
-
|
|
|
262 |
|
263 |
-
|
264 |
|
265 |
-
|
266 |
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
|
273 |
-
|
274 |
|
275 |
-
|
276 |
|
277 |
-
|
278 |
|
279 |
-
|
280 |
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
|
289 |
-
|
290 |
|
291 |
-
|
292 |
|
293 |
-
|
294 |
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
|
312 |
-
|
313 |
-
|
314 |
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
|
324 |
try:
|
325 |
print("Saving annotations to JSON")
|
@@ -331,7 +326,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
|
|
331 |
|
332 |
print("Saving annotations to CSV review file")
|
333 |
|
334 |
-
print("review_file_state:", review_file_state)
|
335 |
|
336 |
# Convert json to csv and also save this
|
337 |
review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
|
|
|
55 |
'''
|
56 |
recogniser_entities = []
|
57 |
recogniser_dataframe = pd.DataFrame()
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
if recogniser_dataframe_gr.iloc[0,0] == "":
|
60 |
try:
|
|
|
222 |
|
223 |
return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
|
224 |
|
225 |
+
def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
|
226 |
'''
|
227 |
Apply modified redactions to a pymupdf and export review files
|
228 |
'''
|
|
|
245 |
file_paths = [file_paths]
|
246 |
|
247 |
for file_path in file_paths:
|
248 |
+
#print("file_path:", file_path)
|
249 |
file_base = get_file_path_end(file_path)
|
250 |
|
251 |
file_extension = os.path.splitext(file_path)[1].lower()
|
252 |
|
253 |
+
if save_pdf == True:
|
254 |
+
# If working with image docs
|
255 |
+
if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
|
256 |
+
image = Image.open(file_paths[-1])
|
257 |
|
258 |
+
#image = pdf_doc
|
259 |
|
260 |
+
draw = ImageDraw.Draw(image)
|
261 |
|
262 |
+
for img_annotation_box in image_annotated['boxes']:
|
263 |
+
coords = [img_annotation_box["xmin"],
|
264 |
+
img_annotation_box["ymin"],
|
265 |
+
img_annotation_box["xmax"],
|
266 |
+
img_annotation_box["ymax"]]
|
267 |
|
268 |
+
fill = img_annotation_box["color"]
|
269 |
|
270 |
+
draw.rectangle(coords, fill=fill)
|
271 |
|
272 |
+
image.save(output_folder + file_base + "_redacted.png")
|
273 |
|
274 |
+
doc = [image]
|
275 |
|
276 |
+
elif file_extension in '.csv':
|
277 |
+
print("This is a csv")
|
278 |
+
pdf_doc = []
|
279 |
|
280 |
+
# If working with pdfs
|
281 |
+
elif is_pdf(file_path) == True:
|
282 |
+
pdf_doc = pymupdf.open(file_path)
|
283 |
|
284 |
+
number_of_pages = pdf_doc.page_count
|
285 |
|
286 |
+
print("Saving pages to file.")
|
287 |
|
288 |
+
for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
|
289 |
|
290 |
+
#print("Saving page", str(i))
|
291 |
+
|
292 |
+
image_loc = all_image_annotations[i]['image']
|
293 |
+
#print("Image location:", image_loc)
|
294 |
|
295 |
+
# Load in image object
|
296 |
+
if isinstance(image_loc, np.ndarray):
|
297 |
+
image = Image.fromarray(image_loc.astype('uint8'))
|
298 |
+
#all_image_annotations[i]['image'] = image_loc.tolist()
|
299 |
+
elif isinstance(image_loc, Image.Image):
|
300 |
+
image = image_loc
|
301 |
+
#image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
|
302 |
+
#image_loc.save(image_out_folder)
|
303 |
+
#all_image_annotations[i]['image'] = image_out_folder
|
304 |
+
elif isinstance(image_loc, str):
|
305 |
+
image = Image.open(image_loc)
|
306 |
|
307 |
+
pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
|
308 |
+
pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
|
309 |
|
310 |
+
else:
|
311 |
+
print("File type not recognised.")
|
312 |
+
|
313 |
+
#try:
|
314 |
+
if pdf_doc:
|
315 |
+
out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
|
316 |
+
pdf_doc.save(out_pdf_file_path)
|
317 |
+
output_files.append(out_pdf_file_path)
|
318 |
|
319 |
try:
|
320 |
print("Saving annotations to JSON")
|
|
|
326 |
|
327 |
print("Saving annotations to CSV review file")
|
328 |
|
329 |
+
#print("review_file_state:", review_file_state)
|
330 |
|
331 |
# Convert json to csv and also save this
|
332 |
review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
|