Commit
·
6b28cfa
1
Parent(s):
a265560
Added capabilities to export to and import from Adobe .xfdf files
Browse files- app.py +22 -5
- tools/helper_functions.py +5 -0
- tools/redaction_review.py +336 -32
app.py
CHANGED
@@ -10,11 +10,11 @@ from datetime import datetime
|
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
|
13 |
-
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
15 |
from tools.file_redaction import choose_and_run_redactor
|
16 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
|
17 |
-
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
|
18 |
from tools.data_anonymise import anonymise_data_files
|
19 |
from tools.auth import authenticate_user
|
20 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
@@ -154,6 +154,8 @@ with app:
|
|
154 |
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
155 |
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
|
156 |
|
|
|
|
|
157 |
###
|
158 |
# UI DESIGN
|
159 |
###
|
@@ -255,7 +257,12 @@ with app:
|
|
255 |
#with gr.Column(scale=1):
|
256 |
with gr.Row():
|
257 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
258 |
-
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
|
|
|
|
|
|
|
|
|
|
|
259 |
|
260 |
###
|
261 |
# TEXT / TABULAR DATA TAB
|
@@ -361,7 +368,8 @@ with app:
|
|
361 |
###
|
362 |
|
363 |
# Upload previous files for modifying redactions
|
364 |
-
upload_previous_review_file_btn.click(fn=
|
|
|
365 |
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
366 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
367 |
|
@@ -419,7 +427,16 @@ with app:
|
|
419 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
420 |
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
421 |
|
422 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
423 |
###
|
424 |
# TABULAR DATA REDACTION
|
425 |
###
|
|
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
|
13 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars
|
14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
15 |
from tools.file_redaction import choose_and_run_redactor
|
16 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
|
17 |
+
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe
|
18 |
from tools.data_anonymise import anonymise_data_files
|
19 |
from tools.auth import authenticate_user
|
20 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
|
|
154 |
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
155 |
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
|
156 |
|
157 |
+
|
158 |
+
|
159 |
###
|
160 |
# UI DESIGN
|
161 |
###
|
|
|
257 |
#with gr.Column(scale=1):
|
258 |
with gr.Row():
|
259 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
260 |
+
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
|
261 |
+
|
262 |
+
with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
|
263 |
+
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
|
264 |
+
adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple')
|
265 |
+
convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
|
266 |
|
267 |
###
|
268 |
# TEXT / TABULAR DATA TAB
|
|
|
368 |
###
|
369 |
|
370 |
# Upload previous files for modifying redactions
|
371 |
+
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
372 |
+
then(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
373 |
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
374 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
375 |
|
|
|
427 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
428 |
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
429 |
|
430 |
+
# Convert review file to xfdf Adobe format
|
431 |
+
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
432 |
+
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
433 |
+
then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
|
434 |
+
|
435 |
+
# Convert xfdf Adobe file back to review_file.csv
|
436 |
+
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
437 |
+
then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
438 |
+
then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
|
439 |
+
|
440 |
###
|
441 |
# TABULAR DATA REDACTION
|
442 |
###
|
tools/helper_functions.py
CHANGED
@@ -22,6 +22,9 @@ def reset_state_vars():
|
|
22 |
interactive=False
|
23 |
), [], [], [], pd.DataFrame(), pd.DataFrame()
|
24 |
|
|
|
|
|
|
|
25 |
def get_or_create_env_var(var_name, default_value):
|
26 |
# Get the environment variable if it exists
|
27 |
value = os.environ.get(var_name)
|
@@ -81,6 +84,8 @@ def detect_file_type(filename):
|
|
81 |
return 'jpeg'
|
82 |
elif filename.endswith('.png'):
|
83 |
return 'png'
|
|
|
|
|
84 |
else:
|
85 |
raise ValueError("Unsupported file type.")
|
86 |
|
|
|
22 |
interactive=False
|
23 |
), [], [], [], pd.DataFrame(), pd.DataFrame()
|
24 |
|
25 |
+
def reset_review_vars():
    '''
    Return blank values for clearing review state before a new review file is loaded.

    Returns:
    - Tuple of (empty list, empty DataFrame, empty DataFrame). Each DataFrame is a
      separate object, so mutating one does not affect the other.
    '''
    blank_choices = []
    blank_results = pd.DataFrame()
    blank_results_base = pd.DataFrame()

    return blank_choices, blank_results, blank_results_base
|
27 |
+
|
28 |
def get_or_create_env_var(var_name, default_value):
|
29 |
# Get the environment variable if it exists
|
30 |
value = os.environ.get(var_name)
|
|
|
84 |
return 'jpeg'
|
85 |
elif filename.endswith('.png'):
|
86 |
return 'png'
|
87 |
+
elif filename.endswith('.xfdf'):
|
88 |
+
return 'xfdf'
|
89 |
else:
|
90 |
raise ValueError("Unsupported file type.")
|
91 |
|
tools/redaction_review.py
CHANGED
@@ -1,14 +1,14 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
-
from xml.etree.ElementTree import Element, SubElement, tostring
|
5 |
from xml.dom import minidom
|
6 |
import uuid
|
7 |
from typing import List
|
8 |
from gradio_image_annotation import image_annotator
|
9 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
10 |
-
from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
|
11 |
-
from tools.helper_functions import get_file_path_end, output_folder
|
12 |
from tools.file_redaction import redact_page_with_pymupdf
|
13 |
import json
|
14 |
import os
|
@@ -383,10 +383,46 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
|
|
383 |
row_value_page = evt.row_value[0] # This is the page number value
|
384 |
return row_value_page
|
385 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
386 |
|
387 |
|
388 |
-
|
389 |
-
|
|
|
|
|
|
|
390 |
# Create root element
|
391 |
xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
|
392 |
|
@@ -397,47 +433,315 @@ def create_xfdf(df, pdf_path):
|
|
397 |
# Add annots
|
398 |
annots = SubElement(xfdf, 'annots')
|
399 |
|
400 |
-
# Process each row in dataframe
|
401 |
for _, row in df.iterrows():
|
402 |
-
|
403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
404 |
|
405 |
-
# Generate unique ID
|
406 |
annot_id = str(uuid.uuid4())
|
407 |
-
|
408 |
|
409 |
# Set page number (subtract 1 as PDF pages are 0-based)
|
410 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
411 |
|
412 |
-
# Set coordinates
|
413 |
-
|
414 |
-
text_annot.set('rect', f"{row['xmin']},{row['ymin']},{row['xmax']},{row['ymax']}")
|
415 |
|
416 |
-
# Set
|
417 |
-
|
418 |
-
|
|
|
|
|
419 |
|
420 |
-
# Set
|
421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
422 |
|
423 |
-
#
|
424 |
-
|
425 |
-
|
426 |
-
|
|
|
|
|
|
|
|
|
|
|
427 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
428 |
# Convert to pretty XML string
|
429 |
xml_str = minidom.parseString(tostring(xfdf)).toprettyxml(indent=" ")
|
430 |
|
431 |
return xml_str
|
432 |
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
437 |
|
438 |
-
|
439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
440 |
|
441 |
-
#
|
442 |
-
|
443 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
+
from xml.etree.ElementTree import Element, SubElement, tostring, parse
|
5 |
from xml.dom import minidom
|
6 |
import uuid
|
7 |
from typing import List
|
8 |
from gradio_image_annotation import image_annotator
|
9 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
10 |
+
from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
|
11 |
+
from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
|
12 |
from tools.file_redaction import redact_page_with_pymupdf
|
13 |
import json
|
14 |
import os
|
|
|
383 |
row_value_page = evt.row_value[0] # This is the page number value
|
384 |
return row_value_page
|
385 |
|
386 |
+
def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
    '''
    Map a bounding box from image pixel space onto Adobe PDF page space.

    Parameters:
    - pdf_page_width: Width of the PDF page
    - pdf_page_height: Height of the PDF page
    - image_width: Width of the source image
    - image_height: Height of the source image
    - x1, y1, x2, y2: Box corners in image space

    Returns:
    - Tuple (x1, y1, x2, y2) in PDF space, with the two y values ordered so that
      the first is never greater than the second. The x values are scaled but
      not reordered.
    '''
    # Ratio of PDF units to image pixels along each axis
    x_ratio = pdf_page_width / image_width
    y_ratio = pdf_page_height / image_height

    left = x1 * x_ratio
    right = x2 * x_ratio

    # The y axis is measured from the bottom of the page in PDF space, so each
    # image y value is scaled and then subtracted from the page height.
    first_y = pdf_page_height - (y1 * y_ratio)
    second_y = pdf_page_height - (y2 * y_ratio)

    # Flipping the axis reverses the order of the two values; put the smaller first.
    lower_y, upper_y = (second_y, first_y) if first_y > second_y else (first_y, second_y)

    return left, lower_y, right, upper_y
|
419 |
|
420 |
|
421 |
+
def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
|
422 |
+
'''
|
423 |
+
Create an xfdf file from a review csv file and a pdf
|
424 |
+
'''
|
425 |
+
|
426 |
# Create root element
|
427 |
xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
|
428 |
|
|
|
433 |
# Add annots
|
434 |
annots = SubElement(xfdf, 'annots')
|
435 |
|
|
|
436 |
for _, row in df.iterrows():
|
437 |
+
page_python_format = int(row["page"])-1
|
438 |
+
|
439 |
+
pymupdf_page = pymupdf_doc.load_page(page_python_format)
|
440 |
+
|
441 |
+
pdf_page_height = pymupdf_page.rect.height
|
442 |
+
pdf_page_width = pymupdf_page.rect.width
|
443 |
+
|
444 |
+
image = image_paths[page_python_format]
|
445 |
+
|
446 |
+
#print("image:", image)
|
447 |
+
|
448 |
+
if isinstance(image, str):
|
449 |
+
image = Image.open(image)
|
450 |
+
|
451 |
+
image_page_width, image_page_height = image.size
|
452 |
+
|
453 |
+
# Create redaction annotation
|
454 |
+
redact_annot = SubElement(annots, 'redact')
|
455 |
|
456 |
+
# Generate unique ID
|
457 |
annot_id = str(uuid.uuid4())
|
458 |
+
redact_annot.set('name', annot_id)
|
459 |
|
460 |
# Set page number (subtract 1 as PDF pages are 0-based)
|
461 |
+
redact_annot.set('page', str(int(row['page']) - 1))
|
462 |
+
|
463 |
+
# Convert coordinates
|
464 |
+
x1, y1, x2, y2 = convert_image_coords_to_adobe(
|
465 |
+
pdf_page_width,
|
466 |
+
pdf_page_height,
|
467 |
+
image_page_width,
|
468 |
+
image_page_height,
|
469 |
+
row['xmin'],
|
470 |
+
row['ymin'],
|
471 |
+
row['xmax'],
|
472 |
+
row['ymax']
|
473 |
+
)
|
474 |
+
|
475 |
+
if CUSTOM_BOX_COLOUR == "grey":
|
476 |
+
colour_str = "0.5,0.5,0.5"
|
477 |
+
else:
|
478 |
+
colour_str = row['color'].strip('()').replace(' ', '')
|
479 |
|
480 |
+
# Set coordinates
|
481 |
+
redact_annot.set('rect', f"{x1:.2f},{y1:.2f},{x2:.2f},{y2:.2f}")
|
|
|
482 |
|
483 |
+
# Set redaction properties
|
484 |
+
redact_annot.set('title', row['label']) # The type of redaction (e.g., "PERSON")
|
485 |
+
redact_annot.set('contents', row['text']) # The redacted text
|
486 |
+
redact_annot.set('subject', row['label']) # The redacted text
|
487 |
+
redact_annot.set('mimetype', "Form")
|
488 |
|
489 |
+
# Set appearance properties
|
490 |
+
redact_annot.set('border-color', colour_str) # Black border
|
491 |
+
redact_annot.set('repeat', 'false')
|
492 |
+
redact_annot.set('interior-color', colour_str)
|
493 |
+
#redact_annot.set('fill-color', colour_str)
|
494 |
+
#redact_annot.set('outline-color', colour_str)
|
495 |
+
redact_annot.set('overlay-color', colour_str)
|
496 |
+
redact_annot.set('overlay-text', row['label'])
|
497 |
+
redact_annot.set('opacity', "0.5")
|
498 |
+
|
499 |
+
# Add appearance dictionary
|
500 |
+
# appearanceDict = SubElement(redact_annot, 'appearancedict')
|
501 |
|
502 |
+
# # Normal appearance
|
503 |
+
# normal = SubElement(appearanceDict, 'normal')
|
504 |
+
# #normal.set('appearance', 'redact')
|
505 |
+
|
506 |
+
# # Color settings for the mark (before applying redaction)
|
507 |
+
# markAppearance = SubElement(redact_annot, 'markappearance')
|
508 |
+
# markAppearance.set('stroke-color', colour_str) # Red outline
|
509 |
+
# markAppearance.set('fill-color', colour_str) # Light red fill
|
510 |
+
# markAppearance.set('opacity', '0.5') # 50% opacity
|
511 |
|
512 |
+
# # Final redaction appearance (after applying)
|
513 |
+
# redactAppearance = SubElement(redact_annot, 'redactAppearance')
|
514 |
+
# redactAppearance.set('fillColor', colour_str) # Black fill
|
515 |
+
# redactAppearance.set('fontName', 'Helvetica')
|
516 |
+
# redactAppearance.set('fontSize', '12')
|
517 |
+
# redactAppearance.set('textAlignment', 'left')
|
518 |
+
# redactAppearance.set('textColor', colour_str) # White text
|
519 |
+
|
520 |
# Convert to pretty XML string
|
521 |
xml_str = minidom.parseString(tostring(xfdf)).toprettyxml(indent=" ")
|
522 |
|
523 |
return xml_str
|
524 |
|
525 |
+
def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
    '''
    Convert review .csv files into Adobe .xfdf comment files.

    Parameters:
    - input_files: A single path, or a list of paths / objects exposing a `.name`
      path attribute. PDFs supply the document name; each .csv is converted.
    - pdf_doc: Document object passed through to create_xfdf
    - image_paths: Per-page images (or image paths) passed through to create_xfdf

    Returns:
    - List of paths to the .xfdf files written (empty if no input files were given)
    '''
    output_paths = []
    pdf_name = ""

    # Nothing uploaded (None or empty): nothing to convert
    if not input_files:
        return output_paths

    if isinstance(input_files, str):
        input_files = [input_files]

    # Resolve every entry to a plain path string BEFORE sorting, as the sort key
    # calls os.path.splitext, which fails on non-string file objects.
    file_paths_list = [file if isinstance(file, str) else file.name for file in input_files]

    # Sort the file paths so that the pdfs come first
    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))

    for file_path in file_paths_list:

        file_path_name = get_file_path_end(file_path)
        file_path_end = detect_file_type(file_path)

        if file_path_end == "pdf":
            pdf_name = os.path.basename(file_path)

        if file_path_end == "csv":
            # If no pdf name, just get the name of the file path
            if not pdf_name:
                pdf_name = file_path_name

            # Read CSV file
            df = pd.read_csv(file_path)

            df.fillna('', inplace=True)  # Replace NaN with an empty string

            xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths)

            output_path = output_folder + file_path_name + "_adobe.xfdf"

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(xfdf_content)

            output_paths.append(output_path)

    return output_paths
|
572 |
+
|
573 |
+
|
574 |
+
### Convert xfdf coordinates back to image for app
|
575 |
+
|
576 |
+
def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
    '''
    Map a bounding box from Adobe PDF page space back onto image pixel space.

    Parameters:
    - pdf_page_width: Width of the PDF page
    - pdf_page_height: Height of the PDF page
    - image_width: Width of the source image
    - image_height: Height of the source image
    - x1, y1, x2, y2: Box corners in Adobe PDF space

    Returns:
    - Tuple (x1, y1, x2, y2) in image space, with the two y values ordered so that
      the first is never greater than the second. The x values are scaled but
      not reordered.
    '''
    # Ratio of image pixels to PDF units along each axis
    x_ratio = image_width / pdf_page_width
    y_ratio = image_height / pdf_page_height

    left = x1 * x_ratio
    right = x2 * x_ratio

    # PDF y values are measured from the bottom of the page, so take the distance
    # from the top of the page first and then scale into pixels.
    first_y = (pdf_page_height - y1) * y_ratio
    second_y = (pdf_page_height - y2) * y_ratio

    # Flipping the axis reverses the order of the two values; put the smaller first.
    top_y, bottom_y = (second_y, first_y) if first_y > second_y else (first_y, second_y)

    return left, top_y, right, bottom_y
|
609 |
+
|
610 |
+
def parse_xfdf(xfdf_path):
    '''
    Parse an XFDF file and extract its redaction annotations.

    Parameters:
    - xfdf_path: Path to the XFDF file

    Returns:
    - List of dictionaries, one per <redact> element, with keys
      'image', 'page', 'xmin', 'ymin', 'xmax', 'ymax', 'label', 'text', 'color'.
      'page' is 1-based; the coordinates are the raw 'rect' values from the file.

    Raises:
    - ValueError: if a <redact> element has no 'page' attribute, or its 'rect'
      attribute is missing or has fewer than four comma-separated values.
    '''
    tree = parse(xfdf_path)
    root = tree.getroot()

    # Define the namespace
    namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}

    redactions = []

    # Find all redact elements using the namespace
    for redact in root.findall('.//xfdf:redact', namespaces=namespace):

        page_attr = redact.get('page')
        rect_attr = redact.get('rect')

        if page_attr is None or rect_attr is None:
            raise ValueError("Redaction annotation in .xfdf file is missing a 'page' or 'rect' attribute")

        # Split the rectangle once rather than once per coordinate
        rect_values = rect_attr.split(',')
        if len(rect_values) < 4:
            raise ValueError("Redaction annotation in .xfdf file has an invalid 'rect' attribute: " + rect_attr)

        xmin, ymin, xmax, ymax = (float(value) for value in rect_values[:4])

        redaction_info = {
            'image': '',  # Image will be filled in later
            'page': int(page_attr) + 1,  # Convert to 1-based index
            'xmin': xmin,
            'ymin': ymin,
            'xmax': xmax,
            'ymax': ymax,
            'label': redact.get('title'),
            'text': redact.get('contents'),
            'color': redact.get('border-color', '(0, 0, 0)')  # Default to black if not specified
        }
        redactions.append(redaction_info)

    # Deliberately not printing the parsed redactions: 'text' holds the content
    # being redacted and should not be written to stdout/logs.

    return redactions
|
649 |
+
|
650 |
+
def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
    '''
    Convert Adobe .xfdf redaction comments into review_file.csv files.

    Parameters:
    - file_paths_list: A single path, or a list of paths / objects exposing a
      `.name` path attribute. Must include the original PDF alongside each .xfdf.
    - pymupdf_doc: Document object; `load_page(i).rect` supplies each page's size
    - image_paths: Per-page images, indexed by 0-based page number. Each entry is
      either a path string or an already-opened image object exposing `.size`.

    Returns:
    - List of output paths: the input PDF path(s) plus one "_review_file.csv"
      per .xfdf file converted (empty if no input files were given)

    Raises:
    - ValueError: if an .xfdf file is supplied without a PDF
    '''
    output_paths = []
    # Must start empty so the "no PDF supplied" check below raises the intended
    # ValueError instead of failing on an unassigned variable.
    pdf_name = ""

    # Nothing uploaded (None or empty): nothing to convert
    if not file_paths_list:
        return output_paths

    if isinstance(file_paths_list, str):
        file_paths_list = [file_paths_list]

    # Resolve every entry to a plain path string BEFORE sorting, as the sort key
    # calls os.path.splitext, which fails on non-string file objects.
    resolved_paths = [file if isinstance(file, str) else file.name for file in file_paths_list]

    # Sort the file paths so that the pdfs come first
    resolved_paths = sorted(resolved_paths, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))

    for file_path in resolved_paths:

        file_path_name = get_file_path_end(file_path)
        file_path_end = detect_file_type(file_path)

        if file_path_end == "pdf":
            pdf_name = os.path.basename(file_path)

            # Add pdf to outputs
            output_paths.append(file_path)

        if file_path_end == "xfdf":

            if not pdf_name:
                message = "Original PDF needed to convert from .xfdf format"
                print(message)
                raise ValueError(message)

            # Parse the XFDF file
            redactions = parse_xfdf(file_path)

            # Create a DataFrame from the redaction information
            df = pd.DataFrame(redactions)

            df.fillna('', inplace=True)  # Replace NaN with an empty string

            for row_index, row in df.iterrows():
                page_python_format = int(row["page"])-1

                pymupdf_page = pymupdf_doc.load_page(page_python_format)

                pdf_page_height = pymupdf_page.rect.height
                pdf_page_width = pymupdf_page.rect.width

                image_path = image_paths[page_python_format]

                # Entries may be paths or already-loaded images; read the size in
                # both cases, and close any file handle opened here.
                if isinstance(image_path, str):
                    with Image.open(image_path) as image:
                        image_page_width, image_page_height = image.size
                else:
                    image_page_width, image_page_height = image_path.size

                # Convert to image coordinates
                image_x1, image_y1, image_x2, image_y2 = convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_page_width, image_page_height, row['xmin'], row['ymin'], row['xmax'], row['ymax'])

                df.loc[row_index, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2]

                # Record which page image the box belongs to
                df.loc[row_index, 'image'] = image_path

            out_file_path = output_folder + file_path_name + "_review_file.csv"
            df.to_csv(out_file_path, index=None)

            output_paths.append(out_file_path)

    return output_paths
|