seanpedrickcase committed on
Commit
6b28cfa
·
1 Parent(s): a265560

Added capabilities to export to and import from Adobe .xfdf files

Browse files
Files changed (3) hide show
  1. app.py +22 -5
  2. tools/helper_functions.py +5 -0
  3. tools/redaction_review.py +336 -32
app.py CHANGED
@@ -10,11 +10,11 @@ from datetime import datetime
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
17
- from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
18
  from tools.data_anonymise import anonymise_data_files
19
  from tools.auth import authenticate_user
20
  from tools.load_spacy_model_custom_recognisers import custom_entities
@@ -154,6 +154,8 @@ with app:
154
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
155
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
156
 
 
 
157
  ###
158
  # UI DESIGN
159
  ###
@@ -255,7 +257,12 @@ with app:
255
  #with gr.Column(scale=1):
256
  with gr.Row():
257
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
258
- recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
 
 
 
 
 
259
 
260
  ###
261
  # TEXT / TABULAR DATA TAB
@@ -361,7 +368,8 @@ with app:
361
  ###
362
 
363
  # Upload previous files for modifying redactions
364
- upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
 
365
  then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
366
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
367
 
@@ -419,7 +427,16 @@ with app:
419
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
420
  then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
421
 
422
-
 
 
 
 
 
 
 
 
 
423
  ###
424
  # TABULAR DATA REDACTION
425
  ###
 
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
17
+ from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe
18
  from tools.data_anonymise import anonymise_data_files
19
  from tools.auth import authenticate_user
20
  from tools.load_spacy_model_custom_recognisers import custom_entities
 
154
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
155
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
156
 
157
+
158
+
159
  ###
160
  # UI DESIGN
161
  ###
 
257
  #with gr.Column(scale=1):
258
  with gr.Row():
259
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
260
+ recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
261
+
262
+ with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
263
+ convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
264
+ adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple')
265
+ convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
266
 
267
  ###
268
  # TEXT / TABULAR DATA TAB
 
368
  ###
369
 
370
  # Upload previous files for modifying redactions
371
+ upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
372
+ then(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
373
  then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
374
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
375
 
 
427
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
428
  then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
429
 
430
+ # Convert review file to xfdf Adobe format
431
+ convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
432
+ then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
433
+ then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
434
+
435
+ # Convert xfdf Adobe file back to review_file.csv
436
+ convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
437
+ then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
438
+ then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
439
+
440
  ###
441
  # TABULAR DATA REDACTION
442
  ###
tools/helper_functions.py CHANGED
@@ -22,6 +22,9 @@ def reset_state_vars():
22
  interactive=False
23
  ), [], [], [], pd.DataFrame(), pd.DataFrame()
24
 
 
 
 
25
  def get_or_create_env_var(var_name, default_value):
26
  # Get the environment variable if it exists
27
  value = os.environ.get(var_name)
@@ -81,6 +84,8 @@ def detect_file_type(filename):
81
  return 'jpeg'
82
  elif filename.endswith('.png'):
83
  return 'png'
 
 
84
  else:
85
  raise ValueError("Unsupported file type.")
86
 
 
22
  interactive=False
23
  ), [], [], [], pd.DataFrame(), pd.DataFrame()
24
 
25
def reset_review_vars():
    '''
    Build blank values used to clear the review search widgets.

    Returns:
    - Tuple of (empty list, empty DataFrame, empty DataFrame). In app.py these
      are wired to the entity dropdown, the entity dataframe and its base copy.
    '''
    blank_dropdown_choices = []
    blank_entity_df = pd.DataFrame()
    blank_entity_df_base = pd.DataFrame()

    return blank_dropdown_choices, blank_entity_df, blank_entity_df_base
27
+
28
  def get_or_create_env_var(var_name, default_value):
29
  # Get the environment variable if it exists
30
  value = os.environ.get(var_name)
 
84
  return 'jpeg'
85
  elif filename.endswith('.png'):
86
  return 'png'
87
+ elif filename.endswith('.xfdf'):
88
+ return 'xfdf'
89
  else:
90
  raise ValueError("Unsupported file type.")
91
 
tools/redaction_review.py CHANGED
@@ -1,14 +1,14 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
- from xml.etree.ElementTree import Element, SubElement, tostring
5
  from xml.dom import minidom
6
  import uuid
7
  from typing import List
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
10
- from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
11
- from tools.helper_functions import get_file_path_end, output_folder
12
  from tools.file_redaction import redact_page_with_pymupdf
13
  import json
14
  import os
@@ -383,10 +383,46 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
383
  row_value_page = evt.row_value[0] # This is the page number value
384
  return row_value_page
385
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
 
387
 
388
-
389
- def create_xfdf(df, pdf_path):
 
 
 
390
  # Create root element
391
  xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
392
 
@@ -397,47 +433,315 @@ def create_xfdf(df, pdf_path):
397
  # Add annots
398
  annots = SubElement(xfdf, 'annots')
399
 
400
- # Process each row in dataframe
401
  for _, row in df.iterrows():
402
- # Create text annotation
403
- text_annot = SubElement(annots, 'text')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
 
405
- # Generate unique ID for each annotation
406
  annot_id = str(uuid.uuid4())
407
- text_annot.set('name', annot_id)
408
 
409
  # Set page number (subtract 1 as PDF pages are 0-based)
410
- text_annot.set('page', str(int(row['page']) - 1))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
 
412
- # Set coordinates (convert to PDF coordinate system)
413
- # Note: You might need to adjust these calculations based on your PDF dimensions
414
- text_annot.set('rect', f"{row['xmin']},{row['ymin']},{row['xmax']},{row['ymax']}")
415
 
416
- # Set color (convert RGB tuple string to comma-separated values)
417
- color_str = row['color'].strip('()').replace(' ', '')
418
- text_annot.set('color', color_str)
 
 
419
 
420
- # Set text content
421
- text_annot.set('contents', f"{row['label']}: {row['text']}")
 
 
 
 
 
 
 
 
 
 
422
 
423
- # Set additional properties
424
- text_annot.set('flags', "print")
425
- text_annot.set('date', "D:20240123000000")
426
- text_annot.set('title', "Annotation")
 
 
 
 
 
427
 
 
 
 
 
 
 
 
 
428
  # Convert to pretty XML string
429
  xml_str = minidom.parseString(tostring(xfdf)).toprettyxml(indent=" ")
430
 
431
  return xml_str
432
 
433
- # Example usage:
434
- # Assuming your dataframe is named 'df' and you want to create annotations for 'example.pdf'
435
- def convert_df_to_xfdf(df, pdf_path, output_path):
436
- xfdf_content = create_xfdf(df, pdf_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
 
438
- with open(output_path, 'w', encoding='utf-8') as f:
439
- f.write(xfdf_content)
 
 
 
 
 
 
 
 
 
 
440
 
441
- # Usage example:
442
- # df = your_dataframe
443
- # convert_df_to_xfdf(df, 'path/to/your.pdf', 'output.xfdf')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
+ from xml.etree.ElementTree import Element, SubElement, tostring, parse
5
  from xml.dom import minidom
6
  import uuid
7
  from typing import List
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
10
+ from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
11
+ from tools.helper_functions import get_file_path_end, output_folder, detect_file_type
12
  from tools.file_redaction import redact_page_with_pymupdf
13
  import json
14
  import os
 
383
  row_value_page = evt.row_value[0] # This is the page number value
384
  return row_value_page
385
 
386
def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
    '''
    Converts coordinates from image space to Adobe PDF space.

    Image space has its origin at the top-left with y growing downwards; Adobe
    PDF space has its origin at the bottom-left with y growing upwards. The box
    is scaled by the page/image size ratio and the vertical axis is flipped.

    Parameters:
    - pdf_page_width: Width of the PDF page
    - pdf_page_height: Height of the PDF page
    - image_width: Width of the source image (must be non-zero)
    - image_height: Height of the source image (must be non-zero)
    - x1, y1, x2, y2: Coordinates in image space

    Returns:
    - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space,
      ordered so that x1 <= x2 and y1 <= y2
    '''

    # Calculate scaling factors
    scale_width = pdf_page_width / image_width
    scale_height = pdf_page_height / image_height

    # Convert X coordinates (same direction in both systems)
    pdf_x1 = x1 * scale_width
    pdf_x2 = x2 * scale_width

    # Convert Y coordinates (flip vertical axis)
    # Adobe coordinates start from bottom-left
    pdf_y1 = pdf_page_height - (y1 * scale_height)
    pdf_y2 = pdf_page_height - (y2 * scale_height)

    # A rect is lower-left / upper-right, so order both axes. The flip always
    # inverts y; x is only inverted if the box was supplied right-to-left.
    if pdf_x1 > pdf_x2:
        pdf_x1, pdf_x2 = pdf_x2, pdf_x1

    if pdf_y1 > pdf_y2:
        pdf_y1, pdf_y2 = pdf_y2, pdf_y1

    return pdf_x1, pdf_y1, pdf_x2, pdf_y2
419
 
420
 
421
+ def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
422
+ '''
423
+ Create an xfdf file from a review csv file and a pdf
424
+ '''
425
+
426
  # Create root element
427
  xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
428
 
 
433
  # Add annots
434
  annots = SubElement(xfdf, 'annots')
435
 
 
436
  for _, row in df.iterrows():
437
+ page_python_format = int(row["page"])-1
438
+
439
+ pymupdf_page = pymupdf_doc.load_page(page_python_format)
440
+
441
+ pdf_page_height = pymupdf_page.rect.height
442
+ pdf_page_width = pymupdf_page.rect.width
443
+
444
+ image = image_paths[page_python_format]
445
+
446
+ #print("image:", image)
447
+
448
+ if isinstance(image, str):
449
+ image = Image.open(image)
450
+
451
+ image_page_width, image_page_height = image.size
452
+
453
+ # Create redaction annotation
454
+ redact_annot = SubElement(annots, 'redact')
455
 
456
+ # Generate unique ID
457
  annot_id = str(uuid.uuid4())
458
+ redact_annot.set('name', annot_id)
459
 
460
  # Set page number (subtract 1 as PDF pages are 0-based)
461
+ redact_annot.set('page', str(int(row['page']) - 1))
462
+
463
+ # Convert coordinates
464
+ x1, y1, x2, y2 = convert_image_coords_to_adobe(
465
+ pdf_page_width,
466
+ pdf_page_height,
467
+ image_page_width,
468
+ image_page_height,
469
+ row['xmin'],
470
+ row['ymin'],
471
+ row['xmax'],
472
+ row['ymax']
473
+ )
474
+
475
+ if CUSTOM_BOX_COLOUR == "grey":
476
+ colour_str = "0.5,0.5,0.5"
477
+ else:
478
+ colour_str = row['color'].strip('()').replace(' ', '')
479
 
480
+ # Set coordinates
481
+ redact_annot.set('rect', f"{x1:.2f},{y1:.2f},{x2:.2f},{y2:.2f}")
 
482
 
483
+ # Set redaction properties
484
+ redact_annot.set('title', row['label']) # The type of redaction (e.g., "PERSON")
485
+ redact_annot.set('contents', row['text']) # The redacted text
486
+ redact_annot.set('subject', row['label']) # The redacted text
487
+ redact_annot.set('mimetype', "Form")
488
 
489
+ # Set appearance properties
490
+ redact_annot.set('border-color', colour_str) # Black border
491
+ redact_annot.set('repeat', 'false')
492
+ redact_annot.set('interior-color', colour_str)
493
+ #redact_annot.set('fill-color', colour_str)
494
+ #redact_annot.set('outline-color', colour_str)
495
+ redact_annot.set('overlay-color', colour_str)
496
+ redact_annot.set('overlay-text', row['label'])
497
+ redact_annot.set('opacity', "0.5")
498
+
499
+ # Add appearance dictionary
500
+ # appearanceDict = SubElement(redact_annot, 'appearancedict')
501
 
502
+ # # Normal appearance
503
+ # normal = SubElement(appearanceDict, 'normal')
504
+ # #normal.set('appearance', 'redact')
505
+
506
+ # # Color settings for the mark (before applying redaction)
507
+ # markAppearance = SubElement(redact_annot, 'markappearance')
508
+ # markAppearance.set('stroke-color', colour_str) # Red outline
509
+ # markAppearance.set('fill-color', colour_str) # Light red fill
510
+ # markAppearance.set('opacity', '0.5') # 50% opacity
511
 
512
+ # # Final redaction appearance (after applying)
513
+ # redactAppearance = SubElement(redact_annot, 'redactAppearance')
514
+ # redactAppearance.set('fillColor', colour_str) # Black fill
515
+ # redactAppearance.set('fontName', 'Helvetica')
516
+ # redactAppearance.set('fontSize', '12')
517
+ # redactAppearance.set('textAlignment', 'left')
518
+ # redactAppearance.set('textColor', colour_str) # White text
519
+
520
  # Convert to pretty XML string
521
  xml_str = minidom.parseString(tostring(xfdf)).toprettyxml(indent=" ")
522
 
523
  return xml_str
524
 
525
def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
    '''
    Load in files to convert a review file into an Adobe comment file format

    Parameters:
    - input_files: A single path, or a list of paths / file objects exposing
      a .name attribute. Review .csv files are converted; a .pdf in the list
      supplies the document name passed to create_xfdf.
    - pdf_doc: Document object forwarded to create_xfdf
    - image_paths: Per-page images (or image paths) forwarded to create_xfdf

    Returns:
    - List of paths to the .xfdf files written to the output folder
    '''
    output_paths = []
    pdf_name = ""

    # Nothing uploaded: return an empty result rather than failing in sorted()
    if not input_files:
        return output_paths

    if isinstance(input_files, str):
        input_files = [input_files]

    # Resolve every entry to a path string BEFORE sorting, because the sort key
    # inspects the file extension and would fail on non-string file objects
    file_paths_list = [file if isinstance(file, str) else file.name for file in input_files]

    # Sort the file paths so that the pdfs come first
    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))

    for file_path in file_paths_list:

        file_path_name = get_file_path_end(file_path)
        file_path_end = detect_file_type(file_path)

        if file_path_end == "pdf":
            pdf_name = os.path.basename(file_path)

        if file_path_end == "csv":
            # If no pdf name, just get the name of the file path
            if not pdf_name:
                pdf_name = file_path_name

            # Read CSV file
            df = pd.read_csv(file_path)

            df.fillna('', inplace=True)  # Replace NaN with an empty string

            xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths)

            output_path = output_folder + file_path_name + "_adobe.xfdf"

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(xfdf_content)

            output_paths.append(output_path)

    return output_paths
572
+
573
+
574
+ ### Convert xfdf coordinates back to image for app
575
+
576
def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
    '''
    Converts coordinates from Adobe PDF space to image space.

    Adobe PDF space has its origin at the bottom-left with y growing upwards;
    image space has its origin at the top-left with y growing downwards. The
    box is scaled by the image/page size ratio and the vertical axis is flipped.

    Parameters:
    - pdf_page_width: Width of the PDF page (must be non-zero)
    - pdf_page_height: Height of the PDF page (must be non-zero)
    - image_width: Width of the source image
    - image_height: Height of the source image
    - x1, y1, x2, y2: Coordinates in Adobe PDF space

    Returns:
    - Tuple of converted coordinates (x1, y1, x2, y2) in image space,
      ordered so that x1 <= x2 and y1 <= y2
    '''

    # Calculate scaling factors
    scale_width = image_width / pdf_page_width
    scale_height = image_height / pdf_page_height

    # Convert X coordinates (same direction in both systems)
    image_x1 = x1 * scale_width
    image_x2 = x2 * scale_width

    # Convert Y coordinates (flip vertical axis)
    # Adobe coordinates start from bottom-left
    image_y1 = (pdf_page_height - y1) * scale_height
    image_y2 = (pdf_page_height - y2) * scale_height

    # Order both axes so the result is always (xmin, ymin, xmax, ymax). The
    # flip always inverts y; x is only inverted if the rect was stored reversed.
    if image_x1 > image_x2:
        image_x1, image_x2 = image_x2, image_x1

    if image_y1 > image_y2:
        image_y1, image_y2 = image_y2, image_y1

    return image_x1, image_y1, image_x2, image_y2
609
+
610
def parse_xfdf(xfdf_path):
    '''
    Parse the XFDF file and extract redaction annotations.

    Parameters:
    - xfdf_path: Path to the XFDF file

    Returns:
    - List of dictionaries containing redaction information. Each dictionary
      has the keys 'image', 'page' (1-based), 'xmin', 'ymin', 'xmax', 'ymax'
      (taken from the 'rect' attribute, still in PDF coordinates), 'label',
      'text' and 'color'.

    Raises:
    - ValueError: If a redact annotation has no 'page' or 'rect' attribute, or
      its 'rect' does not hold four numeric values.
    '''
    tree = parse(xfdf_path)
    root = tree.getroot()

    # Define the namespace
    namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}

    redactions = []

    # Find all redact elements using the namespace
    for redact in root.findall('.//xfdf:redact', namespaces=namespace):

        page_attr = redact.get('page')
        rect_attr = redact.get('rect')

        # Fail loudly: silently dropping an annotation would lose a redaction
        if page_attr is None or rect_attr is None:
            raise ValueError("Redact annotation in XFDF file is missing its 'page' or 'rect' attribute")

        # Split the rect once rather than once per coordinate
        rect_parts = rect_attr.split(',')
        if len(rect_parts) < 4:
            raise ValueError("Redact annotation 'rect' must contain four comma-separated values, got: " + rect_attr)

        xmin, ymin, xmax, ymax = (float(part) for part in rect_parts[:4])

        redaction_info = {
            'image': '',  # Image will be filled in later
            'page': int(page_attr) + 1,  # Convert to 1-based index
            'xmin': xmin,
            'ymin': ymin,
            'xmax': xmax,
            'ymax': ymax,
            'label': redact.get('title'),
            'text': redact.get('contents'),
            'color': redact.get('border-color', '(0, 0, 0)')  # Default to black if not specified
        }
        redactions.append(redaction_info)

    # NOTE: the parsed redactions are deliberately not printed - 'text' holds
    # the content being redacted and should not be written to logs.

    return redactions
649
+
650
def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
    '''
    Convert redaction annotations from XFDF files into review_file.csv files.

    Parameters:
    - file_paths_list: A single path, or a list of paths / file objects
      exposing a .name attribute. Must contain the original .pdf alongside
      the .xfdf file(s) to convert.
    - pymupdf_doc: PyMuPDF document object, used to read each page's size
    - image_paths: Per-page images for the PDF, either as file paths or as
      already-loaded image objects exposing a .size attribute

    Returns:
    - List of output file paths: the original pdf path(s) followed by one
      "_review_file.csv" path per converted .xfdf file

    Raises:
    - ValueError: If an .xfdf file is supplied without the original PDF
    '''
    output_paths = []
    # Must be initialised: it is tested below before any pdf may have been seen
    pdf_name = ""

    # Nothing uploaded: return an empty result rather than failing in sorted()
    if not file_paths_list:
        return output_paths

    if isinstance(file_paths_list, str):
        file_paths_list = [file_paths_list]

    # Resolve every entry to a path string BEFORE sorting, because the sort key
    # inspects the file extension and would fail on non-string file objects
    resolved_paths = [file if isinstance(file, str) else file.name for file in file_paths_list]

    # Sort the file paths so that the pdfs come first
    resolved_paths = sorted(resolved_paths, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))

    for file_path in resolved_paths:

        file_path_name = get_file_path_end(file_path)
        file_path_end = detect_file_type(file_path)

        if file_path_end == "pdf":
            pdf_name = os.path.basename(file_path)

            # Add pdf to outputs
            output_paths.append(file_path)

        if file_path_end == "xfdf":

            if not pdf_name:
                message = "Original PDF needed to convert from .xfdf format"
                print(message)
                raise ValueError(message)

            # Parse the XFDF file
            redactions = parse_xfdf(file_path)

            # Create a DataFrame from the redaction information
            df = pd.DataFrame(redactions)

            df.fillna('', inplace=True)  # Replace NaN with an empty string

            for row_index, row in df.iterrows():
                page_python_format = int(row["page"])-1

                pymupdf_page = pymupdf_doc.load_page(page_python_format)

                pdf_page_height = pymupdf_page.rect.height
                pdf_page_width = pymupdf_page.rect.width

                image_path = image_paths[page_python_format]

                # Page images may be given as paths or as loaded image objects;
                # only open (and then close) the ones given as paths
                if isinstance(image_path, str):
                    with Image.open(image_path) as image:
                        image_page_width, image_page_height = image.size
                else:
                    image_page_width, image_page_height = image_path.size

                # Convert to image coordinates
                image_x1, image_y1, image_x2, image_y2 = convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_page_width, image_page_height, row['xmin'], row['ymin'], row['xmax'], row['ymax'])

                df.loc[row_index, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2]

                # Record which page image the box coordinates refer to
                df.loc[row_index, 'image'] = image_path

            out_file_path = output_folder + file_path_name + "_review_file.csv"
            df.to_csv(out_file_path, index=None)

            output_paths.append(out_file_path)

    return output_paths