|
import gradio as gr |
|
import pandas as pd |
|
import numpy as np |
|
from xml.etree.ElementTree import Element, SubElement, tostring, parse |
|
from xml.dom import minidom |
|
import uuid |
|
from typing import List |
|
from gradio_image_annotation import image_annotator |
|
from gradio_image_annotation.image_annotator import AnnotatedImageData |
|
from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR |
|
from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type |
|
from tools.file_redaction import redact_page_with_pymupdf |
|
import json |
|
import os |
|
import pymupdf |
|
from fitz import Document |
|
from PIL import ImageDraw, Image |
|
from collections import defaultdict |
|
|
|
Image.MAX_IMAGE_PIXELS = None |
|
|
|
def decrease_page(number:int): |
|
''' |
|
Decrease page number for review redactions page. |
|
''' |
|
|
|
if number > 1: |
|
return number - 1, number - 1 |
|
else: |
|
return 1, 1 |
|
|
|
def increase_page(number:int, image_annotator_object:AnnotatedImageData): |
|
''' |
|
Increase page number for review redactions page. |
|
''' |
|
|
|
if not image_annotator_object: |
|
return 1, 1 |
|
|
|
max_pages = len(image_annotator_object) |
|
|
|
if number < max_pages: |
|
return number + 1, number + 1 |
|
else: |
|
return max_pages, max_pages |
|
|
|
def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool=True): |
|
if decrease == False: |
|
if current_zoom_level >= 70: |
|
current_zoom_level -= 10 |
|
else: |
|
if current_zoom_level < 110: |
|
current_zoom_level += 10 |
|
|
|
return current_zoom_level, annotate_current_page |
|
|
|
def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]: |
|
''' |
|
Remove items from the annotator object where the same page exists twice. |
|
''' |
|
|
|
image_groups = defaultdict(list) |
|
for item in data: |
|
image_groups[item['image']].append(item) |
|
|
|
|
|
result = [] |
|
for image, items in image_groups.items(): |
|
|
|
non_empty_boxes = [item for item in items if item.get('boxes')] |
|
|
|
|
|
for item in non_empty_boxes: |
|
if 'boxes' in item: |
|
item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']] |
|
|
|
if non_empty_boxes: |
|
|
|
result.append(non_empty_boxes[0]) |
|
else: |
|
|
|
result.append(items[0]) |
|
|
|
return result |
|
|
|
def get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr): |
|
recogniser_entities_list = ["Redaction"] |
|
recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True) |
|
recogniser_dataframe_out = recogniser_dataframe_gr |
|
|
|
try: |
|
review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]] |
|
recogniser_entities = review_dataframe["label"].unique().tolist() |
|
recogniser_entities.append("ALL") |
|
recogniser_entities_for_drop = sorted(recogniser_entities) |
|
|
|
|
|
recogniser_dataframe_out = gr.Dataframe(review_dataframe) |
|
recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_for_drop[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True) |
|
|
|
recogniser_entities_list = [entity for entity in recogniser_entities_for_drop if entity != 'Redaction' and entity != 'ALL'] |
|
recogniser_entities_list.insert(0, 'Redaction') |
|
|
|
except Exception as e: |
|
print("Could not extract recogniser information:", e) |
|
recogniser_dataframe_out = recogniser_dataframe_gr |
|
recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True) |
|
recogniser_entities_list = ["Redaction"] |
|
|
|
return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list |
|
|
|
def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100): |
|
''' |
|
Update a gradio_image_annotation object with new annotation data |
|
''' |
|
recogniser_entities_list = ["Redaction"] |
|
recogniser_dataframe_out = pd.DataFrame() |
|
|
|
if recogniser_dataframe_gr.empty: |
|
recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr) |
|
elif recogniser_dataframe_gr.iloc[0,0] == "": |
|
recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr) |
|
else: |
|
review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr) |
|
recogniser_dataframe_out = gr.Dataframe(review_dataframe) |
|
recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist() |
|
|
|
recogniser_entities_list = sorted(recogniser_entities_list) |
|
recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction'] |
|
recogniser_entities_list.insert(0, 'Redaction') |
|
|
|
|
|
zoom_str = str(zoom) + '%' |
|
recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))] |
|
|
|
if not image_annotator_object: |
|
page_num_reported = 1 |
|
|
|
out_image_annotator = image_annotator( |
|
None, |
|
boxes_alpha=0.1, |
|
box_thickness=1, |
|
label_list=recogniser_entities_list, |
|
label_colors=recogniser_colour_list, |
|
show_label=False, |
|
height=zoom_str, |
|
width=zoom_str, |
|
box_min_size=1, |
|
box_selected_thickness=2, |
|
handle_size=4, |
|
sources=None, |
|
show_clear_button=False, |
|
show_share_button=False, |
|
show_remove_button=False, |
|
handles_cursor=True, |
|
interactive=True |
|
) |
|
number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0) |
|
|
|
return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr |
|
|
|
|
|
|
|
if page_num is None: |
|
page_num = 0 |
|
|
|
|
|
if page_num > 0: |
|
page_num_reported = page_num |
|
|
|
elif page_num == 0: page_num_reported = 1 |
|
|
|
else: |
|
page_num = 0 |
|
page_num_reported = 1 |
|
|
|
page_max_reported = len(image_annotator_object) |
|
|
|
if page_num_reported > page_max_reported: |
|
page_num_reported = page_max_reported |
|
|
|
image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object) |
|
|
|
|
|
|
|
out_image_annotator = image_annotator( |
|
value = image_annotator_object[page_num_reported - 1], |
|
boxes_alpha=0.1, |
|
box_thickness=1, |
|
label_list=recogniser_entities_list, |
|
label_colors=recogniser_colour_list, |
|
show_label=False, |
|
height=zoom_str, |
|
width=zoom_str, |
|
box_min_size=1, |
|
box_selected_thickness=2, |
|
handle_size=4, |
|
sources=None, |
|
show_clear_button=False, |
|
show_share_button=False, |
|
show_remove_button=False, |
|
handles_cursor=True, |
|
interactive=True |
|
) |
|
|
|
number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0) |
|
|
|
return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr |
|
|
|
def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True),recogniser_dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), clear_all:bool=False): |
|
''' |
|
Overwrite current image annotations with modifications |
|
''' |
|
|
|
if not current_page: |
|
current_page = 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
image_annotated['image'] = all_image_annotations[previous_page - 1]["image"] |
|
|
|
if clear_all == False: |
|
all_image_annotations[previous_page - 1] = image_annotated |
|
else: |
|
all_image_annotations[previous_page - 1]["boxes"] = [] |
|
|
|
|
|
|
|
|
|
try: |
|
review_dataframe = convert_review_json_to_pandas_df(all_image_annotations)[["page", "label"]] |
|
|
|
recogniser_entities = review_dataframe["label"].unique().tolist() |
|
recogniser_entities.append("ALL") |
|
recogniser_entities = sorted(recogniser_entities) |
|
|
|
recogniser_dataframe_out = gr.Dataframe(review_dataframe) |
|
|
|
recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_drop, choices=recogniser_entities, allow_custom_value=True, interactive=True) |
|
except Exception as e: |
|
print("Could not extract recogniser information:", e) |
|
recogniser_dataframe_out = recogniser_dataframe |
|
|
|
return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out |
|
|
|
def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)): |
|
''' |
|
Apply modified redactions to a pymupdf and export review files |
|
''' |
|
|
|
|
|
output_files = [] |
|
output_log_files = [] |
|
pdf_doc = [] |
|
|
|
|
|
|
|
image_annotated['image'] = all_image_annotations[current_page - 1]["image"] |
|
|
|
all_image_annotations[current_page - 1] = image_annotated |
|
|
|
if not image_annotated: |
|
print("No image annotations found") |
|
return doc, all_image_annotations |
|
|
|
if isinstance(file_paths, str): |
|
file_paths = [file_paths] |
|
|
|
for file_path in file_paths: |
|
|
|
file_name_without_ext = get_file_name_without_type(file_path) |
|
file_name_with_ext = os.path.basename(file_path) |
|
|
|
file_extension = os.path.splitext(file_path)[1].lower() |
|
|
|
if save_pdf == True: |
|
|
|
if (is_pdf(file_path) == False) & (file_extension not in '.csv'): |
|
image = Image.open(file_paths[-1]) |
|
|
|
|
|
|
|
draw = ImageDraw.Draw(image) |
|
|
|
for img_annotation_box in image_annotated['boxes']: |
|
coords = [img_annotation_box["xmin"], |
|
img_annotation_box["ymin"], |
|
img_annotation_box["xmax"], |
|
img_annotation_box["ymax"]] |
|
|
|
fill = img_annotation_box["color"] |
|
|
|
draw.rectangle(coords, fill=fill) |
|
|
|
output_image_path = output_folder + file_name_without_ext + "_redacted.png" |
|
image.save(output_folder + file_name_without_ext + "_redacted.png") |
|
|
|
output_files.append(output_image_path) |
|
|
|
print("Redactions saved to image file") |
|
|
|
doc = [image] |
|
|
|
elif file_extension in '.csv': |
|
print("This is a csv") |
|
pdf_doc = [] |
|
|
|
|
|
elif is_pdf(file_path) == True: |
|
pdf_doc = pymupdf.open(file_path) |
|
orig_pdf_file_path = file_path |
|
|
|
output_files.append(orig_pdf_file_path) |
|
|
|
number_of_pages = pdf_doc.page_count |
|
|
|
print("Saving pages to file.") |
|
|
|
for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"): |
|
|
|
|
|
|
|
image_loc = all_image_annotations[i]['image'] |
|
|
|
|
|
|
|
if isinstance(image_loc, np.ndarray): |
|
image = Image.fromarray(image_loc.astype('uint8')) |
|
|
|
elif isinstance(image_loc, Image.Image): |
|
image = image_loc |
|
|
|
|
|
|
|
elif isinstance(image_loc, str): |
|
image = Image.open(image_loc) |
|
|
|
pymupdf_page = pdf_doc.load_page(i) |
|
pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image) |
|
|
|
else: |
|
print("File type not recognised.") |
|
|
|
|
|
if pdf_doc: |
|
out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf" |
|
pdf_doc.save(out_pdf_file_path) |
|
output_files.append(out_pdf_file_path) |
|
|
|
else: |
|
print("PDF input not found. Outputs not saved to PDF.") |
|
|
|
|
|
else: |
|
if is_pdf(file_path) == True: |
|
orig_pdf_file_path = file_path |
|
output_files.append(orig_pdf_file_path) |
|
|
|
try: |
|
|
|
|
|
out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json' |
|
with open(out_annotation_file_path, 'w') as f: |
|
json.dump(all_image_annotations, f) |
|
output_log_files.append(out_annotation_file_path) |
|
|
|
|
|
|
|
|
|
|
|
|
|
review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state) |
|
out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv' |
|
review_df.to_csv(out_review_file_file_path, index=None) |
|
output_files.append(out_review_file_file_path) |
|
|
|
except Exception as e: |
|
print("Could not save annotations to json or csv file:", e) |
|
|
|
return doc, all_image_annotations, output_files, output_log_files |
|
|
|
def get_boxes_json(annotations:AnnotatedImageData): |
|
return annotations["boxes"] |
|
|
|
def update_entities_df(choice:str, df:pd.DataFrame): |
|
if choice=="ALL": |
|
return df |
|
else: |
|
return df.loc[df["label"]==choice,:] |
|
|
|
def df_select_callback(df: pd.DataFrame, evt: gr.SelectData): |
|
row_value_page = evt.row_value[0] |
|
return row_value_page |
|
|
|
def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2): |
|
''' |
|
Converts coordinates from image space to Adobe PDF space. |
|
|
|
Parameters: |
|
- pdf_page_width: Width of the PDF page |
|
- pdf_page_height: Height of the PDF page |
|
- image_width: Width of the source image |
|
- image_height: Height of the source image |
|
- x1, y1, x2, y2: Coordinates in image space |
|
|
|
Returns: |
|
- Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space |
|
''' |
|
|
|
|
|
scale_width = pdf_page_width / image_width |
|
scale_height = pdf_page_height / image_height |
|
|
|
|
|
pdf_x1 = x1 * scale_width |
|
pdf_x2 = x2 * scale_width |
|
|
|
|
|
|
|
pdf_y1 = pdf_page_height - (y1 * scale_height) |
|
pdf_y2 = pdf_page_height - (y2 * scale_height) |
|
|
|
|
|
if pdf_y1 > pdf_y2: |
|
pdf_y1, pdf_y2 = pdf_y2, pdf_y1 |
|
|
|
return pdf_x1, pdf_y1, pdf_x2, pdf_y2 |
|
|
|
|
|
def create_xfdf(df, pdf_path, pymupdf_doc, image_paths): |
|
''' |
|
Create an xfdf file from a review csv file and a pdf |
|
''' |
|
|
|
|
|
xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve") |
|
|
|
|
|
header = SubElement(xfdf, 'header') |
|
header.set('pdf-filepath', pdf_path) |
|
|
|
|
|
annots = SubElement(xfdf, 'annots') |
|
|
|
for _, row in df.iterrows(): |
|
page_python_format = int(row["page"])-1 |
|
|
|
pymupdf_page = pymupdf_doc.load_page(page_python_format) |
|
|
|
pdf_page_height = pymupdf_page.rect.height |
|
pdf_page_width = pymupdf_page.rect.width |
|
|
|
image = image_paths[page_python_format] |
|
|
|
|
|
|
|
if isinstance(image, str): |
|
image = Image.open(image) |
|
|
|
image_page_width, image_page_height = image.size |
|
|
|
|
|
redact_annot = SubElement(annots, 'redact') |
|
|
|
|
|
annot_id = str(uuid.uuid4()) |
|
redact_annot.set('name', annot_id) |
|
|
|
|
|
redact_annot.set('page', str(int(row['page']) - 1)) |
|
|
|
|
|
x1, y1, x2, y2 = convert_image_coords_to_adobe( |
|
pdf_page_width, |
|
pdf_page_height, |
|
image_page_width, |
|
image_page_height, |
|
row['xmin'], |
|
row['ymin'], |
|
row['xmax'], |
|
row['ymax'] |
|
) |
|
|
|
if CUSTOM_BOX_COLOUR == "grey": |
|
colour_str = "0.5,0.5,0.5" |
|
else: |
|
colour_str = row['color'].strip('()').replace(' ', '') |
|
|
|
|
|
redact_annot.set('rect', f"{x1:.2f},{y1:.2f},{x2:.2f},{y2:.2f}") |
|
|
|
|
|
redact_annot.set('title', row['label']) |
|
redact_annot.set('contents', row['text']) |
|
redact_annot.set('subject', row['label']) |
|
redact_annot.set('mimetype', "Form") |
|
|
|
|
|
redact_annot.set('border-color', colour_str) |
|
redact_annot.set('repeat', 'false') |
|
redact_annot.set('interior-color', colour_str) |
|
|
|
|
|
|
|
|
|
redact_annot.set('opacity', "0.5") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
xml_str = minidom.parseString(tostring(xfdf)).toprettyxml(indent=" ") |
|
|
|
return xml_str |
|
|
|
def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths): |
|
''' |
|
Load in files to convert a review file into an Adobe comment file format |
|
''' |
|
output_paths = [] |
|
pdf_name = "" |
|
|
|
if isinstance(input_files, str): |
|
file_paths_list = [input_files] |
|
else: |
|
file_paths_list = input_files |
|
|
|
|
|
file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json')) |
|
|
|
for file in file_paths_list: |
|
|
|
if isinstance(file, str): |
|
file_path = file |
|
else: |
|
file_path = file.name |
|
|
|
file_path_name = get_file_name_without_type(file_path) |
|
file_path_end = detect_file_type(file_path) |
|
|
|
if file_path_end == "pdf": |
|
pdf_name = os.path.basename(file_path) |
|
|
|
if file_path_end == "csv": |
|
|
|
if not pdf_name: |
|
pdf_name = file_path_name |
|
|
|
df = pd.read_csv(file_path) |
|
|
|
df.fillna('', inplace=True) |
|
|
|
xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths) |
|
|
|
output_path = output_folder + file_path_name + "_adobe.xfdf" |
|
|
|
with open(output_path, 'w', encoding='utf-8') as f: |
|
f.write(xfdf_content) |
|
|
|
output_paths.append(output_path) |
|
|
|
return output_paths |
|
|
|
|
|
|
|
|
|
def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2): |
|
''' |
|
Converts coordinates from Adobe PDF space to image space. |
|
|
|
Parameters: |
|
- pdf_page_width: Width of the PDF page |
|
- pdf_page_height: Height of the PDF page |
|
- image_width: Width of the source image |
|
- image_height: Height of the source image |
|
- x1, y1, x2, y2: Coordinates in Adobe PDF space |
|
|
|
Returns: |
|
- Tuple of converted coordinates (x1, y1, x2, y2) in image space |
|
''' |
|
|
|
|
|
scale_width = image_width / pdf_page_width |
|
scale_height = image_height / pdf_page_height |
|
|
|
|
|
image_x1 = x1 * scale_width |
|
image_x2 = x2 * scale_width |
|
|
|
|
|
|
|
image_y1 = (pdf_page_height - y1) * scale_height |
|
image_y2 = (pdf_page_height - y2) * scale_height |
|
|
|
|
|
if image_y1 > image_y2: |
|
image_y1, image_y2 = image_y2, image_y1 |
|
|
|
return image_x1, image_y1, image_x2, image_y2 |
|
|
|
def parse_xfdf(xfdf_path): |
|
''' |
|
Parse the XFDF file and extract redaction annotations. |
|
|
|
Parameters: |
|
- xfdf_path: Path to the XFDF file |
|
|
|
Returns: |
|
- List of dictionaries containing redaction information |
|
''' |
|
tree = parse(xfdf_path) |
|
root = tree.getroot() |
|
|
|
|
|
namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'} |
|
|
|
redactions = [] |
|
|
|
|
|
for redact in root.findall('.//xfdf:redact', namespaces=namespace): |
|
|
|
|
|
|
|
redaction_info = { |
|
'image': '', |
|
'page': int(redact.get('page')) + 1, |
|
'xmin': float(redact.get('rect').split(',')[0]), |
|
'ymin': float(redact.get('rect').split(',')[1]), |
|
'xmax': float(redact.get('rect').split(',')[2]), |
|
'ymax': float(redact.get('rect').split(',')[3]), |
|
'label': redact.get('title'), |
|
'text': redact.get('contents'), |
|
'color': redact.get('border-color', '(0, 0, 0)') |
|
} |
|
redactions.append(redaction_info) |
|
|
|
print("redactions:", redactions) |
|
|
|
return redactions |
|
|
|
def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths): |
|
''' |
|
Convert redaction annotations from XFDF and associated images into a DataFrame. |
|
|
|
Parameters: |
|
- xfdf_path: Path to the XFDF file |
|
- pdf_doc: PyMuPDF document object |
|
- image_paths: List of PIL Image objects corresponding to PDF pages |
|
|
|
Returns: |
|
- DataFrame containing redaction information |
|
''' |
|
output_paths = [] |
|
xfdf_paths = [] |
|
df = pd.DataFrame() |
|
|
|
|
|
|
|
|
|
file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json')) |
|
|
|
for file in file_paths_list: |
|
|
|
if isinstance(file, str): |
|
file_path = file |
|
else: |
|
file_path = file.name |
|
|
|
file_path_name = get_file_name_without_type(file_path) |
|
file_path_end = detect_file_type(file_path) |
|
|
|
if file_path_end == "pdf": |
|
pdf_name = os.path.basename(file_path) |
|
|
|
|
|
|
|
output_paths.append(file_path) |
|
|
|
if file_path_end == "xfdf": |
|
|
|
if not pdf_name: |
|
message = "Original PDF needed to convert from .xfdf format" |
|
print(message) |
|
raise ValueError(message) |
|
|
|
xfdf_path = file |
|
|
|
|
|
|
|
|
|
|
|
|
|
file_path_name = get_file_name_without_type(xfdf_path) |
|
|
|
|
|
|
|
|
|
redactions = parse_xfdf(xfdf_path) |
|
|
|
|
|
df = pd.DataFrame(redactions) |
|
|
|
df.fillna('', inplace=True) |
|
|
|
for _, row in df.iterrows(): |
|
page_python_format = int(row["page"])-1 |
|
|
|
pymupdf_page = pymupdf_doc.load_page(page_python_format) |
|
|
|
pdf_page_height = pymupdf_page.rect.height |
|
pdf_page_width = pymupdf_page.rect.width |
|
|
|
image_path = image_paths[page_python_format] |
|
|
|
|
|
|
|
if isinstance(image_path, str): |
|
image = Image.open(image_path) |
|
|
|
image_page_width, image_page_height = image.size |
|
|
|
|
|
image_x1, image_y1, image_x2, image_y2 = convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_page_width, image_page_height, row['xmin'], row['ymin'], row['xmax'], row['ymax']) |
|
|
|
df.loc[_, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2] |
|
|
|
|
|
|
|
df.loc[_, 'image'] = image_path |
|
|
|
|
|
|
|
out_file_path = output_folder + file_path_name + "_review_file.csv" |
|
df.to_csv(out_file_path, index=None) |
|
|
|
output_paths.append(out_file_path) |
|
|
|
return output_paths |