Commit 12224f5
Parent(s): 2a4b347

Better redaction output formatting. Custom output folders allowed. Upgraded Gradio version.

Files changed:
- Dockerfile +1 -1
- app.py +1 -1
- requirements.txt +2 -2
- tools/file_conversion.py +4 -4
- tools/file_redaction.py +211 -51
- tools/helper_functions.py +18 -0
Dockerfile
CHANGED
@@ -16,7 +16,7 @@ COPY requirements.txt .
 
 RUN pip install --no-cache-dir -r requirements.txt
 
-RUN pip install --no-cache-dir gradio==4.
+RUN pip install --no-cache-dir gradio==4.33.0
 
 # Set up a new user named "user" with user ID 1000
 RUN useradd -m -u 1000 user
app.py
CHANGED
@@ -48,7 +48,7 @@ with block:
     in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language", multiselect=False)
     in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=1, value=[[""]], type="array", column_widths=["50%"])
 
-    redact_btn = gr.Button("Redact document")
+    redact_btn = gr.Button("Redact document", variant="primary")
 
     with gr.Row():
         output_summary = gr.Textbox(label="Output summary")
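
Note: `variant="primary"` only restyles the main action button with the theme's primary colour. A minimal, hypothetical Blocks sketch (not the app's full layout) wiring the same components together:

import gradio as gr

with gr.Blocks() as block:
    in_allow_list = gr.Dataframe(label="Allow list", headers=["Allow list"], row_count=1, col_count=1, value=[[""]], type="array")
    redact_btn = gr.Button("Redact document", variant="primary")  # highlighted styling for the main action
    output_summary = gr.Textbox(label="Output summary")

# block.launch() would start the app
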
requirements.txt
CHANGED
@@ -1,8 +1,8 @@
 pdfminer.six==20231228
 pdf2image==1.17.0
 opencv-python==4.9.0.80
-presidio_analyzer==2.2.
-presidio_anonymizer==2.2.
+presidio_analyzer==2.2.354
+presidio_anonymizer==2.2.354
 presidio-image-redactor==0.0.52
 pikepdf==8.15.1
 pandas==2.2.2
tools/file_conversion.py
CHANGED
@@ -1,5 +1,5 @@
 from pdf2image import convert_from_path, pdfinfo_from_path
-from tools.helper_functions import get_file_path_end
+from tools.helper_functions import get_file_path_end, output_folder
 from PIL import Image
 import os
 from gradio import Progress
@@ -50,7 +50,7 @@ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
         print("Current page: ", str(page_num))
 
         # Convert one page to image
-        image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)
+        image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
 
         # If no images are returned, break the loop
         if not image:
@@ -124,8 +124,8 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
     # Convert annotated text pdf back to image to give genuine redactions
     print("Creating image version of results")
     pdf_text_image_paths = process_file(out_text_file_path[0])
-    out_text_image_file_path =
-    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=
+    out_text_image_file_path = output_folder + file_path_without_ext + "_result_as_text_back_to_img.pdf"
+    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=300.0, save_all=True, append_images=pdf_text_image_paths[1:])
 
     out_file_paths.append(out_text_image_file_path)
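
As a usage note, the new arguments are standard pdf2image options. A minimal sketch of the per-page conversion loop with the same settings, assuming poppler is installed and "example.pdf" (a hypothetical file) exists:

from pdf2image import convert_from_path, pdfinfo_from_path

pdf_path = "example.pdf"  # hypothetical input file
number_of_pages = pdfinfo_from_path(pdf_path)["Pages"]

images = []
for page_num in range(number_of_pages):
    # Render a single page at a time at 300 DPI, matching the updated convert_pdf_to_images
    image = convert_from_path(
        pdf_path,
        first_page=page_num + 1,
        last_page=page_num + 1,
        dpi=300,               # higher resolution helps downstream OCR
        use_cropbox=True,      # crop to the page's CropBox rather than the MediaBox
        use_pdftocairo=False,  # stick with the pdftoppm backend
    )
    if not image:
        break
    images.append(image[0])
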
tools/file_redaction.py
CHANGED
@@ -2,18 +2,21 @@ from PIL import Image
 from typing import List
 import pandas as pd
 from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
+from presidio_image_redactor.entities import ImageRecognizerResult
 from pdfminer.high_level import extract_pages
 from tools.file_conversion import process_file
 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
 from pikepdf import Pdf, Dictionary, Name
 from gradio import Progress
 import time
+from collections import defaultdict # For efficient grouping
 
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
-from tools.helper_functions import get_file_path_end
+from tools.helper_functions import get_file_path_end, output_folder
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 import gradio as gr
 
+
 def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
 
     tic = time.perf_counter()
@@ -37,7 +40,7 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
         # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
 
         pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
-        out_image_file_path =
+        out_image_file_path = output_folder + file_path_without_ext + "_result_as_img.pdf"
         pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
 
         out_file_paths.append(out_image_file_path)
@@ -49,7 +52,7 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
 
         # Analyse text-based pdf
         pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
-        out_text_file_path =
+        out_text_file_path = output_folder + file_path_without_ext + "_result_as_text.pdf"
         pdf_text.save(out_text_file_path)
 
         out_file_paths.append(out_text_file_path)
@@ -69,11 +72,44 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
 
     return out_message, out_file_paths, out_file_paths
 
+def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
+    merged_bboxes = []
+    grouped_bboxes = defaultdict(list)
+
+    # 1. Group by approximate vertical proximity
+    for box in bboxes:
+        grouped_bboxes[round(box.top / vertical_threshold)].append(box)
+
+    # 2. Merge within each group
+    for _, group in grouped_bboxes.items():
+        group.sort(key=lambda box: box.left)
+
+        merged_box = group[0]
+        for next_box in group[1:]:
+            if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
+                print("Merging a box")
+                # Calculate new dimensions for the merged box
+                new_left = min(merged_box.left, next_box.left)
+                new_top = min(merged_box.top, next_box.top)
+                new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
+                new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
+                merged_box = ImageRecognizerResult(
+                    merged_box.entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height
+                )
+            else:
+                merged_bboxes.append(merged_box)
+                merged_box = next_box
+
+        merged_bboxes.append(merged_box)
+    return merged_bboxes
 
 def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
-
+    Take a path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
     '''
+    from PIL import Image, ImageChops, ImageDraw
+
+    fill = (0, 0, 0)
 
     if not image_paths:
 
@@ -83,9 +119,6 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
 
         image_paths = process_file(file_path)
 
-    # Create a new PDF
-    #pdf = pikepdf.new()
-
     images = []
     number_of_pages = len(image_paths)
 
@@ -100,6 +133,8 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
         # Get the image to redact using PIL lib (pillow)
         image = image_paths[i] #Image.open(image_paths[i])
 
+        image = ImageChops.duplicate(image)
+
         # %%
         image_analyser = ImageAnalyzerEngine(nlp_analyser)
         engine = ImageRedactorEngine(image_analyser)
@@ -108,24 +143,35 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
             ocr_lang = 'eng'
         else: ocr_lang = language
 
-        # %%
-        # Redact the image with pink color
-        redacted_image = engine.redact(image,
-            fill=(0, 0, 0),
-            ocr_kwargs={"lang": ocr_lang},
-            allow_list=allow_list,
-            ad_hoc_recognizers= None,
-            **{
+        bboxes = image_analyser.analyze(image, ocr_kwargs={"lang": ocr_lang},
+            **{
+                "allow_list": allow_list,
                 "language": language,
                 "entities": chosen_redact_entities,
                 "score_threshold": score_threshold
-            }
-        )
+            })
 
-        images.append(redacted_image)
+        #print("For page: ", str(i), "Bounding boxes: ", bboxes)
+
+        draw = ImageDraw.Draw(image)
+
+        merged_bboxes = merge_img_bboxes(bboxes)
+
+        print("For page: ", str(i), "Merged bounding boxes: ", merged_bboxes)
+
+        # 3. Draw the merged boxes
+        for box in merged_bboxes:
+            x0 = box.left
+            y0 = box.top
+            x1 = x0 + box.width
+            y1 = y0 + box.height
+            draw.rectangle([x0, y0, x1, y1], fill=fill)
+
+        images.append(image)
 
     return images
 
+
 def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
     '''
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
@@ -136,13 +182,15 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
     annotations_all_pages = []
     analyzed_bounding_boxes_df = pd.DataFrame()
 
+    # PII bounding boxes horizontally closer than this value (in pixels) are combined into one box
+    combine_pixel_dist = 100
+
     pdf = Pdf.open(filename)
 
     page_num = 0
 
     for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
 
-
        print("Page number is: ", page_num)
 
        annotations_on_page = []
@@ -165,21 +213,63 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
                 return_decision_process=False,
                 allow_list=allow_list)
 
-            #if analyzer_results:
-            #    pass
-            #explanation = analyzer_results[0].analysis_explanation.to_dict()
-            #analyser_explanations.append(explanation)
             characters = [char # This is what we want to include in the list
                 for line in text_container # Loop through each line in text_container
                 if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
                 for char in line] # Loop through each character in the line
                 #if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
 
-            # If any results found
-            print(analyzer_results)
 
+            # if len(analyzer_results) > 0 and len(characters) > 0:
+            #     analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
+            #     combined_analyzer_results.extend(analyzer_results)
+
+            # Inside the loop where you process analyzer_results:
             if len(analyzer_results) > 0 and len(characters) > 0:
-                analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
+                merged_bounding_boxes = []
+                current_box = None
+                current_y = None
+
+                for result in analyzer_results:
+                    for char in characters[result.start : result.end]:
+                        if isinstance(char, LTChar):
+                            char_box = list(char.bbox)
+
+                            # Fix: Check if either current_y or current_box are None
+                            if current_y is None or current_box is None:
+                                # This is the first character, so initialize current_box and current_y
+                                current_box = char_box
+                                current_y = char_box[1]
+                            else: # Now we have previous values to compare
+                                print("Comparing values")
+                                vertical_diff_bboxes = abs(char_box[1] - current_y)
+                                horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
+                                #print("Vertical distance with last bbox: ", str(vertical_diff_bboxes), "Horizontal distance: ", str(horizontal_diff_bboxes), "For result: ", result)
+
+                                if (
+                                    vertical_diff_bboxes <= 5
+                                    and horizontal_diff_bboxes <= combine_pixel_dist
+                                ):
+                                    old_right_pos = current_box[2]
+                                    current_box[2] = char_box[2]
+
+                                    print("Old right pos: ", str(old_right_pos), "has been replaced with: ", str(current_box[2]), "for result: ", result)
+
+                                else:
+                                    merged_bounding_boxes.append(
+                                        {"boundingBox": current_box, "result": result})
+
+                                    current_box = char_box
+                                    current_y = char_box[1]
+                    # Add the last box
+                    if current_box:
+                        merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
+
+                if not merged_bounding_boxes:
+                    analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
+                else:
+                    analyzed_bounding_boxes.extend(merged_bounding_boxes)
+
                 combined_analyzer_results.extend(analyzer_results)
 
             if len(analyzer_results) > 0:
@@ -195,14 +285,19 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
                 bounding_box = analyzed_bounding_box["boundingBox"]
                 annotation = Dictionary(
                     Type=Name.Annot,
-                    Subtype=Name.Highlight,
+                    Subtype=Name.Square, #Name.Highlight,
                     QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
                     Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
                     C=[0, 0, 0],
+                    IC=[0, 0, 0],
                     CA=1, # Transparency
-                    T=analyzed_bounding_box["result"].entity_type
+                    T=analyzed_bounding_box["result"].entity_type,
+                    BS=Dictionary(
+                        W=0, # Border width: 0 (no border)
+                        S=Name.S # Border style: solid
+                    )
                 )
-                annotations_on_page.append(annotation)
+                annotations_on_page.append(annotation)
 
             annotations_all_pages.extend([annotations_on_page])
 
@@ -210,27 +305,92 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
         page.Annots = pdf.make_indirect(annotations_on_page)
 
         page_num += 1
-
-    # Extracting data from dictionaries
-    # extracted_data = []
-    # for item in annotations_all_pages:
-    #     temp_dict = {}
-    #     #print(item)
-    #     for key, value in item.items():
-    #         if isinstance(value, Decimal):
-    #             temp_dict[key] = float(value)
-    #         elif isinstance(value, list):
-    #             temp_dict[key] = [float(v) if isinstance(v, Decimal) else v for v in value]
-    #         else:
-    #             temp_dict[key] = value
-    #     extracted_data.append(temp_dict)
-
-    # Creating DataFrame
-    # annotations_out = pd.DataFrame(extracted_data)
-    #print(df)
-
-    #annotations_out.to_csv("examples/annotations.csv")
 
-    analyzed_bounding_boxes_df.to_csv("
+    analyzed_bounding_boxes_df.to_csv(output_folder + "annotations_made.csv")
+
+    return pdf
+
+
+    # for page_num, annotations_on_page in enumerate(annotations_all_pages):
+    #     # 2. Normalize annotation heights on the same line:
+    #     line_heights = {} # {y_coordinate: max_height}
+
+    #     # Get line heights for every annotation
+    #     for annotation in annotations_on_page:
+    #         if 'Rect' in annotation:
+    #             y = annotation['Rect'][1]
+    #             height = annotation['Rect'][3] - annotation['Rect'][1]
+    #             line_heights[y] = max(line_heights.get(y, 0), height)
+
+    #     # Update line heights for annotations
+    #     for annotation in annotations_on_page:
+    #         if 'Rect' in annotation:
+    #             y = annotation['Rect'][1]
+    #             annotation['Rect'][3] = y + line_heights[y]
+
+    #             # Update QuadPoints to match the new Rect coordinates
+    #             x1, y1, x2, y2 = annotation['Rect'] # Extract coordinates from Rect
+    #             annotation['QuadPoints'] = [
+    #                 x1, y2, # Top left
+    #                 x2, y2, # Top right
+    #                 x1, y1, # Bottom left
+    #                 x2, y1 # Bottom right
+    #             ]
+
+
+# def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
+#     '''
+#     take a path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
+#     '''
+
+#     if not image_paths:
+
+#         out_message = "PDF does not exist as images. Converting pages to image"
+#         print(out_message)
+#         progress(0, desc=out_message)
+
+#         image_paths = process_file(file_path)
+
+#     # Create a new PDF
+#     #pdf = pikepdf.new()
+
+#     images = []
+#     number_of_pages = len(image_paths)
+
+#     out_message = "Redacting pages"
+#     print(out_message)
+#     progress(0.1, desc=out_message)
+
+#     for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
+
+#         print("Redacting page ", str(i + 1))
+
+#         # Get the image to redact using PIL lib (pillow)
+#         image = image_paths[i] #Image.open(image_paths[i])
+
+#         # %%
+#         image_analyser = ImageAnalyzerEngine(nlp_analyser)
+#         engine = ImageRedactorEngine(image_analyser)
+
+#         if language == 'en':
+#             ocr_lang = 'eng'
+#         else: ocr_lang = language
+
+#         # %%
+#         # Redact the image with pink color
+#         redacted_image = engine.redact(image,
+#             fill=(0, 0, 0),
+#             ocr_kwargs={"lang": ocr_lang},
+#             allow_list=allow_list,
+#             ad_hoc_recognizers= None,
+#             **{
+#                 "language": language,
+#                 "entities": chosen_redact_entities,
+#                 "score_threshold": score_threshold
+#             },
+#         )
+
+#         images.append(redacted_image)
+
 
-    return pdf
+#     return images
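
To illustrate the new image-merging step, here is a small, self-contained sketch of the `merge_img_bboxes` logic. The `Box` dataclass is a hypothetical stand-in for presidio_image_redactor's `ImageRecognizerResult`, and the sample boxes are invented, so the sketch runs without the library:

from collections import defaultdict
from dataclasses import dataclass

@dataclass
class Box:  # stand-in for presidio_image_redactor's ImageRecognizerResult
    entity_type: str
    start: int
    end: int
    score: float
    left: int
    top: int
    width: int
    height: int

def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
    merged_bboxes = []
    grouped_bboxes = defaultdict(list)

    # Group boxes that sit on (approximately) the same text line
    for box in bboxes:
        grouped_bboxes[round(box.top / vertical_threshold)].append(box)

    # Within each line group, join boxes whose horizontal gap is small
    for group in grouped_bboxes.values():
        group.sort(key=lambda box: box.left)
        merged_box = group[0]
        for next_box in group[1:]:
            if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
                new_left = min(merged_box.left, next_box.left)
                new_top = min(merged_box.top, next_box.top)
                new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
                new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
                merged_box = Box(merged_box.entity_type, merged_box.start, merged_box.end, merged_box.score,
                                 new_left, new_top, new_width, new_height)
            else:
                merged_bboxes.append(merged_box)
                merged_box = next_box
        merged_bboxes.append(merged_box)
    return merged_bboxes

# Two PERSON boxes 20px apart on the same line collapse into one redaction box
boxes = [Box("PERSON", 0, 4, 0.9, 10, 100, 50, 20), Box("PERSON", 5, 9, 0.9, 80, 102, 40, 20)]
print(merge_img_bboxes(boxes))  # -> a single box spanning x = 10..120

Merging adjacent OCR hits this way produces one solid black rectangle per name or phrase rather than a patchwork of per-word boxes, which is what the commit message means by better redaction output formatting.
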
tools/helper_functions.py
CHANGED
@@ -1,5 +1,23 @@
 import os
 
+def get_or_create_env_var(var_name, default_value):
+    # Get the environment variable if it exists
+    value = os.environ.get(var_name)
+
+    # If it doesn't exist, set it to the default value
+    if value is None:
+        os.environ[var_name] = default_value
+        value = default_value
+
+    return value
+
+# Retrieving or setting output folder
+env_var_name = 'GRADIO_OUTPUT_FOLDER'
+default_value = 'output/'
+
+output_folder = get_or_create_env_var(env_var_name, default_value)
+print(f'The value of {env_var_name} is {output_folder}')
+
 def get_file_path_end(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)
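
As a usage sketch, the output location can now be redirected through the environment, provided the variable is set before the module is imported; assuming this repo's import path:

import os

# Override the default before tools.helper_functions is imported
os.environ['GRADIO_OUTPUT_FOLDER'] = '/tmp/redaction_output/'

from tools.helper_functions import output_folder, get_or_create_env_var

print(output_folder)  # -> /tmp/redaction_output/

# Subsequent lookups reuse the value already present in the environment
assert get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') == '/tmp/redaction_output/'
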