seanpedrickcase committed on
Commit
12224f5
·
1 Parent(s): 2a4b347

Better redaction output formatting. Custom output folders allowed. Upgraded Gradio version

Browse files
Dockerfile CHANGED
@@ -16,7 +16,7 @@ COPY requirements.txt .
16
 
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
- RUN pip install --no-cache-dir gradio==4.31.0
20
 
21
  # Set up a new user named "user" with user ID 1000
22
  RUN useradd -m -u 1000 user
 
16
 
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
+ RUN pip install --no-cache-dir gradio==4.33.0
20
 
21
  # Set up a new user named "user" with user ID 1000
22
  RUN useradd -m -u 1000 user
app.py CHANGED
@@ -48,7 +48,7 @@ with block:
48
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language", multiselect=False)
49
  in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=1, value=[[""]], type="array", column_widths=["50%"])
50
 
51
- redact_btn = gr.Button("Redact document")
52
 
53
  with gr.Row():
54
  output_summary = gr.Textbox(label="Output summary")
 
48
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language", multiselect=False)
49
  in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=1, value=[[""]], type="array", column_widths=["50%"])
50
 
51
+ redact_btn = gr.Button("Redact document", variant="primary")
52
 
53
  with gr.Row():
54
  output_summary = gr.Textbox(label="Output summary")
requirements.txt CHANGED
@@ -1,8 +1,8 @@
1
  pdfminer.six==20231228
2
  pdf2image==1.17.0
3
  opencv-python==4.9.0.80
4
- presidio_analyzer==2.2.351
5
- presidio_anonymizer==2.2.351
6
  presidio-image-redactor==0.0.52
7
  pikepdf==8.15.1
8
  pandas==2.2.2
 
1
  pdfminer.six==20231228
2
  pdf2image==1.17.0
3
  opencv-python==4.9.0.80
4
+ presidio_analyzer==2.2.354
5
+ presidio_anonymizer==2.2.354
6
  presidio-image-redactor==0.0.52
7
  pikepdf==8.15.1
8
  pandas==2.2.2
tools/file_conversion.py CHANGED
@@ -1,5 +1,5 @@
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
- from tools.helper_functions import get_file_path_end
3
  from PIL import Image
4
  import os
5
  from gradio import Progress
@@ -50,7 +50,7 @@ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
50
  print("Current page: ", str(page_num))
51
 
52
  # Convert one page to image
53
- image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1)
54
 
55
  # If no images are returned, break the loop
56
  if not image:
@@ -124,8 +124,8 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
124
  # Convert annotated text pdf back to image to give genuine redactions
125
  print("Creating image version of results")
126
  pdf_text_image_paths = process_file(out_text_file_path[0])
127
- out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
128
- pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
129
 
130
  out_file_paths.append(out_text_image_file_path)
131
 
 
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
+ from tools.helper_functions import get_file_path_end, output_folder
3
  from PIL import Image
4
  import os
5
  from gradio import Progress
 
50
  print("Current page: ", str(page_num))
51
 
52
  # Convert one page to image
53
+ image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
54
 
55
  # If no images are returned, break the loop
56
  if not image:
 
124
  # Convert annotated text pdf back to image to give genuine redactions
125
  print("Creating image version of results")
126
  pdf_text_image_paths = process_file(out_text_file_path[0])
127
+ out_text_image_file_path = output_folder + file_path_without_ext + "_result_as_text_back_to_img.pdf"
128
+ pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=300.0, save_all=True, append_images=pdf_text_image_paths[1:])
129
 
130
  out_file_paths.append(out_text_image_file_path)
131
 
tools/file_redaction.py CHANGED
@@ -2,18 +2,21 @@ from PIL import Image
2
  from typing import List
3
  import pandas as pd
4
  from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
 
5
  from pdfminer.high_level import extract_pages
6
  from tools.file_conversion import process_file
7
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
8
  from pikepdf import Pdf, Dictionary, Name
9
  from gradio import Progress
10
  import time
 
11
 
12
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
13
- from tools.helper_functions import get_file_path_end
14
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
15
  import gradio as gr
16
 
 
17
  def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
18
 
19
  tic = time.perf_counter()
@@ -37,7 +40,7 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
37
  # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
38
 
39
  pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
40
- out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
41
  pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
42
 
43
  out_file_paths.append(out_image_file_path)
@@ -49,7 +52,7 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
49
 
50
  # Analyse text-based pdf
51
  pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
52
- out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
53
  pdf_text.save(out_text_file_path)
54
 
55
  out_file_paths.append(out_text_file_path)
@@ -69,11 +72,44 @@ def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str,
69
 
70
  return out_message, out_file_paths, out_file_paths
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
74
  '''
75
- take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
76
  '''
 
 
 
77
 
78
  if not image_paths:
79
 
@@ -83,9 +119,6 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
83
 
84
  image_paths = process_file(file_path)
85
 
86
- # Create a new PDF
87
- #pdf = pikepdf.new()
88
-
89
  images = []
90
  number_of_pages = len(image_paths)
91
 
@@ -100,6 +133,8 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
100
  # Get the image to redact using PIL lib (pillow)
101
  image = image_paths[i] #Image.open(image_paths[i])
102
 
 
 
103
  # %%
104
  image_analyser = ImageAnalyzerEngine(nlp_analyser)
105
  engine = ImageRedactorEngine(image_analyser)
@@ -108,24 +143,35 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
108
  ocr_lang = 'eng'
109
  else: ocr_lang = language
110
 
111
- # %%
112
- # Redact the image with pink color
113
- redacted_image = engine.redact(image,
114
- fill=(0, 0, 0),
115
- ocr_kwargs={"lang": ocr_lang},
116
- allow_list=allow_list,
117
- ad_hoc_recognizers= None,
118
- **{
119
  "language": language,
120
  "entities": chosen_redact_entities,
121
  "score_threshold": score_threshold
122
- },
123
- )
124
 
125
- images.append(redacted_image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  return images
128
 
 
129
  def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
130
  '''
131
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
@@ -136,13 +182,15 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
136
  annotations_all_pages = []
137
  analyzed_bounding_boxes_df = pd.DataFrame()
138
 
 
 
 
139
  pdf = Pdf.open(filename)
140
 
141
  page_num = 0
142
 
143
  for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
144
 
145
-
146
  print("Page number is: ", page_num)
147
 
148
  annotations_on_page = []
@@ -165,21 +213,63 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
165
  return_decision_process=False,
166
  allow_list=allow_list)
167
 
168
- #if analyzer_results:
169
- # pass
170
- #explanation = analyzer_results[0].analysis_explanation.to_dict()
171
- #analyser_explanations.append(explanation)
172
  characters = [char # This is what we want to include in the list
173
  for line in text_container # Loop through each line in text_container
174
  if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
175
  for char in line] # Loop through each character in the line
176
  #if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
177
 
178
- # If any results found
179
- print(analyzer_results)
180
 
 
 
 
 
 
181
  if len(analyzer_results) > 0 and len(characters) > 0:
182
- analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  combined_analyzer_results.extend(analyzer_results)
184
 
185
  if len(analyzer_results) > 0:
@@ -195,14 +285,19 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
195
  bounding_box = analyzed_bounding_box["boundingBox"]
196
  annotation = Dictionary(
197
  Type=Name.Annot,
198
- Subtype=Name.Highlight,
199
  QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
200
  Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
201
  C=[0, 0, 0],
 
202
  CA=1, # Transparency
203
- T=analyzed_bounding_box["result"].entity_type
 
 
 
 
204
  )
205
- annotations_on_page.append(annotation)
206
 
207
  annotations_all_pages.extend([annotations_on_page])
208
 
@@ -210,27 +305,92 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
210
  page.Annots = pdf.make_indirect(annotations_on_page)
211
 
212
  page_num += 1
213
-
214
- # Extracting data from dictionaries
215
- # extracted_data = []
216
- # for item in annotations_all_pages:
217
- # temp_dict = {}
218
- # #print(item)
219
- # for key, value in item.items():
220
- # if isinstance(value, Decimal):
221
- # temp_dict[key] = float(value)
222
- # elif isinstance(value, list):
223
- # temp_dict[key] = [float(v) if isinstance(v, Decimal) else v for v in value]
224
- # else:
225
- # temp_dict[key] = value
226
- # extracted_data.append(temp_dict)
227
-
228
- # Creating DataFrame
229
- # annotations_out = pd.DataFrame(extracted_data)
230
- #print(df)
231
-
232
- #annotations_out.to_csv("examples/annotations.csv")
233
 
234
- analyzed_bounding_boxes_df.to_csv("output/annotations_made.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
- return pdf
 
2
  from typing import List
3
  import pandas as pd
4
  from presidio_image_redactor import ImageRedactorEngine, ImageAnalyzerEngine
5
+ from presidio_image_redactor.entities import ImageRecognizerResult
6
  from pdfminer.high_level import extract_pages
7
  from tools.file_conversion import process_file
8
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
9
  from pikepdf import Pdf, Dictionary, Name
10
  from gradio import Progress
11
  import time
12
+ from collections import defaultdict # For efficient grouping
13
 
14
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
15
+ from tools.helper_functions import get_file_path_end, output_folder
16
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
17
  import gradio as gr
18
 
19
+
20
  def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
21
 
22
  tic = time.perf_counter()
 
40
  # return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
41
 
42
  pdf_images = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat)
43
+ out_image_file_path = output_folder + file_path_without_ext + "_result_as_img.pdf"
44
  pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
45
 
46
  out_file_paths.append(out_image_file_path)
 
52
 
53
  # Analyse text-based pdf
54
  pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
55
+ out_text_file_path = output_folder + file_path_without_ext + "_result_as_text.pdf"
56
  pdf_text.save(out_text_file_path)
57
 
58
  out_file_paths.append(out_text_file_path)
 
72
 
73
  return out_message, out_file_paths, out_file_paths
74
 
def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
    """Merge image PII bounding boxes that sit close together on the same line.

    Boxes are first bucketed by approximate vertical position (``top`` divided
    by ``vertical_threshold`` and rounded), then, within each bucket sorted
    left to right, neighbouring boxes whose horizontal gap is at most
    ``horizontal_threshold`` are fused into one box covering both.

    Returns a list of (possibly merged) ``ImageRecognizerResult`` objects.
    """
    result = []

    # Bucket boxes by rounded vertical position so boxes on (roughly) the
    # same text line land in the same group.
    rows = defaultdict(list)
    for bbox in bboxes:
        rows[round(bbox.top / vertical_threshold)].append(bbox)

    # Merge left-to-right within each vertical bucket.
    for row in rows.values():
        row.sort(key=lambda b: b.left)

        current = row[0]
        for candidate in row[1:]:
            gap = candidate.left - (current.left + current.width)
            if gap <= horizontal_threshold:
                print("Merging a box")
                # Union of the two rectangles.
                left = min(current.left, candidate.left)
                top = min(current.top, candidate.top)
                right = max(current.left + current.width, candidate.left + candidate.width)
                bottom = max(current.top + current.height, candidate.top + candidate.height)
                current = ImageRecognizerResult(
                    current.entity_type, current.start, current.end, current.score,
                    left, top, right - left, bottom - top
                )
            else:
                result.append(current)
                current = candidate

        result.append(current)
    return result
105
 
106
  def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
107
  '''
108
+ Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
109
  '''
110
+ from PIL import Image, ImageChops, ImageDraw
111
+
112
+ fill = (0, 0, 0)
113
 
114
  if not image_paths:
115
 
 
119
 
120
  image_paths = process_file(file_path)
121
 
 
 
 
122
  images = []
123
  number_of_pages = len(image_paths)
124
 
 
133
  # Get the image to redact using PIL lib (pillow)
134
  image = image_paths[i] #Image.open(image_paths[i])
135
 
136
+ image = ImageChops.duplicate(image)
137
+
138
  # %%
139
  image_analyser = ImageAnalyzerEngine(nlp_analyser)
140
  engine = ImageRedactorEngine(image_analyser)
 
143
  ocr_lang = 'eng'
144
  else: ocr_lang = language
145
 
146
+ bboxes = image_analyser.analyze(image,ocr_kwargs={"lang": ocr_lang},
147
+ **{
148
+ "allow_list": allow_list,
 
 
 
 
 
149
  "language": language,
150
  "entities": chosen_redact_entities,
151
  "score_threshold": score_threshold
152
+ })
 
153
 
154
+ #print("For page: ", str(i), "Bounding boxes: ", bboxes)
155
+
156
+ draw = ImageDraw.Draw(image)
157
+
158
+ merged_bboxes = merge_img_bboxes(bboxes)
159
+
160
+ print("For page: ", str(i), "Merged bounding boxes: ", merged_bboxes)
161
+
162
+ # 3. Draw the merged boxes (unchanged)
163
+ for box in merged_bboxes:
164
+ x0 = box.left
165
+ y0 = box.top
166
+ x1 = x0 + box.width
167
+ y1 = y0 + box.height
168
+ draw.rectangle([x0, y0, x1, y1], fill=fill)
169
+
170
+ images.append(image)
171
 
172
  return images
173
 
174
+
175
  def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
176
  '''
177
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
 
182
  annotations_all_pages = []
183
  analyzed_bounding_boxes_df = pd.DataFrame()
184
 
185
+ # Horizontal distance between PII bounding boxes under/equal they are combined into one
186
+ combine_pixel_dist = 100
187
+
188
  pdf = Pdf.open(filename)
189
 
190
  page_num = 0
191
 
192
  for page in progress.tqdm(pdf.pages, total=len(pdf.pages), unit="pages", desc="Redacting pages"):
193
 
 
194
  print("Page number is: ", page_num)
195
 
196
  annotations_on_page = []
 
213
  return_decision_process=False,
214
  allow_list=allow_list)
215
 
 
 
 
 
216
  characters = [char # This is what we want to include in the list
217
  for line in text_container # Loop through each line in text_container
218
  if isinstance(line, LTTextLine) # Check if the line is an instance of LTTextLine
219
  for char in line] # Loop through each character in the line
220
  #if isinstance(char, LTChar)] # Check if the character is not an instance of LTAnno #isinstance(char, LTChar) or
221
 
 
 
222
 
223
+ # if len(analyzer_results) > 0 and len(characters) > 0:
224
+ # analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
225
+ # combined_analyzer_results.extend(analyzer_results)
226
+
227
+ # Inside the loop where you process analyzer_results:
228
  if len(analyzer_results) > 0 and len(characters) > 0:
229
+ merged_bounding_boxes = []
230
+ current_box = None
231
+ current_y = None
232
+
233
+ for result in analyzer_results:
234
+ for char in characters[result.start : result.end]:
235
+ if isinstance(char, LTChar):
236
+ char_box = list(char.bbox)
237
+
238
+ # Fix: Check if either current_y or current_box are None
239
+ if current_y is None or current_box is None:
240
+ # This is the first character, so initialize current_box and current_y
241
+ current_box = char_box
242
+ current_y = char_box[1]
243
+ else: # Now we have previous values to compare
244
+ print("Comparing values")
245
+ vertical_diff_bboxes = abs(char_box[1] - current_y)
246
+ horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
247
+ #print("Vertical distance with last bbox: ", str(vertical_diff_bboxes), "Horizontal distance: ", str(horizontal_diff_bboxes), "For result: ", result)
248
+
249
+ if (
250
+ vertical_diff_bboxes <= 5
251
+ and horizontal_diff_bboxes <= combine_pixel_dist
252
+ ):
253
+ old_right_pos = current_box[2]
254
+ current_box[2] = char_box[2]
255
+
256
+ print("Old right pos: ", str(old_right_pos), "has been replaced with: ", str(current_box[2]), "for result: ", result)
257
+
258
+ else:
259
+ merged_bounding_boxes.append(
260
+ {"boundingBox": current_box, "result": result})
261
+
262
+ current_box = char_box
263
+ current_y = char_box[1]
264
+ # Add the last box
265
+ if current_box:
266
+ merged_bounding_boxes.append({"boundingBox": current_box, "result": result})
267
+
268
+ if not merged_bounding_boxes:
269
+ analyzed_bounding_boxes.extend({"boundingBox": char.bbox, "result": result} for result in analyzer_results for char in characters[result.start:result.end] if isinstance(char, LTChar))
270
+ else:
271
+ analyzed_bounding_boxes.extend(merged_bounding_boxes)
272
+
273
  combined_analyzer_results.extend(analyzer_results)
274
 
275
  if len(analyzer_results) > 0:
 
285
  bounding_box = analyzed_bounding_box["boundingBox"]
286
  annotation = Dictionary(
287
  Type=Name.Annot,
288
+ Subtype=Name.Square, #Name.Highlight,
289
  QuadPoints=[bounding_box[0], bounding_box[3], bounding_box[2], bounding_box[3], bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[1]],
290
  Rect=[bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]],
291
  C=[0, 0, 0],
292
+ IC=[0, 0, 0],
293
  CA=1, # Transparency
294
+ T=analyzed_bounding_box["result"].entity_type,
295
+ BS=Dictionary(
296
+ W=0, # Border width: 1 point
297
+ S=Name.S # Border style: solid
298
+ )
299
  )
300
+ annotations_on_page.append(annotation)
301
 
302
  annotations_all_pages.extend([annotations_on_page])
303
 
 
305
  page.Annots = pdf.make_indirect(annotations_on_page)
306
 
307
  page_num += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
+ analyzed_bounding_boxes_df.to_csv(output_folder + "annotations_made.csv")
310
+
311
+ return pdf
312
+
313
+
314
+ # for page_num, annotations_on_page in enumerate(annotations_all_pages):
315
+ # # 2. Normalize annotation heights on the same line:
316
+ # line_heights = {} # {y_coordinate: max_height}
317
+
318
+ # # Get line heights for every annotation
319
+ # for annotation in annotations_on_page:
320
+ # if 'Rect' in annotation:
321
+ # y = annotation['Rect'][1]
322
+ # height = annotation['Rect'][3] - annotation['Rect'][1]
323
+ # line_heights[y] = max(line_heights.get(y, 0), height)
324
+
325
+ # # Update line heights for annotations
326
+ # for annotation in annotations_on_page:
327
+ # if 'Rect' in annotation:
328
+ # y = annotation['Rect'][1]
329
+ # annotation['Rect'][3] = y + line_heights[y]
330
+
331
+ # # Update QuadPoints to match the new Rect coordinates
332
+ # x1, y1, x2, y2 = annotation['Rect'] # Extract coordinates from Rect
333
+ # annotation['QuadPoints'] = [
334
+ # x1, y2, # Top left
335
+ # x2, y2, # Top right
336
+ # x1, y1, # Bottom left
337
+ # x2, y1 # Bottom right
338
+ # ]
339
+
340
+
341
+ # def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
342
+ # '''
343
+ # take an path for an image of a document, then run this image through the Presidio ImageAnalyzer to get a redacted page back
344
+ # '''
345
+
346
+ # if not image_paths:
347
+
348
+ # out_message = "PDF does not exist as images. Converting pages to image"
349
+ # print(out_message)
350
+ # progress(0, desc=out_message)
351
+
352
+ # image_paths = process_file(file_path)
353
+
354
+ # # Create a new PDF
355
+ # #pdf = pikepdf.new()
356
+
357
+ # images = []
358
+ # number_of_pages = len(image_paths)
359
+
360
+ # out_message = "Redacting pages"
361
+ # print(out_message)
362
+ # progress(0.1, desc=out_message)
363
+
364
+ # for i in progress.tqdm(range(0,number_of_pages), total=number_of_pages, unit="pages", desc="Redacting pages"):
365
+
366
+ # print("Redacting page ", str(i + 1))
367
+
368
+ # # Get the image to redact using PIL lib (pillow)
369
+ # image = image_paths[i] #Image.open(image_paths[i])
370
+
371
+ # # %%
372
+ # image_analyser = ImageAnalyzerEngine(nlp_analyser)
373
+ # engine = ImageRedactorEngine(image_analyser)
374
+
375
+ # if language == 'en':
376
+ # ocr_lang = 'eng'
377
+ # else: ocr_lang = language
378
+
379
+ # # %%
380
+ # # Redact the image with pink color
381
+ # redacted_image = engine.redact(image,
382
+ # fill=(0, 0, 0),
383
+ # ocr_kwargs={"lang": ocr_lang},
384
+ # allow_list=allow_list,
385
+ # ad_hoc_recognizers= None,
386
+ # **{
387
+ # "language": language,
388
+ # "entities": chosen_redact_entities,
389
+ # "score_threshold": score_threshold
390
+ # },
391
+ # )
392
+
393
+ # images.append(redacted_image)
394
+
395
 
396
+ # return images
tools/helper_functions.py CHANGED
@@ -1,5 +1,23 @@
1
  import os
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  def get_file_path_end(file_path):
4
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
5
  basename = os.path.basename(file_path)
 
1
  import os
2
 
3
def get_or_create_env_var(var_name: str, default_value: str) -> str:
    """Return the value of environment variable ``var_name``.

    If the variable is not set, it is created with ``default_value`` and that
    default is returned, so later reads (and child processes) see a
    consistent value.
    """
    # setdefault returns the existing value, or stores and returns the
    # default when the variable is absent — same get/check/set the previous
    # version spelled out by hand.
    return os.environ.setdefault(var_name, default_value)

# Retrieving or setting output folder
env_var_name = 'GRADIO_OUTPUT_FOLDER'
default_value = 'output/'

output_folder = get_or_create_env_var(env_var_name, default_value)
print(f'The value of {env_var_name} is {output_folder}')
20
+
21
  def get_file_path_end(file_path):
22
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
23
  basename = os.path.basename(file_path)