Commit
·
0c2987b
1
Parent(s):
143e2cc
Corrected image resizing method for instances where the image is very large.
Browse files- app.py +1 -1
- tools/file_conversion.py +26 -8
- tools/file_redaction.py +2 -2
- tools/redaction_review.py +48 -31
app.py
CHANGED
@@ -473,7 +473,7 @@ print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')
|
|
473 |
MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
|
474 |
print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
|
475 |
|
476 |
-
MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '
|
477 |
print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
|
478 |
|
479 |
GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
|
|
|
473 |
# Queue size setting, converted to int; '5' is the fallback passed to the helper.
MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
# The label previously said RUN_DIRECT_MODE (copy-paste slip); report the setting actually read.
print(f'The value of MAX_QUEUE_SIZE is {MAX_QUEUE_SIZE}')

# File size setting, kept as a string (fallback '250mb'); it is not parsed here.
MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')

# Server port setting, converted to int; '7860' is the fallback passed to the helper.
GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
|
tools/file_conversion.py
CHANGED
@@ -16,6 +16,7 @@ from typing import List, Optional
|
|
16 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
17 |
|
18 |
image_dpi = 300.0
|
|
|
19 |
|
20 |
def is_pdf_or_image(filename):
|
21 |
"""
|
@@ -74,14 +75,31 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
|
|
74 |
image.save(out_path, format="PNG")
|
75 |
|
76 |
# Check file size and resize if necessary
|
77 |
-
max_size = 5 * 1024 * 1024 # 5 MB in bytes
|
78 |
-
file_size = os.path.getsize(out_path)
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
return page_num, out_path
|
87 |
|
|
|
16 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
17 |
|
18 |
image_dpi = 300.0
|
19 |
+
Image.MAX_IMAGE_PIXELS = None
|
20 |
|
21 |
def is_pdf_or_image(filename):
|
22 |
"""
|
|
|
75 |
image.save(out_path, format="PNG")
|
76 |
|
77 |
# Check file size and resize if necessary
|
78 |
+
max_size = 5 * 1024 * 1024  # 5 MB in bytes
file_size = os.path.getsize(out_path)

# Resize images if they are too big
if file_size > max_size:
    # Start with the original image size
    width, height = image.size

    # The halved dimensions are not computed until the loop below, so report
    # the current ones here (referencing new_width at this point would raise).
    print(f"Image size before {width}x{height}, original file_size: {file_size}")

    while file_size > max_size:
        # Halve each dimension, never dropping below one pixel so that
        # resize() is not handed a zero-sized target.
        new_width = max(1, int(width * 0.5))
        new_height = max(1, int(height * 0.5))

        # Nothing left to shrink: stop rather than loop forever.
        if new_width == width and new_height == height:
            break

        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

        # Overwrite the saved page image with the smaller version
        image.save(out_path, format="PNG", optimize=True)

        # Re-measure the file on disk to decide whether another pass is needed
        file_size = os.path.getsize(out_path)
        print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")

        # Update the dimensions for the next iteration
        width, height = new_width, new_height
|
103 |
|
104 |
return page_num, out_path
|
105 |
|
tools/file_redaction.py
CHANGED
@@ -315,11 +315,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
315 |
redact_whole_page_list)
|
316 |
|
317 |
|
318 |
-
print("log_files_output_paths at end of image redact function:", log_files_output_paths)
|
319 |
|
320 |
# Save Textract request metadata (if exists)
|
321 |
if new_request_metadata:
|
322 |
-
print("Request metadata:", new_request_metadata)
|
323 |
all_request_metadata.append(new_request_metadata)
|
324 |
|
325 |
elif in_redact_method == text_ocr_option:
|
|
|
315 |
redact_whole_page_list)
|
316 |
|
317 |
|
318 |
+
#print("log_files_output_paths at end of image redact function:", log_files_output_paths)
|
319 |
|
320 |
# Save Textract request metadata (if exists)
|
321 |
if new_request_metadata:
|
322 |
+
#print("Request metadata:", new_request_metadata)
|
323 |
all_request_metadata.append(new_request_metadata)
|
324 |
|
325 |
elif in_redact_method == text_ocr_option:
|
tools/redaction_review.py
CHANGED
@@ -13,6 +13,7 @@ import os
|
|
13 |
import pymupdf
|
14 |
from fitz import Document
|
15 |
from PIL import ImageDraw, Image
|
|
|
16 |
|
17 |
def decrease_page(number:int):
|
18 |
'''
|
@@ -49,6 +50,53 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
|
|
49 |
|
50 |
return current_zoom_level, annotate_current_page
|
51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
|
53 |
'''
|
54 |
Update a gradio_image_annotation object with new annotation data
|
@@ -77,7 +125,6 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
|
|
77 |
review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
|
78 |
recogniser_dataframe_out = gr.Dataframe(review_dataframe)
|
79 |
|
80 |
-
|
81 |
zoom_str = str(zoom) + '%'
|
82 |
|
83 |
if not image_annotator_object:
|
@@ -126,38 +173,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
|
|
126 |
if page_num_reported > page_max_reported:
|
127 |
page_num_reported = page_max_reported
|
128 |
|
129 |
-
from collections import defaultdict
|
130 |
-
|
131 |
-
# Remove duplicate elements that are blank
|
132 |
-
def remove_duplicate_images_with_blank_boxes(data: List[AnnotatedImageData]) -> List[AnnotatedImageData]:
|
133 |
-
# Group items by 'image'
|
134 |
-
image_groups = defaultdict(list)
|
135 |
-
for item in data:
|
136 |
-
image_groups[item['image']].append(item)
|
137 |
-
|
138 |
-
# Process each group to retain only the entry with non-empty boxes, if available
|
139 |
-
result = []
|
140 |
-
for image, items in image_groups.items():
|
141 |
-
# Filter items with non-empty boxes
|
142 |
-
non_empty_boxes = [item for item in items if item['boxes']]
|
143 |
-
if non_empty_boxes:
|
144 |
-
# Keep the first entry with non-empty boxes
|
145 |
-
result.append(non_empty_boxes[0])
|
146 |
-
else:
|
147 |
-
# If no non-empty boxes, keep the first item with empty boxes
|
148 |
-
result.append(items[0])
|
149 |
-
|
150 |
-
#print("result:", result)
|
151 |
-
|
152 |
-
return result
|
153 |
-
|
154 |
-
#print("image_annotator_object in update_annotator before function:", image_annotator_object)
|
155 |
-
|
156 |
image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
|
157 |
|
158 |
-
#print("image_annotator_object in update_annotator after function:", image_annotator_object)
|
159 |
-
#print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
|
160 |
-
|
161 |
out_image_annotator = image_annotator(
|
162 |
value = image_annotator_object[page_num_reported - 1],
|
163 |
boxes_alpha=0.1,
|
|
|
13 |
import pymupdf
|
14 |
from fitz import Document
|
15 |
from PIL import ImageDraw, Image
|
16 |
+
from collections import defaultdict
|
17 |
|
18 |
def decrease_page(number:int):
|
19 |
'''
|
|
|
50 |
|
51 |
return current_zoom_level, annotate_current_page
|
52 |
|
53 |
+
|
54 |
+
# Remove duplicate elements that are blank
|
55 |
+
# def remove_duplicate_images_with_blank_boxes(data: List[AnnotatedImageData]) -> List[AnnotatedImageData]:
|
56 |
+
# # Group items by 'image'
|
57 |
+
# image_groups = defaultdict(list)
|
58 |
+
# for item in data:
|
59 |
+
# image_groups[item['image']].append(item)
|
60 |
+
|
61 |
+
# # Process each group to retain only the entry with non-empty boxes, if available
|
62 |
+
# result = []
|
63 |
+
# for image, items in image_groups.items():
|
64 |
+
# # Filter items with non-empty boxes
|
65 |
+
# non_empty_boxes = [item for item in items if item['boxes']]
|
66 |
+
# if non_empty_boxes:
|
67 |
+
# # Keep the first entry with non-empty boxes
|
68 |
+
# result.append(non_empty_boxes[0])
|
69 |
+
# else:
|
70 |
+
# # If no non-empty boxes, keep the first item with empty boxes
|
71 |
+
# result.append(items[0])
|
72 |
+
|
73 |
+
# #print("result:", result)
|
74 |
+
|
75 |
+
# return result
|
76 |
+
|
77 |
+
def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
    '''
    Remove items from the annotator object where the same page exists twice.

    Items are grouped by their 'image' value. For each image, the first item
    with a non-empty 'boxes' entry is kept; if no item for that image has
    boxes, the first item seen is kept. Output order follows the first
    appearance of each image in the input.

    Args:
        data: List of dicts, each with an 'image' key and an optional 'boxes' key.

    Returns:
        A new list with at most one item per distinct image. An empty or
        missing input gives an empty list.
    '''
    if not data:
        return []

    # Plain dicts preserve insertion order, and reassigning an existing key
    # keeps its original position, so first-appearance order is retained.
    kept_by_image = {}
    for item in data:
        image = item['image']
        if image not in kept_by_image:
            kept_by_image[image] = item
        elif not kept_by_image[image].get('boxes') and item.get('boxes'):
            # The placeholder kept so far has no boxes; prefer the first item that does.
            kept_by_image[image] = item

    return list(kept_by_image.values())
|
99 |
+
|
100 |
def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
|
101 |
'''
|
102 |
Update a gradio_image_annotation object with new annotation data
|
|
|
125 |
review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
|
126 |
recogniser_dataframe_out = gr.Dataframe(review_dataframe)
|
127 |
|
|
|
128 |
zoom_str = str(zoom) + '%'
|
129 |
|
130 |
if not image_annotator_object:
|
|
|
173 |
if page_num_reported > page_max_reported:
|
174 |
page_num_reported = page_max_reported
|
175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
|
177 |
|
|
|
|
|
|
|
178 |
out_image_annotator = image_annotator(
|
179 |
value = image_annotator_object[page_num_reported - 1],
|
180 |
boxes_alpha=0.1,
|