seanpedrickcase committed on
Commit
0c2987b
·
1 Parent(s): 143e2cc

Corrected image resizing method for instances where the image is very large.

Browse files
app.py CHANGED
@@ -473,7 +473,7 @@ print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')
473
  MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
474
  print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
475
 
476
- MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '100mb')
477
  print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
478
 
479
  GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
 
473
  MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
474
  print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
475
 
476
+ MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
477
  print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
478
 
479
  GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
tools/file_conversion.py CHANGED
@@ -16,6 +16,7 @@ from typing import List, Optional
16
  from concurrent.futures import ThreadPoolExecutor, as_completed
17
 
18
  image_dpi = 300.0
 
19
 
20
  def is_pdf_or_image(filename):
21
  """
@@ -74,14 +75,31 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
74
  image.save(out_path, format="PNG")
75
 
76
  # Check file size and resize if necessary
77
- max_size = 5 * 1024 * 1024 # 5 MB in bytes
78
- file_size = os.path.getsize(out_path)
79
- if file_size >= max_size:
80
- # Resize the image while maintaining aspect ratio
81
- ratio = (max_size / file_size) ** 0.5
82
- new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
83
- image = image.resize(new_size, Image.ANTIALIAS)
84
- image.save(out_path, format="PNG") # Overwrite with resized image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  return page_num, out_path
87
 
 
16
  from concurrent.futures import ThreadPoolExecutor, as_completed
17
 
18
  image_dpi = 300.0
19
+ Image.MAX_IMAGE_PIXELS = None
20
 
21
  def is_pdf_or_image(filename):
22
  """
 
75
  image.save(out_path, format="PNG")
76
 
77
  # Check file size and resize if necessary
78
+ max_size = 5 * 1024 * 1024 # 5 MB in bytes # 5
79
+ file_size = os.path.getsize(out_path)
80
+
81
+ # Resize images if they are too big
82
+ if file_size > max_size:
83
+ # Start with the original image size
84
+ width, height = image.size
85
+
86
+ print(f"Image size before {new_width}x{new_height}, original file_size: {file_size}")
87
+
88
+ while file_size > max_size:
89
+ # Reduce the size by a factor (e.g., 50% of the current size)
90
+ new_width = int(width * 0.5)
91
+ new_height = int(height * 0.5)
92
+ image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
93
+
94
+ # Save the resized image
95
+ image.save(out_path, format="PNG", optimize=True)
96
+
97
+ # Update the file size
98
+ file_size = os.path.getsize(out_path)
99
+ print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")
100
+
101
+ # Update the dimensions for the next iteration
102
+ width, height = new_width, new_height
103
 
104
  return page_num, out_path
105
 
tools/file_redaction.py CHANGED
@@ -315,11 +315,11 @@ def choose_and_run_redactor(file_paths:List[str],
315
  redact_whole_page_list)
316
 
317
 
318
- print("log_files_output_paths at end of image redact function:", log_files_output_paths)
319
 
320
  # Save Textract request metadata (if exists)
321
  if new_request_metadata:
322
- print("Request metadata:", new_request_metadata)
323
  all_request_metadata.append(new_request_metadata)
324
 
325
  elif in_redact_method == text_ocr_option:
 
315
  redact_whole_page_list)
316
 
317
 
318
+ #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
319
 
320
  # Save Textract request metadata (if exists)
321
  if new_request_metadata:
322
+ #print("Request metadata:", new_request_metadata)
323
  all_request_metadata.append(new_request_metadata)
324
 
325
  elif in_redact_method == text_ocr_option:
tools/redaction_review.py CHANGED
@@ -13,6 +13,7 @@ import os
13
  import pymupdf
14
  from fitz import Document
15
  from PIL import ImageDraw, Image
 
16
 
17
  def decrease_page(number:int):
18
  '''
@@ -49,6 +50,53 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
49
 
50
  return current_zoom_level, annotate_current_page
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
53
  '''
54
  Update a gradio_image_annotation object with new annotation data
@@ -77,7 +125,6 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
77
  review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
78
  recogniser_dataframe_out = gr.Dataframe(review_dataframe)
79
 
80
-
81
  zoom_str = str(zoom) + '%'
82
 
83
  if not image_annotator_object:
@@ -126,38 +173,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
126
  if page_num_reported > page_max_reported:
127
  page_num_reported = page_max_reported
128
 
129
- from collections import defaultdict
130
-
131
- # Remove duplicate elements that are blank
132
- def remove_duplicate_images_with_blank_boxes(data: List[AnnotatedImageData]) -> List[AnnotatedImageData]:
133
- # Group items by 'image'
134
- image_groups = defaultdict(list)
135
- for item in data:
136
- image_groups[item['image']].append(item)
137
-
138
- # Process each group to retain only the entry with non-empty boxes, if available
139
- result = []
140
- for image, items in image_groups.items():
141
- # Filter items with non-empty boxes
142
- non_empty_boxes = [item for item in items if item['boxes']]
143
- if non_empty_boxes:
144
- # Keep the first entry with non-empty boxes
145
- result.append(non_empty_boxes[0])
146
- else:
147
- # If no non-empty boxes, keep the first item with empty boxes
148
- result.append(items[0])
149
-
150
- #print("result:", result)
151
-
152
- return result
153
-
154
- #print("image_annotator_object in update_annotator before function:", image_annotator_object)
155
-
156
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
157
 
158
- #print("image_annotator_object in update_annotator after function:", image_annotator_object)
159
- #print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
160
-
161
  out_image_annotator = image_annotator(
162
  value = image_annotator_object[page_num_reported - 1],
163
  boxes_alpha=0.1,
 
13
  import pymupdf
14
  from fitz import Document
15
  from PIL import ImageDraw, Image
16
+ from collections import defaultdict
17
 
18
  def decrease_page(number:int):
19
  '''
 
50
 
51
  return current_zoom_level, annotate_current_page
52
 
53
+
54
+ # Remove duplicate elements that are blank
55
+ # def remove_duplicate_images_with_blank_boxes(data: List[AnnotatedImageData]) -> List[AnnotatedImageData]:
56
+ # # Group items by 'image'
57
+ # image_groups = defaultdict(list)
58
+ # for item in data:
59
+ # image_groups[item['image']].append(item)
60
+
61
+ # # Process each group to retain only the entry with non-empty boxes, if available
62
+ # result = []
63
+ # for image, items in image_groups.items():
64
+ # # Filter items with non-empty boxes
65
+ # non_empty_boxes = [item for item in items if item['boxes']]
66
+ # if non_empty_boxes:
67
+ # # Keep the first entry with non-empty boxes
68
+ # result.append(non_empty_boxes[0])
69
+ # else:
70
+ # # If no non-empty boxes, keep the first item with empty boxes
71
+ # result.append(items[0])
72
+
73
+ # #print("result:", result)
74
+
75
+ # return result
76
+
77
def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
    '''
    Collapse annotator entries that refer to the same image into a single entry.

    Entries are grouped by their 'image' value. For each group, the first
    entry whose 'boxes' value is present and non-empty is kept; if no entry
    in the group has boxes, the first entry of the group is kept instead.
    Groups are returned in the order in which each image first appears in
    the input.

    Args:
        data: List of annotator entries (dicts with an 'image' key and an
            optional 'boxes' key). The 'image' values are used as dict keys,
            so they must be hashable.

    Returns:
        A new list containing one entry per distinct 'image' value. The
        input list itself is not modified.
    '''
    # Group items by 'image'; dict insertion order keeps first-seen image order
    image_groups = defaultdict(list)
    for item in data:
        image_groups[item['image']].append(item)

    # Only the grouped entries are needed, so iterate the values directly.
    # next() stops at the first entry with boxes and falls back to the
    # first entry of the group when every entry has empty or missing boxes.
    return [
        next((item for item in items if item.get('boxes')), items[0])
        for items in image_groups.values()
    ]
99
+
100
  def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
101
  '''
102
  Update a gradio_image_annotation object with new annotation data
 
125
  review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
126
  recogniser_dataframe_out = gr.Dataframe(review_dataframe)
127
 
 
128
  zoom_str = str(zoom) + '%'
129
 
130
  if not image_annotator_object:
 
173
  if page_num_reported > page_max_reported:
174
  page_num_reported = page_max_reported
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
177
 
 
 
 
178
  out_image_annotator = image_annotator(
179
  value = image_annotator_object[page_num_reported - 1],
180
  boxes_alpha=0.1,