Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Jan 15

Commit

0c2987b

1 Parent(s): 143e2cc

Corrected image resizing method for instances where the image is very large.

Browse files

Files changed (4) hide show

app.py +1 -1
tools/file_conversion.py +26 -8
tools/file_redaction.py +2 -2
tools/redaction_review.py +48 -31

app.py CHANGED Viewed

@@ -473,7 +473,7 @@ print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')
 MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
 print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
-MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '100mb')
 print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
 GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))

 MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
 print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
+MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
 print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
 GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))

tools/file_conversion.py CHANGED Viewed

@@ -16,6 +16,7 @@ from typing import List, Optional
 from concurrent.futures import ThreadPoolExecutor, as_completed
 image_dpi = 300.0
 def is_pdf_or_image(filename):
     """
@@ -74,14 +75,31 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
             image.save(out_path, format="PNG")
         # Check file size and resize if necessary
-        max_size = 5 * 1024 * 1024  # 5 MB in bytes
-        file_size = os.path.getsize(out_path)
-        if file_size >= max_size:
-            # Resize the image while maintaining aspect ratio
-            ratio = (max_size / file_size) ** 0.5
-            new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
-            image = image.resize(new_size, Image.ANTIALIAS)
-            image.save(out_path, format="PNG")  # Overwrite with resized image
         return page_num, out_path

 from concurrent.futures import ThreadPoolExecutor, as_completed
 image_dpi = 300.0
+Image.MAX_IMAGE_PIXELS = None
 def is_pdf_or_image(filename):
     """
             image.save(out_path, format="PNG")
         # Check file size and resize if necessary
+        # Cap each saved page image at 5 MB (expressed in bytes).
+        max_size = 5 * 1024 * 1024
+        file_size = os.path.getsize(out_path)
+        # Resize images if they are too big
+        if file_size > max_size:
+            # Start with the original image size
+            width, height = image.size
+            # Report the starting dimensions: the halved dimensions are not assigned
+            # until the loop below runs, so they must not be referenced here.
+            print(f"Image size before {width}x{height}, original file_size: {file_size}")
+            while file_size > max_size:
+                # Halve both dimensions on every pass until the PNG fits under max_size
+                new_width = int(width * 0.5)
+                new_height = int(height * 0.5)
+                image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+                # Overwrite the saved file with the resized image
+                image.save(out_path, format="PNG", optimize=True)
+                # Re-measure the file on disk to decide whether another pass is needed
+                file_size = os.path.getsize(out_path)
+                print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")
+                # Update the dimensions for the next iteration
+                width, height = new_width, new_height
         return page_num, out_path

tools/file_redaction.py CHANGED Viewed

@@ -315,11 +315,11 @@ def choose_and_run_redactor(file_paths:List[str],
              redact_whole_page_list)
-            print("log_files_output_paths at end of image redact function:", log_files_output_paths)
             # Save Textract request metadata (if exists)
             if new_request_metadata:
-                print("Request metadata:", new_request_metadata)
                 all_request_metadata.append(new_request_metadata)
         elif in_redact_method == text_ocr_option:

              redact_whole_page_list)
+            #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
             # Save Textract request metadata (if exists)
             if new_request_metadata:
+                #print("Request metadata:", new_request_metadata)
                 all_request_metadata.append(new_request_metadata)
         elif in_redact_method == text_ocr_option:

tools/redaction_review.py CHANGED Viewed

@@ -13,6 +13,7 @@ import os
 import pymupdf
 from fitz import Document
 from PIL import ImageDraw, Image
 def decrease_page(number:int):
     '''
@@ -49,6 +50,53 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
     return current_zoom_level, annotate_current_page
 def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
     '''
     Update a gradio_image_annotation object with new annotation data
@@ -77,7 +125,6 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
         review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
         recogniser_dataframe_out = gr.Dataframe(review_dataframe)
     zoom_str = str(zoom) + '%'
     if not image_annotator_object:
@@ -126,38 +173,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
     if page_num_reported > page_max_reported:
         page_num_reported = page_max_reported
-    from collections import defaultdict
-    # Remove duplicate elements that are blank
-    def remove_duplicate_images_with_blank_boxes(data: List[AnnotatedImageData]) -> List[AnnotatedImageData]:
-        # Group items by 'image'
-        image_groups = defaultdict(list)
-        for item in data:
-            image_groups[item['image']].append(item)
-        # Process each group to retain only the entry with non-empty boxes, if available
-        result = []
-        for image, items in image_groups.items():
-            # Filter items with non-empty boxes
-            non_empty_boxes = [item for item in items if item['boxes']]
-            if non_empty_boxes:
-                # Keep the first entry with non-empty boxes
-                result.append(non_empty_boxes[0])
-            else:
-                # If no non-empty boxes, keep the first item with empty boxes
-                result.append(items[0])
-        #print("result:", result)
-        return result
-    #print("image_annotator_object in update_annotator before function:", image_annotator_object)
     image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
-    #print("image_annotator_object in update_annotator after function:", image_annotator_object)
-    #print("image_annotator_object[page_num_reported - 1]:", image_annotator_object[page_num_reported - 1])
     out_image_annotator = image_annotator(
         value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,

 import pymupdf
 from fitz import Document
 from PIL import ImageDraw, Image
+from collections import defaultdict
 def decrease_page(number:int):
     '''
     return current_zoom_level, annotate_current_page
+   # Remove duplicate elements that are blank
+    # def remove_duplicate_images_with_blank_boxes(data: List[AnnotatedImageData]) -> List[AnnotatedImageData]:
+    #     # Group items by 'image'
+    #     image_groups = defaultdict(list)
+    #     for item in data:
+    #         image_groups[item['image']].append(item)
+    #     # Process each group to retain only the entry with non-empty boxes, if available
+    #     result = []
+    #     for image, items in image_groups.items():
+    #         # Filter items with non-empty boxes
+    #         non_empty_boxes = [item for item in items if item['boxes']]
+    #         if non_empty_boxes:
+    #             # Keep the first entry with non-empty boxes
+    #             result.append(non_empty_boxes[0])
+    #         else:
+    #             # If no non-empty boxes, keep the first item with empty boxes
+    #             result.append(items[0])
+    #     #print("result:", result)
+    #     return result
+def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
+    '''
+    Remove items from the annotator object where the same page exists twice.
+    Items are grouped by their 'image' value. For each image, the first item with a
+    non-empty 'boxes' entry is kept; if no item in the group has boxes, the group's
+    first item is kept. The result follows the order in which each image first
+    appears in data, and contains one item per distinct image.
+    '''
+    # Group items by 'image'
+    image_groups = defaultdict(list)
+    for item in data:
+        image_groups[item['image']].append(item)
+    # Only the grouped items are needed, so iterate the values. next() stops at the
+    # first item with boxes and falls back to the group's first item otherwise.
+    return [
+        next((item for item in items if item.get('boxes')), items[0])
+        for items in image_groups.values()
+    ]
 def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
     '''
     Update a gradio_image_annotation object with new annotation data
         review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
         recogniser_dataframe_out = gr.Dataframe(review_dataframe)
     zoom_str = str(zoom) + '%'
     if not image_annotator_object:
     if page_num_reported > page_max_reported:
         page_num_reported = page_max_reported
     image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
     out_image_annotator = image_annotator(
         value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,