seanpedrickcase committed
Commit 84c83c0 · 1 Parent(s): a748df6

General improvement in quick image matching and merging

app.py CHANGED
@@ -89,7 +89,7 @@ with app:
     with gr.Tab("PDFs/images"):
         with gr.Accordion("Redact document", open = True):
             in_doc_files = gr.File(label="Choose document/image files (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'])
-            in_redaction_method = gr.Radio(label="Choose document redaction method. Note that for AWS Textract, there will be a cost to the service from use of AWS services.", value = "Simple text analysis - PDFs with selectable text", choices=["Simple text analysis - PDFs with selectable text", "Quick image analysis - typed text", "Complex image analysis - AWS Textract, handwriting/signatures"])
+            in_redaction_method = gr.Radio(label="Choose document redaction method. AWS Textract has a cost per page so please only use when needed.", value = "Simple text analysis - PDFs with selectable text", choices=["Simple text analysis - PDFs with selectable text", "Quick image analysis - typed text", "Complex image analysis - docs with handwriting/signatures (AWS Textract)"])
             gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses), please go to the redaction settings tab.""")
             document_redact_btn = gr.Button("Redact document(s)", variant="primary")

@@ -150,7 +150,7 @@ with app:
             page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
             page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
         with gr.Row():
-            handwrite_signature_checkbox = gr.CheckboxGroup(choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
+            handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
         with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
             anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
 
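The renamed Textract option above is not just a label: the exact choice strings from this gr.Radio are compared verbatim in tools/file_conversion.py and tools/file_redaction.py below, which is why the rename has to land in all three files at once. A minimal sketch of that string-dispatch pattern (the constant names here are illustrative, not from the repo):

TEXT_METHOD = "Simple text analysis - PDFs with selectable text"
QUICK_IMAGE_METHOD = "Quick image analysis - typed text"
TEXTRACT_METHOD = "Complex image analysis - docs with handwriting/signatures (AWS Textract)"

def choose_pipeline(in_redact_method: str) -> str:
    # Mirrors the if/elif comparisons in file_conversion.py and file_redaction.py
    if in_redact_method in (QUICK_IMAGE_METHOD, TEXTRACT_METHOD):
        return "image"
    if in_redact_method == TEXT_METHOD:
        return "text"
    raise ValueError("Unknown redaction method: " + in_redact_method)

assert choose_pipeline(TEXTRACT_METHOD) == "image"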
tools/aws_textract.py CHANGED
@@ -91,8 +91,9 @@ def json_to_ocrresult(json_data, page_width, page_height):
     handwriting_recogniser_results = []
     signatures = []
     handwriting = []
-
-    combined_results = {}
+    ocr_results_with_children = {}
+
+    i = 1

     for text_block in json_data:

@@ -100,17 +101,23 @@ def json_to_ocrresult(json_data, page_width, page_height):
         is_handwriting = False

         if (text_block['BlockType'] == 'LINE') | (text_block['BlockType'] == 'SIGNATURE'): # (text_block['BlockType'] == 'WORD') |

+            # Extract text and bounding box for the line
+            line_bbox = text_block["Geometry"]["BoundingBox"]
+            line_left = int(line_bbox["Left"] * page_width)
+            line_top = int(line_bbox["Top"] * page_height)
+            line_right = int((line_bbox["Left"] + line_bbox["Width"]) * page_width)
+            line_bottom = int((line_bbox["Top"] + line_bbox["Height"]) * page_height)
+
+            width_abs = int(line_bbox["Width"] * page_width)
+            height_abs = int(line_bbox["Height"] * page_height)
+
             if text_block['BlockType'] == 'LINE':
                 # Extract text and bounding box for the line
                 line_text = text_block.get('Text', '')
-                line_bbox = text_block["Geometry"]["BoundingBox"]
-                line_left = int(line_bbox["Left"] * page_width)
-                line_top = int(line_bbox["Top"] * page_height)
-                line_right = int((line_bbox["Left"] + line_bbox["Width"]) * page_width)
-                line_bottom = int((line_bbox["Top"] + line_bbox["Height"]) * page_height)

                 words = []
                 if 'Relationships' in text_block:

@@ -128,12 +135,12 @@ def json_to_ocrresult(json_data, page_width, page_height):
                     word_bottom = int((word_bbox["Top"] + word_bbox["Height"]) * page_height)

                     # Extract BoundingBox details
-                    width = word_bbox["Width"]
-                    height = word_bbox["Height"]
+                    word_width = word_bbox["Width"]
+                    word_height = word_bbox["Height"]

                     # Convert proportional coordinates to absolute coordinates
-                    width_abs = int(width * page_width)
-                    height_abs = int(height * page_height)
+                    word_width_abs = int(word_width * page_width)
+                    word_height_abs = int(word_height * page_height)

                     words.append({
                         'text': word_text,

@@ -146,18 +153,14 @@ def json_to_ocrresult(json_data, page_width, page_height):
                         is_handwriting = True
                         entity_name = "HANDWRITING"
                         word_end = len(entity_name)
-                        recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= word_text, score= confidence, start=0, end=word_end, left=word_left, top=word_top, width=width_abs, height=height_abs)
+                        recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= word_text, score= confidence, start=0, end=word_end, left=word_left, top=word_top, width=word_width_abs, height=word_height_abs)
                         handwriting.append(recogniser_result)
                         print("Handwriting found:", handwriting[-1])

-                combined_results[line_text] = {
-                    'bounding_box': (line_left, line_top, line_right, line_bottom),
-                    'words': words
-                }
-
             # If handwriting or signature, add to bounding box
             elif (text_block['BlockType'] == 'SIGNATURE'):
                 line_text = "SIGNATURE"

@@ -167,38 +170,26 @@ def json_to_ocrresult(json_data, page_width, page_height):
                 confidence = text_block['Confidence']
                 word_end = len(entity_name)

-                # Extract BoundingBox details
-                bbox = text_block["Geometry"]["BoundingBox"]
-                left = bbox["Left"]
-                top = bbox["Top"]
-                width = bbox["Width"]
-                height = bbox["Height"]
-
-                # Convert proportional coordinates to absolute coordinates
-                left_abs = int(left * page_width)
-                top_abs = int(top * page_height)
-                width_abs = int(width * page_width)
-                height_abs = int(height * page_height)
-
-                recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=left_abs, top=top_abs, width=width_abs, height=height_abs)
+                recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=line_left, top=line_top, width=width_abs, height=height_abs)
                 signatures.append(recogniser_result)
                 print("Signature found:", signatures[-1])

-                # Extract BoundingBox details
-                bbox = text_block["Geometry"]["BoundingBox"]
-                left = bbox["Left"]
-                top = bbox["Top"]
-                width = bbox["Width"]
-                height = bbox["Height"]
-
-                # Convert proportional coordinates to absolute coordinates
-                left_abs = int(left * page_width)
-                top_abs = int(top * page_height)
-                width_abs = int(width * page_width)
-                height_abs = int(height * page_height)
+                words = []
+                words.append({
+                    'text': line_text,
+                    'bounding_box': (line_left, line_top, line_right, line_bottom)
+                })
+
+            ocr_results_with_children["text_line_" + str(i)] = {
+                "line": i,
+                'text': line_text,
+                'bounding_box': (line_left, line_top, line_right, line_bottom),
+                'words': words
+            }

             # Create OCRResult with absolute coordinates
-            ocr_result = OCRResult(line_text, left_abs, top_abs, width_abs, height_abs)
+            ocr_result = OCRResult(line_text, line_left, line_top, width_abs, height_abs)
             all_ocr_results.append(ocr_result)

             is_signature_or_handwriting = is_signature | is_handwriting

@@ -209,5 +200,7 @@ def json_to_ocrresult(json_data, page_width, page_height):

             if is_signature: signature_recogniser_results.append(recogniser_result)
             if is_handwriting: handwriting_recogniser_results.append(recogniser_result)
+
+            i += 1

-    return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, combined_results
+    return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
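The key change in this file is the return value: the text-keyed combined_results dict becomes ocr_results_with_children, keyed positionally as "text_line_1", "text_line_2", and so on, so duplicate line texts no longer overwrite each other and downstream code can look lines up by index. A minimal sketch of the entry one Textract LINE block produces, using the same proportional-to-absolute coordinate conversion as above:

def line_block_to_children(text_block: dict, i: int, page_width: int, page_height: int) -> dict:
    # Textract BoundingBox values are proportions of the page, so scale them to pixels
    bbox = text_block["Geometry"]["BoundingBox"]
    left = int(bbox["Left"] * page_width)
    top = int(bbox["Top"] * page_height)
    right = int((bbox["Left"] + bbox["Width"]) * page_width)
    bottom = int((bbox["Top"] + bbox["Height"]) * page_height)
    return {"text_line_" + str(i): {
        "line": i,
        "text": text_block.get("Text", ""),
        "bounding_box": (left, top, right, bottom),
        "words": [],  # filled from the block's WORD children in the real function
    }}

block = {"BlockType": "LINE", "Text": "John Smith",
         "Geometry": {"BoundingBox": {"Left": 0.1, "Top": 0.2, "Width": 0.3, "Height": 0.05}}}
print(line_block_to_children(block, 1, 1000, 1400))
# {'text_line_1': {'line': 1, 'text': 'John Smith', 'bounding_box': (100, 280, 400, 350), 'words': []}}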
tools/custom_image_analyser_engine.py CHANGED
@@ -9,6 +9,7 @@ import PIL
 from PIL import ImageDraw, ImageFont, Image
 from typing import Optional, Tuple, Union
 from copy import deepcopy
+import string # Import string to get a list of common punctuation characters

 @dataclass
 class OCRResult:

@@ -399,6 +400,11 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
         adjusted_contrast = contrast
         return adjusted_image, contrast, adjusted_contrast

+def bounding_boxes_overlap(box1, box2):
+    """Check if two bounding boxes overlap."""
+    return (box1[0] < box2[2] and box2[0] < box1[2] and
+            box1[1] < box2[3] and box2[1] < box1[3])
+
 class CustomImageAnalyzerEngine:
     def __init__(
         self,

@@ -412,13 +418,6 @@ class CustomImageAnalyzerEngine:
         self.tesseract_config = tesseract_config or '--oem 3 --psm 11'

         if not image_preprocessor:
-            # image_preprocessor = ImagePreprocessor(
-            #     c_low_contrast=10,
-            #     c_high_contrast=20,
-            #     contrast_threshold=0.5,
-            #     bg_threshold=128,
-            #     block_size=11
-            # )
             image_preprocessor = ContrastSegmentedImageEnhancer()
             #print(image_preprocessor)
         self.image_preprocessor = image_preprocessor

@@ -432,9 +431,6 @@ class CustomImageAnalyzerEngine:

         image_processed, preprocessing_metadata = self.image_preprocessor.preprocess_image(image)

-        #print("pre-processing metadata:", preprocessing_metadata)
-        #image_processed.save("image_processed.png")
-
         ocr_data = pytesseract.image_to_data(image_processed, output_type=pytesseract.Output.DICT, config=self.tesseract_config)

         if preprocessing_metadata and ("scale_factor" in preprocessing_metadata):

@@ -460,64 +456,95 @@ class CustomImageAnalyzerEngine:

     def analyze_text(
         self,
-        ocr_results: List[OCRResult],
+        line_level_ocr_results: List[OCRResult],
         ocr_results_with_children: Dict[str, Dict],
         **text_analyzer_kwargs
     ) -> List[CustomImageRecognizerResult]:
         # Define English as default language, if not specified
         if "language" not in text_analyzer_kwargs:
             text_analyzer_kwargs["language"] = "en"
+
+        horizontal_buffer = 0 # add pixels to right of width
+        height_buffer = 2 # add pixels to bounding box height

         allow_list = text_analyzer_kwargs.get('allow_list', [])

         combined_results = []
-        for ocr_result in ocr_results:
+        for i, line_level_ocr_result in enumerate(line_level_ocr_results):
             # Analyze each OCR result (line) individually
             analyzer_result = self.analyzer_engine.analyze(
-                text=ocr_result.text, **text_analyzer_kwargs
+                text=line_level_ocr_result.text, **text_analyzer_kwargs
             )
+
+            if i < len(ocr_results_with_children): # Check if i is a valid index
+                child_level_key = list(ocr_results_with_children.keys())[i]
+            else:
+                continue
+
+            ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
+
+            # Go through results to add bounding boxes
             for result in analyzer_result:
                 # Extract the relevant portion of text based on start and end
-                relevant_text = ocr_result.text[result.start:result.end]
+                relevant_text = line_level_ocr_result.text[result.start:result.end]

                 # Find the corresponding entry in ocr_results_with_children
-                child_info = ocr_results_with_children.get(ocr_result.text)
-                if child_info:
-                    # Calculate left and width based on child words
-                    #print("Found in ocr_results_with_children")
-                    child_words = child_info['words']
-                    start_word = child_words[0]
-                    end_word = child_words[-1]
-                    left = start_word['bounding_box'][0]
-                    width = end_word['bounding_box'][2] - left
-
-                    relevant_ocr_result = OCRResult(
-                        text=relevant_text,
-                        left=left,
-                        top=ocr_result.top,
-                        width=width,
-                        height=ocr_result.height
-                    )
-                else:
+                child_words = ocr_results_with_children_line_level['words']
+
+                # Initialize bounding box values
+                left, top, bottom = float('inf'), float('inf'), float('-inf')
+                all_words = ""
+                word_num = 0 # Initialize word count
+                total_width = 0 # Initialize total width
+
+                for word_text in relevant_text.split(): # Iterate through each word in relevant_text
+                    print("Looking for word_text:", word_text)
+                    for word in child_words:
+                        #if word['text'].strip(string.punctuation).strip() == word_text.strip(string.punctuation).strip(): # Check for exact match
+                        if word_text in word['text']:
+                            found_word = word
+                            print("found_word:", found_word)
+
+                            if word_num == 0: # First word
+                                left = found_word['bounding_box'][0]
+                                top = found_word['bounding_box'][1]
+                            bottom = max(bottom, found_word['bounding_box'][3]) # Update bottom for all words
+                            all_words += found_word['text'] + " " # Concatenate words
+                            total_width = found_word['bounding_box'][2] - left # Add each word's width
+                            word_num += 1
+                            break # Move to the next word in relevant_text
+
+                width = total_width + horizontal_buffer # Set width to total width of all matched words
+                height = bottom - top if word_num > 0 else 0 # Calculate height
+
+                relevant_line_ocr_result = OCRResult(
+                    text=relevant_text,
+                    left=left,
+                    top=top - height_buffer,
+                    width=width,
+                    height=height + height_buffer
+                )
+
+                if not ocr_results_with_children_line_level:
                     # Fallback to previous method if not found in ocr_results_with_children
-                    #print("Couldn't find result in ocr_results_with_children")
-                    relevant_ocr_result = OCRResult(
-                        text=relevant_text,
-                        left=ocr_result.left + self.estimate_x_offset(relevant_text, result.start),
-                        top=ocr_result.top,
-                        width=self.estimate_width(ocr_result=ocr_result, start=result.start, end=result.end),
-                        height=ocr_result.height
-                    )
+                    print("No child info found")
+                    continue

-                result_mod = result
-                result.start = 0
-                result.end = len(relevant_text)
+                # Reset the word positions indicated in the relevant ocr_result - i.e. it starts from 0 and ends at word length
+                result_reset_pos = result
+                result_reset_pos.start = 0
+                result_reset_pos.end = len(relevant_text)
+
+                print("result_reset_pos:", result_reset_pos)
+                print("relevant_line_ocr_result:", relevant_line_ocr_result)
+                #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)

                 # Map the analyzer results to bounding boxes for this line
                 line_results = self.map_analyzer_results_to_bounding_boxes(
-                    [result_mod], [relevant_ocr_result], ocr_result.text, allow_list, ocr_results_with_children
+                    [result_reset_pos], [relevant_line_ocr_result], relevant_line_ocr_result.text, allow_list, ocr_results_with_children_line_level
                 )
+
+                print("line_results:", line_results)

                 combined_results.extend(line_results)

@@ -526,98 +553,64 @@ class CustomImageAnalyzerEngine:
     @staticmethod
     def map_analyzer_results_to_bounding_boxes(
         text_analyzer_results: List[RecognizerResult],
-        ocr_results: List[OCRResult],
+        redaction_relevant_ocr_results: List[OCRResult],
         full_text: str,
         allow_list: List[str],
-        ocr_results_with_children: Dict[str, Dict]
+        ocr_results_with_children_child_info: Dict[str, Dict]
     ) -> List[CustomImageRecognizerResult]:
-        pii_bboxes = []
+        redaction_bboxes = []
        text_position = 0

-        for ocr_result in ocr_results:
-            word_end = text_position + len(ocr_result.text)
+        for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
+            word_end = text_position + len(redaction_relevant_ocr_result.text)

-            #print("Checking relevant OCR result:", ocr_result)
+            #print("Checking relevant OCR result:", redaction_relevant_ocr_result)

-            for result in text_analyzer_results:
-                max_of_current_text_pos_or_result_start_pos = max(text_position, result.start)
-                min_of_result_end_pos_or_results_end = min(word_end, result.end)
+            for redaction_result in text_analyzer_results:
+                max_of_current_text_pos_or_result_start_pos = max(text_position, redaction_result.start)
+                min_of_result_end_pos_or_results_end = min(word_end, redaction_result.end)

-                #print("max_of_current_text_pos_or_result_start_pos", str(max_of_current_text_pos_or_result_start_pos))
-                #print("min_of_result_end_pos_or_results_end", str(min_of_result_end_pos_or_results_end))
+                redaction_result_bounding_box = (redaction_relevant_ocr_result.left, redaction_relevant_ocr_result.top,
+                                                 redaction_relevant_ocr_result.left + redaction_relevant_ocr_result.width,
+                                                 redaction_relevant_ocr_result.top + redaction_relevant_ocr_result.height)

-                if (max_of_current_text_pos_or_result_start_pos < min_of_result_end_pos_or_results_end) and (ocr_result.text not in allow_list):
-                    print("result", result, "made it through if statement")
+                if (max_of_current_text_pos_or_result_start_pos < min_of_result_end_pos_or_results_end) and (redaction_relevant_ocr_result.text not in allow_list):
+                    #print("result", redaction_result, "made it through if statement")
+                    # Find the corresponding entry in ocr_results_with_children that overlap with the redaction result
+                    child_info = ocr_results_with_children_child_info#.get(full_text)

-                    # Find the corresponding entry in ocr_results_with_children
-                    child_info = ocr_results_with_children.get(full_text)
-                    if child_info:
+                    #print("child_info in sub function:", child_info)
+                    #print("redaction_result_bounding_box:", redaction_result_bounding_box)
+                    print("Overlaps?", bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']))
+
+                    if bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']):
                         # Use the bounding box from ocr_results_with_children
-                        bbox = child_info['bounding_box']
+                        bbox = redaction_result_bounding_box #child_info['bounding_box']
                         left, top, right, bottom = bbox
                         width = right - left
                         height = bottom - top
+
                     else:
-                        # Fallback to ocr_result if not found
-                        left = ocr_result.left
-                        top = ocr_result.top
-                        width = ocr_result.width
-                        height = ocr_result.height
+                        print("Could not find OCR result")
+                        continue

-                    pii_bboxes.append(
+                    redaction_bboxes.append(
                         CustomImageRecognizerResult(
-                            entity_type=result.entity_type,
-                            start=result.start,
-                            end=result.end,
-                            score=result.score,
+                            entity_type=redaction_result.entity_type,
+                            start=redaction_result.start,
+                            end=redaction_result.end,
+                            score=redaction_result.score,
                             left=left,
                             top=top,
                             width=width,
                             height=height,
-                            text=ocr_result.text
+                            text=redaction_relevant_ocr_result.text
                         )
                     )

             text_position = word_end + 1 # +1 for the space between words

-        return pii_bboxes
-
-    # @staticmethod
-    # def map_analyzer_results_to_bounding_boxes(
-    #     text_analyzer_results: List[RecognizerResult],
-    #     ocr_results: List[OCRResult],
-    #     full_text: str,
-    #     allow_list: List[str],
-    # ) -> List[CustomImageRecognizerResult]:
-    #     pii_bboxes = []
-    #     text_position = 0
-
-    #     for ocr_result in ocr_results:
-    #         word_end = text_position + len(ocr_result.text)
-
-    #         print("Checking relevant OCR result:", ocr_result)
-
-    #         for result in text_analyzer_results:
-    #             if (max(text_position, result.start) < min(word_end, result.end)) and (ocr_result.text not in allow_list):
-    #                 print("result", result, "made it through if statement")
-
-    #                 pii_bboxes.append(
-    #                     CustomImageRecognizerResult(
-    #                         entity_type=result.entity_type,
-    #                         start=result.start,
-    #                         end=result.end,
-    #                         score=result.score,
-    #                         left=ocr_result.left,
-    #                         top=ocr_result.top,
-    #                         width=ocr_result.width,
-    #                         height=ocr_result.height,
-    #                         text=ocr_result.text
-    #                     )
-    #                 )
-
-    #             text_position = word_end + 1
-
-    #     return pii_bboxes
+        return redaction_bboxes

     @staticmethod
     def remove_space_boxes(ocr_result: dict) -> dict:

@@ -789,6 +782,21 @@ def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
     current_bbox = None
     line_counter = 1

+    def create_ocr_result_with_children(combined_results, i, current_bbox, current_line):
+        combined_results["text_line_" + str(i)] = {
+            "line": i,
+            'text': current_bbox.text,
+            'bounding_box': (current_bbox.left, current_bbox.top,
+                             current_bbox.left + current_bbox.width,
+                             current_bbox.top + current_bbox.height),
+            'words': [{'text': word.text,
+                       'bounding_box': (word.left, word.top,
+                                        word.left + word.width,
+                                        word.top + word.height)}
+                      for word in current_line]
+        }
+        return combined_results["text_line_" + str(i)]
+
     for result in sorted_results:
         if not current_line:
             # Start a new line

@@ -797,6 +805,7 @@ def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
         else:
             # Check if the result is on the same line (y-axis) and close horizontally (x-axis)
             last_result = current_line[-1]
+
             if abs(result.top - last_result.top) <= y_threshold and \
                (result.left - (last_result.left + last_result.width)) <= x_threshold:
                 # Update the bounding box to include the new word

@@ -810,18 +819,22 @@ def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
                 )
                 current_line.append(result)
             else:
                 # Commit the current line and start a new one
                 combined_results.append(current_bbox)
-                new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
-                    'bounding_box': (current_bbox.left, current_bbox.top,
-                                     current_bbox.left + current_bbox.width,
-                                     current_bbox.top + current_bbox.height),
-                    'words': [{'text': word.text,
-                               'bounding_box': (word.left, word.top,
-                                                word.left + word.width,
-                                                word.top + word.height)}
-                              for word in current_line]
-                }
+                # new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
+                #     'bounding_box': (current_bbox.left, current_bbox.top,
+                #                      current_bbox.left + current_bbox.width,
+                #                      current_bbox.top + current_bbox.height),
+                #     'words': [{'text': word.text,
+                #                'bounding_box': (word.left, word.top,
+                #                                 word.left + word.width,
+                #                                 word.top + word.height)}
+                #               for word in current_line]
+                # }
+                new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
+
                 line_counter += 1
                 current_line = [result]
                 current_bbox = result

@@ -829,16 +842,19 @@ def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
     # Append the last line
     if current_bbox:
         combined_results.append(current_bbox)
-        new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
-            'bounding_box': (current_bbox.left, current_bbox.top,
-                             current_bbox.left + current_bbox.width,
-                             current_bbox.top + current_bbox.height),
-            'words': [{'text': word.text,
-                       'bounding_box': (word.left, word.top,
-                                        word.left + word.width,
-                                        word.top + word.height)}
-                      for word in current_line]
-        }
+        # new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
+        #     'bounding_box': (current_bbox.left, current_bbox.top,
+        #                      current_bbox.left + current_bbox.width,
+        #                      current_bbox.top + current_bbox.height),
+        #     'words': [{'text': word.text,
+        #                'bounding_box': (word.left, word.top,
+        #                                 word.left + word.width,
+        #                                 word.top + word.height)}
+        #               for word in current_line]
+        # }
+
+        new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)

     return combined_results, new_format_results
 
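The new module-level bounding_boxes_overlap helper is a standard axis-aligned intersection test on (left, top, right, bottom) tuples; a quick check of its behaviour:

def bounding_boxes_overlap(box1, box2):
    """Check if two bounding boxes overlap."""
    return (box1[0] < box2[2] and box2[0] < box1[2] and
            box1[1] < box2[3] and box2[1] < box1[3])

print(bounding_boxes_overlap((0, 0, 10, 10), (5, 5, 15, 15)))   # True: the boxes intersect
print(bounding_boxes_overlap((0, 0, 10, 10), (10, 0, 20, 10)))  # False: strict < means shared edges don't count
print(bounding_boxes_overlap((0, 0, 10, 10), (3, 12, 8, 20)))   # False: no vertical overlap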
tools/file_conversion.py CHANGED
@@ -219,7 +219,7 @@ def prepare_image_or_pdf(
         print(out_message)
         return out_message, out_file_paths

-    if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
+    if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
         # Analyse and redact image-based pdf or image
         if is_pdf_or_image(file_path) == False:
             out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
tools/file_redaction.py CHANGED
@@ -9,7 +9,7 @@ import pandas as pd

 #from presidio_image_redactor.entities import ImageRecognizerResult
 from pdfminer.high_level import extract_pages
-from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal #, LTAnno
+from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
 from pikepdf import Pdf, Dictionary, Name
 import gradio as gr
 from gradio import Progress

@@ -88,8 +88,11 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         print("In allow list:", in_allow_list_flat)
     else:
         in_allow_list_flat = []
+
+    progress(0.5, desc="Redacting file")

-    for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
+    for file in file_paths_loop:
+    #for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
         file_path = file.name

         if file_path:

@@ -104,14 +107,14 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             print(out_message)
             return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

-        if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - AWS Textract, handwriting/signatures":
+        if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
             #Analyse and redact image-based pdf or image
             if is_pdf_or_image(file_path) == False:
                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
                 return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

             print("Redacting file " + file_path_without_ext + " as an image-based file")
-            pdf_images, output_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
+            pdf_images, redaction_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)

             # Save file
             out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"

@@ -124,10 +127,14 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             out_message.append("File '" + file_path_without_ext + "' successfully redacted")

             # Save decision making process
-            output_logs_str = str(output_logs)
-            logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
-            with open(logs_output_file_name, "w") as f:
-                f.write(output_logs_str)
+            # output_logs_str = str(output_logs)
+            # logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
+            # with open(logs_output_file_name, "w") as f:
+            #     f.write(output_logs_str)
+            # log_files_output_paths.append(logs_output_file_name)
+
+            logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
+            redaction_logs.to_csv(logs_output_file_name)
             log_files_output_paths.append(logs_output_file_name)

             # Save Textract request metadata (if exists)

@@ -147,7 +154,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag

             # Analyse text-based pdf
             print('Redacting file as text-based PDF')
-            pdf_text, output_logs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
+            pdf_text, decision_process_logs, page_text_outputs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
             out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
             pdf_text.save(out_text_file_path)

@@ -159,12 +166,19 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
             out_file_paths.extend(img_output_file_path)

-            output_logs_str = str(output_logs)
-            logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
-            with open(logs_output_file_name, "w") as f:
-                f.write(output_logs_str)
+            #decision_process_logs_str = str(decision_process_logs)
+            #logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
+            #with open(logs_output_file_name, "w") as f:
+            #    f.write(output_logs_str)
+
+            logs_output_file_name = img_output_file_path[0] + "_decision_process_output.csv"
+            decision_process_logs.to_csv(logs_output_file_name)
             log_files_output_paths.append(logs_output_file_name)

+            all_text_output_file_name = img_output_file_path[0] + "_all_text_output.csv"
+            page_text_outputs.to_csv(all_text_output_file_name)
+            log_files_output_paths.append(all_text_output_file_name)
+
             out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
             out_message.append(out_message_new)

@@ -205,7 +219,7 @@ def bounding_boxes_overlap(box1, box2):
     return (box1[0] < box2[2] and box2[0] < box1[2] and
             box1[1] < box2[3] and box2[1] < box1[3])

-def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold=150, vertical_threshold=25):
+def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
     merged_bboxes = []
     grouped_bboxes = defaultdict(list)

@@ -348,9 +362,11 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_

     all_ocr_results = []
     all_decision_process = []
+    all_line_level_ocr_results_df = pd.DataFrame()
+    all_decision_process_table = pd.DataFrame()

-    if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
-    elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
+    if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
+    elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"

     for n in range(0, number_of_pages):
         handwriting_or_signature_boxes = []

@@ -395,21 +411,21 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
         # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
         if analysis_type == "Quick image analysis - typed text":

-            ocr_results = image_analyser.perform_ocr(image)
+            word_level_ocr_results = image_analyser.perform_ocr(image)

             # Combine OCR results
-            ocr_results, ocr_results_with_children = combine_ocr_results(ocr_results)
+            line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)

             #print("ocr_results after:", ocr_results)

-            # Save decision making process
-            ocr_results_with_children_str = str(ocr_results_with_children)
+            # Save ocr_with_children_outputs
+            ocr_results_with_children_str = str(line_level_ocr_results_with_children)
             logs_output_file_name = output_folder + "ocr_with_children.txt"
             with open(logs_output_file_name, "w") as f:
                 f.write(ocr_results_with_children_str)

         # Import results from json and convert
-        if analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures":
+        if analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":

             # Convert the image to bytes using an in-memory buffer
             image_buffer = io.BytesIO()

@@ -429,18 +445,18 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
                 text_blocks = json.load(json_file)
                 text_blocks = text_blocks['Blocks']

-            ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)
+            line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)

-            # Save decision making process
-            ocr_results_with_children_str = str(ocr_results_with_children)
-            logs_output_file_name = output_folder + "ocr_with_children_textract.txt"
-            with open(logs_output_file_name, "w") as f:
-                f.write(ocr_results_with_children_str)
+            # Save ocr_with_children_output
+            # ocr_results_with_children_str = str(line_level_ocr_results_with_children)
+            # logs_output_file_name = output_folder + "ocr_with_children_textract.txt"
+            # with open(logs_output_file_name, "w") as f:
+            #     f.write(ocr_results_with_children_str)

         # Step 2: Analyze text and identify PII
-        bboxes = image_analyser.analyze_text(
-            ocr_results,
-            ocr_results_with_children,
+        redaction_bboxes = image_analyser.analyze_text(
+            line_level_ocr_results,
+            line_level_ocr_results_with_children,
             language=language,
             entities=chosen_redact_entities,
             allow_list=allow_list,

@@ -448,49 +464,80 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
         )

         if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
-        elif analysis_type == "Complex image analysis - AWS Textract, handwriting/signatures": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
+        elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"

         # Save decision making process
-        bboxes_str = str(bboxes)
+        bboxes_str = str(redaction_bboxes)
         with open(interim_results_file_path, "w") as f:
             f.write(bboxes_str)

         # Merge close bounding boxes
-        merged_bboxes = merge_img_bboxes(bboxes, ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
+        merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)

-        # Export the decision making process
-        if merged_bboxes:
-            for bbox in merged_bboxes:
-                print(f"Entity: {bbox.entity_type}, Text: {bbox.text}, Bbox: ({bbox.left}, {bbox.top}, {bbox.width}, {bbox.height})")
-
-            decision_process_output_str = "Page " + reported_page_number + ":\n" + str(merged_bboxes)
-            all_decision_process.append(decision_process_output_str)

         # 3. Draw the merged boxes
         draw = ImageDraw.Draw(image)

-        for box in merged_bboxes:
+        for box in merged_redaction_bboxes:
             x0 = box.left
             y0 = box.top
             x1 = x0 + box.width
             y1 = y0 + box.height
             draw.rectangle([x0, y0, x1, y1], fill=fill)

-        ocr_results_str = "Page:" + reported_page_number + "\n" + str(ocr_results)
-        all_ocr_results.append(ocr_results_str)
+        # Log OCR results
+
+        #line_level_ocr_results_str = "Page:" + reported_page_number + "\n" + str(line_level_ocr_results)
+        #all_ocr_results.append(line_level_ocr_results_str)
+
+        # Convert to DataFrame and add to ongoing logging table
+        line_level_ocr_results_df = pd.DataFrame([{
+            'page': reported_page_number,
+            'text': result.text,
+            'left': result.left,
+            'top': result.top,
+            'width': result.width,
+            'height': result.height
+        } for result in line_level_ocr_results])
+
+        all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, line_level_ocr_results_df])
+
+        # Convert decision process to table
+        # Export the decision making process
+        if merged_redaction_bboxes:
+            # for bbox in merged_redaction_bboxes:
+            #     print(f"Entity: {bbox.entity_type}, Text: {bbox.text}, Bbox: ({bbox.left}, {bbox.top}, {bbox.width}, {bbox.height})")
+
+            #decision_process_output_str = "Page " + reported_page_number + ":\n" + str(merged_redaction_bboxes)
+            #all_decision_process.append(decision_process_output_str)
+
+            decision_process_table = pd.DataFrame([{
+                'page': reported_page_number,
+                'entity_type': result.entity_type,
+                'start': result.start,
+                'end': result.end,
+                'score': result.score,
+                'left': result.left,
+                'top': result.top,
+                'width': result.width,
+                'height': result.height,
+                'text': result.text
+            } for result in merged_redaction_bboxes])
+
+            all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])

         images.append(image)

     # Write OCR results as a log file
-    ocr_results_out = "\n".join(all_ocr_results)
-    with open(ocr_results_file_path, "w") as f:
-        f.write(ocr_results_out)
-    logging_file_paths.append(ocr_results_file_path)
+    # line_level_ocr_results_out = "\n".join(all_ocr_results)
+    # with open(ocr_results_file_path, "w") as f:
+    #     f.write(line_level_ocr_results_out)

-    all_decision_process_str = "\n".join(all_decision_process)
+    all_line_level_ocr_results_df.to_csv(ocr_results_file_path)
+    logging_file_paths.append(ocr_results_file_path)

-    return images, all_decision_process_str, logging_file_paths, request_metadata
+    return images, all_decision_process_table, logging_file_paths, request_metadata

@@ -512,7 +559,56 @@ def analyze_text_container(text_container, language, chosen_redact_entities, sco
         return analyzer_results, characters
     return [], []

+def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> OCRResult:
+    '''
+    Create an OCRResult object based on a list of pdfminer LTChar objects.
+    '''
+
+    # Initialize variables
+    full_text = ""
+    overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
+    word_bboxes = []
+
+    # Iterate through the character objects
+    current_word = ""
+    current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # [x0, y0, x1, y1]
+
+    for char in char_objects:
+        if isinstance(char, LTAnno):
+            # Handle space separately by finalizing the word
+            full_text += char.get_text() # Adds space or newline
+            if current_word: # Only finalize if there is a current word
+                word_bboxes.append((current_word, current_word_bbox))
+            current_word = ""
+            current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
+            continue
+
+        # Concatenate text for LTChar
+        full_text += char.get_text()
+
+        # Update overall bounding box
+        x0, y0, x1, y1 = char.bbox
+        overall_bbox[0] = min(overall_bbox[0], x0) # x0
+        overall_bbox[1] = min(overall_bbox[1], y0) # y0
+        overall_bbox[2] = max(overall_bbox[2], x1) # x1
+        overall_bbox[3] = max(overall_bbox[3], y1) # y1
+
+        # Update current word
+        current_word += char.get_text()
+
+        # Update current word bounding box
+        current_word_bbox[0] = min(current_word_bbox[0], x0) # x0
+        current_word_bbox[1] = min(current_word_bbox[1], y0) # y0
+        current_word_bbox[2] = max(current_word_bbox[2], x1) # x1
+        current_word_bbox[3] = max(current_word_bbox[3], y1) # y1
+
+    # Finalize the last word if any
+    if current_word:
+        word_bboxes.append((current_word, current_word_bbox))
+
+    return OCRResult(full_text, overall_bbox[0], overall_bbox[1], overall_bbox[2], overall_bbox[3])
+
-def merge_bounding_boxes(analyzer_results, characters, combine_pixel_dist, vertical_padding=2):
     '''
     Merge identified bounding boxes containing PII that are very close to one another
     '''

@@ -520,15 +616,19 @@ (diff truncated here; only the deleted side of the remaining hunks is present)
     if len(analyzer_results) > 0 and len(characters) > 0:
         # Extract bounding box coordinates for sorting
         bounding_boxes = []
         for result in analyzer_results:
             char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
             if char_boxes:
                 # Calculate the bounding box that encompasses all characters
                 left = min(box[0] for box in char_boxes)
                 bottom = min(box[1] for box in char_boxes)
                 right = max(box[2] for box in char_boxes)
                 top = max(box[3] for box in char_boxes) + vertical_padding
-                bounding_boxes.append((bottom, left, result, [left, bottom, right, top])) # (y, x, result, bbox)

         # Sort the results by y-coordinate and then by x-coordinate
         bounding_boxes.sort()

@@ -537,22 +637,24 @@
     current_box = None
     current_y = None
     current_result = None

-    for y, x, result, char_box in bounding_boxes:
-        print(f"Considering result: {result}")
-        print(f"Character box: {char_box}")

         if current_y is None or current_box is None:
             current_box = char_box
             current_y = char_box[1]
             current_result = result
-            print(f"Starting new box: {current_box}")
         else:
             vertical_diff_bboxes = abs(char_box[1] - current_y)
             horizontal_diff_bboxes = abs(char_box[0] - current_box[2])

-            print(f"Comparing boxes: current_box={current_box}, char_box={char_box}")
-            print(f"Vertical diff: {vertical_diff_bboxes}, Horizontal diff: {horizontal_diff_bboxes}")

             if (
                 vertical_diff_bboxes <= 5

@@ -561,26 +663,30 @@
                 current_box[2] = char_box[2] # Extend the current box horizontally
                 current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
                 current_result.end = max(current_result.end, result.end) # Extend the text range
-                print(f"Extended current box: {current_box}")
             else:
                 merged_bounding_boxes.append(
-                    {"boundingBox": current_box, "result": current_result})
-                print(f"Appending merged box: {current_box}")

                 # Reset current_box and current_y after appending
                 current_box = char_box
                 current_y = char_box[1]
                 current_result = result
-                print(f"Starting new box: {current_box}")

         # After finishing with the current result, add the last box for this result
         if current_box:
-            merged_bounding_boxes.append({"boundingBox": current_box, "result": current_result})
-            print(f"Appending final box for result: {current_box}")

         if not merged_bounding_boxes:
             analyzed_bounding_boxes.extend(
-                {"boundingBox": char.bbox, "result": result}
                 for result in analyzer_results
                 for char in characters[result.start:result.end]
                 if isinstance(char, LTChar)

@@ -588,7 +694,7 @@
     else:
         analyzed_bounding_boxes.extend(merged_bounding_boxes)

-    print("Analyzed bounding boxes:\n\n", analyzed_bounding_boxes)

     return analyzed_bounding_boxes

@@ -635,7 +741,7 @@
     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''
     annotations_all_pages = []
-    decision_process_table_all_pages = []

     combine_pixel_dist = 100 # Horizontal distance between PII bounding boxes under/equal they are combined into one

@@ -669,7 +775,7 @@
     annotations_on_page = []
-    decision_process_table_on_page = []

     for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):

@@ -678,25 +784,41 @@
         text_container_analyzer_results = []
         text_container_analyzed_bounding_boxes = []
         characters = []

         if analysis_type == "Simple text analysis - PDFs with selectable text":
             for i, text_container in enumerate(page_layout):

                 text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)

-                # Merge bounding boxes if very close together
-                print("\n\ntext_container_analyzer_results:", text_container_analyzer_results)
-                #print("\n\ncharacters:", characters)
-                text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist, vertical_padding = 2)
-                print("\n\ntext_container_analyzed_bounding_boxes:", text_container_analyzed_bounding_boxes)

-                page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
                 page_analyzer_results.extend(text_container_analyzer_results)

         decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)

@@ -705,12 +827,15 @@
         # Make page annotations
         page.Annots = pdf.make_indirect(annotations_on_page)
-
         annotations_all_pages.extend([annotations_on_page])
-        decision_process_table_all_pages.extend([decision_process_table_on_page])

         print("For page number:", page_no, "there are", len(annotations_all_pages[page_num]), "annotations")

         #page_num += 1

-    return pdf, decision_process_table_all_pages
 
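The logging change in choose_and_run_redactor and redact_image_pdf replaces stringified lists written to .txt with per-page pandas DataFrames concatenated into one table and written to .csv. A minimal sketch of the pattern, with a hypothetical stand-in for CustomImageRecognizerResult:

import pandas as pd
from dataclasses import dataclass

@dataclass
class Box:  # hypothetical stand-in for CustomImageRecognizerResult
    entity_type: str
    start: int
    end: int
    score: float
    left: int
    top: int
    width: int
    height: int
    text: str

all_decision_process_table = pd.DataFrame()
pages = {"1": [Box("EMAIL_ADDRESS", 0, 22, 1.0, 100, 100, 240, 15, "john.smith@example.com")]}
for page_number, merged_boxes in pages.items():
    decision_process_table = pd.DataFrame([{
        'page': page_number, 'entity_type': b.entity_type, 'start': b.start, 'end': b.end,
        'score': b.score, 'left': b.left, 'top': b.top, 'width': b.width, 'height': b.height,
        'text': b.text
    } for b in merged_boxes])
    all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])

all_decision_process_table.to_csv("decision_process_output.csv")  # one row per redaction decision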
9
 
10
  #from presidio_image_redactor.entities import ImageRecognizerResult
11
  from pdfminer.high_level import extract_pages
12
+ from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
13
  from pikepdf import Pdf, Dictionary, Name
14
  import gradio as gr
15
  from gradio import Progress
 
88
  print("In allow list:", in_allow_list_flat)
89
  else:
90
  in_allow_list_flat = []
91
+
92
+ progress(0.5, desc="Redacting file")
93
 
94
+ for file in file_paths_loop:
95
+ #for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
96
  file_path = file.name
97
 
98
  if file_path:
 
107
  print(out_message)
108
  return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
109
 
110
+ if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
111
  #Analyse and redact image-based pdf or image
112
  if is_pdf_or_image(file_path) == False:
113
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
114
  return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
115
 
116
  print("Redacting file " + file_path_without_ext + " as an image-based file")
117
+ pdf_images, redaction_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
118
 
119
  # Save file
120
  out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
 
127
  out_message.append("File '" + file_path_without_ext + "' successfully redacted")
128
 
129
  # Save decision making process
130
+ # output_logs_str = str(output_logs)
131
+ # logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
132
+ # with open(logs_output_file_name, "w") as f:
133
+ # f.write(output_logs_str)
134
+ # log_files_output_paths.append(logs_output_file_name)
135
+
136
+ logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
137
+ redaction_logs.to_csv(logs_output_file_name)
138
  log_files_output_paths.append(logs_output_file_name)
139
 
140
  # Save Textract request metadata (if exists)
 
154
 
155
  # Analyse text-based pdf
156
  print('Redacting file as text-based PDF')
157
+ pdf_text, decision_process_logs, page_text_outputs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
158
  out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
159
  pdf_text.save(out_text_file_path)
160
 
 
166
  img_output_summary, img_output_file_path = convert_text_pdf_to_img_pdf(file_path, [out_text_file_path])
167
  out_file_paths.extend(img_output_file_path)
168
 
169
+ #decision_process_logs_str = str(decision_process_logs)
170
+ #logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
171
+ #with open(logs_output_file_name, "w") as f:
172
+ # f.write(output_logs_str)
173
+
174
+ logs_output_file_name = img_output_file_path[0] + "_decision_process_output.csv"
175
+ decision_process_logs.to_csv(logs_output_file_name)
176
  log_files_output_paths.append(logs_output_file_name)
177
 
178
+ all_text_output_file_name = img_output_file_path[0] + "_all_text_output.csv"
179
+ page_text_outputs.to_csv(all_text_output_file_name)
180
+ log_files_output_paths.append(all_text_output_file_name)
181
+
182
  out_message_new = "File '" + file_path_without_ext + "' successfully redacted"
183
  out_message.append(out_message_new)
184
 
 
219
  return (box1[0] < box2[2] and box2[0] < box1[2] and
220
  box1[1] < box2[3] and box2[1] < box1[3])
221
 
222
+ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
223
  merged_bboxes = []
224
  grouped_bboxes = defaultdict(list)
225
 
 
362
 
363
  all_ocr_results = []
364
  all_decision_process = []
365
+ all_line_level_ocr_results_df = pd.DataFrame()
366
+ all_decision_process_table = pd.DataFrame()
367
 
368
+ if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
369
+ elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
370
 
371
  for n in range(0, number_of_pages):
372
  handwriting_or_signature_boxes = []
 
411
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
412
  if analysis_type == "Quick image analysis - typed text":
413
 
414
+ word_level_ocr_results = image_analyser.perform_ocr(image)
415
 
416
  # Combine OCR results
417
+ line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)
418
 
419
  #print("ocr_results after:", ocr_results)
420
 
421
+ # Save ocr_with_children_outputs
422
+ ocr_results_with_children_str = str(line_level_ocr_results_with_children)
423
  logs_output_file_name = output_folder + "ocr_with_children.txt"
424
  with open(logs_output_file_name, "w") as f:
425
  f.write(ocr_results_with_children_str)
426
 
427
  # Import results from json and convert
428
+ if analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
429
 
430
  # Convert the image to bytes using an in-memory buffer
431
  image_buffer = io.BytesIO()
 
445
  text_blocks = json.load(json_file)
446
  text_blocks = text_blocks['Blocks']
447
 
448
+ line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)
449
 
450
+ # Save ocr_with_children_output
451
+ # ocr_results_with_children_str = str(line_level_ocr_results_with_children)
452
+ # logs_output_file_name = output_folder + "ocr_with_children_textract.txt"
453
+ # with open(logs_output_file_name, "w") as f:
454
+ # f.write(ocr_results_with_children_str)
455
 
456
  # Step 2: Analyze text and identify PII
457
+ redaction_bboxes = image_analyser.analyze_text(
458
+ line_level_ocr_results,
459
+ line_level_ocr_results_with_children,
460
  language=language,
461
  entities=chosen_redact_entities,
462
  allow_list=allow_list,
 
464
  )
465
 
466
  if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
467
+ elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
468
 
469
  # Save decision making process
470
+ bboxes_str = str(redaction_bboxes)
471
  with open(interim_results_file_path, "w") as f:
472
  f.write(bboxes_str)
473
 
474
  # Merge close bounding boxes
475
+ merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
476
 
477
+
 
 
 
 
 
 
 
478
 
479
  # 3. Draw the merged boxes
480
  draw = ImageDraw.Draw(image)
481
 
482
+ for box in merged_redaction_bboxes:
483
  x0 = box.left
484
  y0 = box.top
485
  x1 = x0 + box.width
486
  y1 = y0 + box.height
487
  draw.rectangle([x0, y0, x1, y1], fill=fill)
488
 
489
+ # Log OCR results
490
+
491
+ #line_level_ocr_results_str = "Page:" + reported_page_number + "\n" + str(line_level_ocr_results)
492
+ #all_ocr_results.append(line_level_ocr_results_str)
493
+
494
+ # Convert to DataFrame and add to ongoing logging table
495
+ line_level_ocr_results_df = pd.DataFrame([{
496
+ 'page': reported_page_number,
497
+ 'text': result.text,
498
+ 'left': result.left,
499
+ 'top': result.top,
500
+ 'width': result.width,
501
+ 'height': result.height
502
+ } for result in line_level_ocr_results])
503
+
504
+ all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, line_level_ocr_results_df])

+        # Convert decision process to table
+        # Export the decision making process
+        if merged_redaction_bboxes:
+            # for bbox in merged_redaction_bboxes:
+            #     print(f"Entity: {bbox.entity_type}, Text: {bbox.text}, Bbox: ({bbox.left}, {bbox.top}, {bbox.width}, {bbox.height})")
+
+            #decision_process_output_str = "Page " + reported_page_number + ":\n" + str(merged_redaction_bboxes)
+            #all_decision_process.append(decision_process_output_str)
+
+            decision_process_table = pd.DataFrame([{
+                'page': reported_page_number,
+                'entity_type': result.entity_type,
+                'start': result.start,
+                'end': result.end,
+                'score': result.score,
+                'left': result.left,
+                'top': result.top,
+                'width': result.width,
+                'height': result.height,
+                'text': result.text
+            } for result in merged_redaction_bboxes])
+
+            all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])

         images.append(image)

     # Write OCR results as a log file
+    # line_level_ocr_results_out = "\n".join(all_ocr_results)
+    # with open(ocr_results_file_path, "w") as f:
+    #     f.write(line_level_ocr_results_out)

+    all_line_level_ocr_results_df.to_csv(ocr_results_file_path)
+    logging_file_paths.append(ocr_results_file_path)

+    return images, all_decision_process_table, logging_file_paths, request_metadata

 def analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list):
     if isinstance(text_container, LTTextContainer):

         return analyzer_results, characters
     return [], []

+def create_text_bounding_boxes_from_characters(char_objects: List[LTChar]) -> OCRResult:
+    '''
+    Create an OCRResult object based on a list of pdfminer LTChar objects.
+    '''
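+    # Note: pdfminer bounding boxes are (x0, y0, x1, y1) with the origin at the bottom-left of the page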
+
+    # Initialize variables
+    full_text = ""
+    overall_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]  # [x0, y0, x1, y1]
+    word_bboxes = []
+
+    # Iterate through the character objects
+    current_word = ""
+    current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]  # [x0, y0, x1, y1]
+
+    for char in char_objects:
+        if isinstance(char, LTAnno):
+            # Handle space separately by finalizing the word
+            full_text += char.get_text()  # Adds space or newline
+            if current_word:  # Only finalize if there is a current word
+                word_bboxes.append((current_word, current_word_bbox))
+                current_word = ""
+                current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')]  # Reset for next word
+            continue
+
+        # Concatenate text for LTChar
+        full_text += char.get_text()
+
+        # Update overall bounding box
+        x0, y0, x1, y1 = char.bbox
+        overall_bbox[0] = min(overall_bbox[0], x0)  # x0
+        overall_bbox[1] = min(overall_bbox[1], y0)  # y0
+        overall_bbox[2] = max(overall_bbox[2], x1)  # x1
+        overall_bbox[3] = max(overall_bbox[3], y1)  # y1
+
+        # Update current word
+        current_word += char.get_text()
+
+        # Update current word bounding box
+        current_word_bbox[0] = min(current_word_bbox[0], x0)  # x0
+        current_word_bbox[1] = min(current_word_bbox[1], y0)  # y0
+        current_word_bbox[2] = max(current_word_bbox[2], x1)  # x1
+        current_word_bbox[3] = max(current_word_bbox[3], y1)  # y1
+
+    # Finalize the last word if any
+    if current_word:
+        word_bboxes.append((current_word, current_word_bbox))
+
+    # Pass width/height rather than raw x1/y1 so the fields match OCRResult's (left, top, width, height)
+    return OCRResult(full_text, overall_bbox[0], overall_bbox[1], overall_bbox[2] - overall_bbox[0], overall_bbox[3] - overall_bbox[1])
+
+def merge_text_bounding_boxes(analyzer_results: CustomImageRecognizerResult, characters: List[LTChar], combine_pixel_dist: int, vertical_padding: int = 2):
     '''
     Merge identified bounding boxes containing PII that are very close to one another
     '''
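+    # Results on roughly the same line (vertical gap of 5 px or less) and within combine_pixel_dist
+    # horizontally are merged into one box, with their texts joined by spaces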
 
     if len(analyzer_results) > 0 and len(characters) > 0:
         # Extract bounding box coordinates for sorting
         bounding_boxes = []
+        text_out = []
         for result in analyzer_results:
             char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
+            char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
             if char_boxes:
                 # Calculate the bounding box that encompasses all characters
                 left = min(box[0] for box in char_boxes)
                 bottom = min(box[1] for box in char_boxes)
                 right = max(box[2] for box in char_boxes)
                 top = max(box[3] for box in char_boxes) + vertical_padding
+                bounding_boxes.append((bottom, left, result, [left, bottom, right, top], char_text))  # (y, x, result, bbox, text)

         # Sort the results by y-coordinate and then by x-coordinate
         bounding_boxes.sort()
 
         current_box = None
         current_y = None
         current_result = None
+        current_text = []

+        for y, x, result, char_box, text in bounding_boxes:
+            #print(f"Considering result: {result}")
+            #print(f"Character box: {char_box}")

             if current_y is None or current_box is None:
                 current_box = char_box
                 current_y = char_box[1]
                 current_result = result
+                current_text = list(text)
+                #print(f"Starting new box: {current_box}")
             else:
                 vertical_diff_bboxes = abs(char_box[1] - current_y)
                 horizontal_diff_bboxes = abs(char_box[0] - current_box[2])

+                #print(f"Comparing boxes: current_box={current_box}, char_box={char_box}")
+                #print(f"Vertical diff: {vertical_diff_bboxes}, Horizontal diff: {horizontal_diff_bboxes}")

                 if (
                     vertical_diff_bboxes <= 5

                     current_box[2] = char_box[2]  # Extend the current box horizontally
                     current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
                     current_result.end = max(current_result.end, result.end)  # Extend the text range
+                    # Add a space if current_text is not empty
+                    if current_text:
+                        current_text.append(" ")  # Add space between texts
+                    current_text.extend(text)
                 else:
                     merged_bounding_boxes.append(
+                        {"text": "".join(current_text), "boundingBox": current_box, "result": current_result})
+                    #print(f"Appending merged box: {current_box}")

                     # Reset current_box and current_y after appending
                     current_box = char_box
                     current_y = char_box[1]
                     current_result = result
+                    current_text = list(text)
+                    #print(f"Starting new box: {current_box}")

         # After finishing with the current result, add the last box for this result
         if current_box:
+            merged_bounding_boxes.append({"text": "".join(current_text), "boundingBox": current_box, "result": current_result})
+            #print(f"Appending final box for result: {current_box}")

         if not merged_bounding_boxes:
             analyzed_bounding_boxes.extend(
+                {"text": char.get_text(), "boundingBox": char.bbox, "result": result}
                 for result in analyzer_results
                 for char in characters[result.start:result.end]
                 if isinstance(char, LTChar)

         else:
             analyzed_bounding_boxes.extend(merged_bounding_boxes)

+        #print("Analyzed bounding boxes:\n\n", analyzed_bounding_boxes)

     return analyzed_bounding_boxes

     Redact chosen entities from a pdf that is made up of multiple pages that are not images.
     '''
     annotations_all_pages = []
+    decision_process_table_all_pages = pd.DataFrame()

     combine_pixel_dist = 100  # PII bounding boxes whose horizontal separation is at or under this many pixels are combined into one
 
 
         annotations_on_page = []
+        decision_process_table_on_page = pd.DataFrame()

         for page_layout in extract_pages(filename, page_numbers=[page_no], maxpages=1):

             text_container_analyzer_results = []
             text_container_analyzed_bounding_boxes = []
             characters = []
+            page_text_outputs = pd.DataFrame()

             if analysis_type == "Simple text analysis - PDFs with selectable text":
                 for i, text_container in enumerate(page_layout):

                     text_container_analyzer_results, characters = analyze_text_container(text_container, language, chosen_redact_entities, score_threshold, allow_list)

+                    # Create dataframe for all the text on the page
+                    line_level_text_results = create_text_bounding_boxes_from_characters(characters)
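+                    # Collapses the container's characters into one line-level OCRResult for the text log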

+                    if line_level_text_results.text:
+                        line_level_text_results_list = [line_level_text_results]

+                        # Convert to DataFrame and add to ongoing logging table
+                        line_level_text_results_df = pd.DataFrame([{
+                            'page': page_no + 1,
+                            'text': result.text,
+                            'left': result.left,
+                            'top': result.top,
+                            'width': result.width,
+                            'height': result.height
+                        } for result in line_level_text_results_list])

+                        page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])

+                    # Merge bounding boxes if very close together
+                    text_container_analyzed_bounding_boxes = merge_text_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist, vertical_padding=2)

                     page_analyzer_results.extend(text_container_analyzer_results)
+                    page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)

+            print("page_analyzer_results:", page_analyzer_results)
+            print("page_analyzed_bounding_boxes:", page_analyzed_bounding_boxes)

             decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)

         # Make page annotations
         page.Annots = pdf.make_indirect(annotations_on_page)
         annotations_all_pages.extend([annotations_on_page])

+        decision_process_table_all_pages = pd.concat([decision_process_table_all_pages, decision_process_table_on_page])

+        page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
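+        # pdfminer's y-axis increases up the page, so sorting 'top' descending lists lines top-to-bottom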
+        #page_text_outputs.to_csv("text_page_text_outputs.csv")

         print("For page number:", page_no, "there are", len(annotations_all_pages[page_num]), "annotations")

         #page_num += 1

+    return pdf, decision_process_table_all_pages, page_text_outputs