Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Jan 13

Commit

0d3554e

1 Parent(s): 11770c9

Fix bug so that all handwriting labels are identified. When bounding boxes are merged, their entity_type labels are now concatenated only if the labels differ.

Browse files

Files changed (3) hide show

doc_redaction_amplify_app +1 -0
tools/aws_textract.py +45 -23
tools/file_redaction.py +9 -2

doc_redaction_amplify_app ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit 9585642e4d1f72fc49971789693d5584661084c8

tools/aws_textract.py CHANGED Viewed

@@ -145,8 +145,9 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
                 # Extract text and bounding box for the line
                 line_text = text_block.get('Text', '')
                 words = []
                 if 'Relationships' in text_block:
                     for relationship in text_block['Relationships']:
                         if relationship['Type'] == 'CHILD':
@@ -179,35 +180,56 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
                                     if text_type == "HANDWRITING":
                                         is_handwriting = True
                                         entity_name = "HANDWRITING"
-                                        word_end = len(entity_name)
-                                        recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= word_text, score= confidence, start=0, end=word_end, left=word_left, top=word_top, width=word_width_abs, height=word_height_abs)
-                                        if recogniser_result not in handwriting:
-                                            handwriting.append(recogniser_result)
-                                            #print("Handwriting found:", handwriting[-1])
             # If handwriting or signature, add to bounding box
             elif (text_block['BlockType'] == 'SIGNATURE'):
                 line_text = "SIGNATURE"
                 is_signature = True
                 entity_name = "SIGNATURE"
-                confidence = text_block['Confidence']
-                word_end = len(entity_name)
-                recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=line_left, top=line_top, width=width_abs, height=height_abs)
-                if recogniser_result not in signatures:
-                    signatures.append(recogniser_result)
-                    #print("Signature found:", signatures[-1])
-                words = []
-                words.append({
-                            'text': line_text,
-                            'bounding_box': (line_left, line_top, line_right, line_bottom)
-                        })
             ocr_results_with_children["text_line_" + str(i)] = {
                 "line": i,

                 # Extract text and bounding box for the line
                 line_text = text_block.get('Text', '')
                 words = []
+                current_line_handwriting_results = []  # Track handwriting results for this line
                 if 'Relationships' in text_block:
                     for relationship in text_block['Relationships']:
                         if relationship['Type'] == 'CHILD':
                                     if text_type == "HANDWRITING":
                                         is_handwriting = True
                                         entity_name = "HANDWRITING"
+                                        word_end = len(word_text)
+                                        recogniser_result = CustomImageRecognizerResult(
+                                            entity_type=entity_name,
+                                            text=word_text,
+                                            score=confidence,
+                                            start=0,
+                                            end=word_end,
+                                            left=word_left,
+                                            top=word_top,
+                                            width=word_width_abs,
+                                            height=word_height_abs
+                                        )
+                                        # Add to handwriting collections immediately
+                                        handwriting.append(recogniser_result)
+                                        handwriting_recogniser_results.append(recogniser_result)
+                                        signature_or_handwriting_recogniser_results.append(recogniser_result)
+                                        current_line_handwriting_results.append(recogniser_result)
             # If handwriting or signature, add to bounding box
             elif (text_block['BlockType'] == 'SIGNATURE'):
                 line_text = "SIGNATURE"
                 is_signature = True
                 entity_name = "SIGNATURE"
+                confidence = text_block.get('Confidence', 0)
+                word_end = len(line_text)
+                recogniser_result = CustomImageRecognizerResult(
+                    entity_type=entity_name,
+                    text=line_text,
+                    score=confidence,
+                    start=0,
+                    end=word_end,
+                    left=line_left,
+                    top=line_top,
+                    width=width_abs,
+                    height=height_abs
+                )
+                # Add to signature collections immediately
+                signatures.append(recogniser_result)
+                signature_recogniser_results.append(recogniser_result)
+                signature_or_handwriting_recogniser_results.append(recogniser_result)
+                words = [{
+                    'text': line_text,
+                    'bounding_box': (line_left, line_top, line_right, line_bottom)
+                }]
             ocr_results_with_children["text_line_" + str(i)] = {
                 "line": i,

tools/file_redaction.py CHANGED Viewed

@@ -832,7 +832,11 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
         for next_box in group[1:]:
             if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
                 new_text = merged_box.text + " " + next_box.text
-                new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
                 new_left = min(merged_box.left, next_box.left)
                 new_top = min(merged_box.top, next_box.top)
@@ -1442,7 +1446,10 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
                     merged_box[3] = max(current_box[3], next_box[3])  # Adjust the top
                     merged_result.end = max(current_result.end, result.end)  # Extend text range
                     try:
-                        merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
                     except Exception as e:
                         print("Unable to combine result entity types:", e)
                     if current_text:

         for next_box in group[1:]:
             if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
                 new_text = merged_box.text + " " + next_box.text
+                if merged_box.entity_type != next_box.entity_type:
+                    new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
+                else:
+                    new_entity_type = merged_box.entity_type
                 new_left = min(merged_box.left, next_box.left)
                 new_top = min(merged_box.top, next_box.top)
                     merged_box[3] = max(current_box[3], next_box[3])  # Adjust the top
                     merged_result.end = max(current_result.end, result.end)  # Extend text range
                     try:
+                        if current_result.entity_type != result.entity_type:
+                            merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
+                        else:
+                            merged_result.entity_type = current_result.entity_type
                     except Exception as e:
                         print("Unable to combine result entity types:", e)
                     if current_text: