Commit
·
0d3554e
1
Parent(s):
11770c9
Fix a bug so that all handwriting labels are identified. When bounding boxes are merged, their entity_type labels are now concatenated only if the labels differ; otherwise the single shared label is kept.
Browse files
- doc_redaction_amplify_app +1 -0
- tools/aws_textract.py +45 -23
- tools/file_redaction.py +9 -2
doc_redaction_amplify_app
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Subproject commit 9585642e4d1f72fc49971789693d5584661084c8
|
tools/aws_textract.py
CHANGED
@@ -145,8 +145,9 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
|
|
145 |
|
146 |
# Extract text and bounding box for the line
|
147 |
line_text = text_block.get('Text', '')
|
148 |
-
|
149 |
words = []
|
|
|
|
|
150 |
if 'Relationships' in text_block:
|
151 |
for relationship in text_block['Relationships']:
|
152 |
if relationship['Type'] == 'CHILD':
|
@@ -179,35 +180,56 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
|
|
179 |
if text_type == "HANDWRITING":
|
180 |
is_handwriting = True
|
181 |
entity_name = "HANDWRITING"
|
182 |
-
word_end = len(
|
183 |
-
|
184 |
-
recogniser_result = CustomImageRecognizerResult(
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
# If handwriting or signature, add to bounding box
|
191 |
|
192 |
elif (text_block['BlockType'] == 'SIGNATURE'):
|
193 |
line_text = "SIGNATURE"
|
194 |
-
|
195 |
is_signature = True
|
196 |
entity_name = "SIGNATURE"
|
197 |
-
confidence = text_block
|
198 |
-
word_end = len(
|
199 |
-
|
200 |
-
recogniser_result = CustomImageRecognizerResult(
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
ocr_results_with_children["text_line_" + str(i)] = {
|
213 |
"line": i,
|
|
|
145 |
|
146 |
# Extract text and bounding box for the line
|
147 |
line_text = text_block.get('Text', '')
|
|
|
148 |
words = []
|
149 |
+
current_line_handwriting_results = [] # Track handwriting results for this line
|
150 |
+
|
151 |
if 'Relationships' in text_block:
|
152 |
for relationship in text_block['Relationships']:
|
153 |
if relationship['Type'] == 'CHILD':
|
|
|
180 |
if text_type == "HANDWRITING":
|
181 |
is_handwriting = True
|
182 |
entity_name = "HANDWRITING"
|
183 |
+
word_end = len(word_text)
|
184 |
+
|
185 |
+
recogniser_result = CustomImageRecognizerResult(
|
186 |
+
entity_type=entity_name,
|
187 |
+
text=word_text,
|
188 |
+
score=confidence,
|
189 |
+
start=0,
|
190 |
+
end=word_end,
|
191 |
+
left=word_left,
|
192 |
+
top=word_top,
|
193 |
+
width=word_width_abs,
|
194 |
+
height=word_height_abs
|
195 |
+
)
|
196 |
+
|
197 |
+
# Add to handwriting collections immediately
|
198 |
+
handwriting.append(recogniser_result)
|
199 |
+
handwriting_recogniser_results.append(recogniser_result)
|
200 |
+
signature_or_handwriting_recogniser_results.append(recogniser_result)
|
201 |
+
current_line_handwriting_results.append(recogniser_result)
|
202 |
|
203 |
# If handwriting or signature, add to bounding box
|
204 |
|
205 |
elif (text_block['BlockType'] == 'SIGNATURE'):
|
206 |
line_text = "SIGNATURE"
|
|
|
207 |
is_signature = True
|
208 |
entity_name = "SIGNATURE"
|
209 |
+
confidence = text_block.get('Confidence', 0)
|
210 |
+
word_end = len(line_text)
|
211 |
+
|
212 |
+
recogniser_result = CustomImageRecognizerResult(
|
213 |
+
entity_type=entity_name,
|
214 |
+
text=line_text,
|
215 |
+
score=confidence,
|
216 |
+
start=0,
|
217 |
+
end=word_end,
|
218 |
+
left=line_left,
|
219 |
+
top=line_top,
|
220 |
+
width=width_abs,
|
221 |
+
height=height_abs
|
222 |
+
)
|
223 |
+
|
224 |
+
# Add to signature collections immediately
|
225 |
+
signatures.append(recogniser_result)
|
226 |
+
signature_recogniser_results.append(recogniser_result)
|
227 |
+
signature_or_handwriting_recogniser_results.append(recogniser_result)
|
228 |
+
|
229 |
+
words = [{
|
230 |
+
'text': line_text,
|
231 |
+
'bounding_box': (line_left, line_top, line_right, line_bottom)
|
232 |
+
}]
|
233 |
|
234 |
ocr_results_with_children["text_line_" + str(i)] = {
|
235 |
"line": i,
|
tools/file_redaction.py
CHANGED
@@ -832,7 +832,11 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
832 |
for next_box in group[1:]:
|
833 |
if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
|
834 |
new_text = merged_box.text + " " + next_box.text
|
835 |
-
|
|
|
|
|
|
|
|
|
836 |
|
837 |
new_left = min(merged_box.left, next_box.left)
|
838 |
new_top = min(merged_box.top, next_box.top)
|
@@ -1442,7 +1446,10 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
|
|
1442 |
merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
|
1443 |
merged_result.end = max(current_result.end, result.end) # Extend text range
|
1444 |
try:
|
1445 |
-
|
|
|
|
|
|
|
1446 |
except Exception as e:
|
1447 |
print("Unable to combine result entity types:", e)
|
1448 |
if current_text:
|
|
|
832 |
for next_box in group[1:]:
|
833 |
if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
|
834 |
new_text = merged_box.text + " " + next_box.text
|
835 |
+
|
836 |
+
if merged_box.entity_type != next_box.entity_type:
|
837 |
+
new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
|
838 |
+
else:
|
839 |
+
new_entity_type = merged_box.entity_type
|
840 |
|
841 |
new_left = min(merged_box.left, next_box.left)
|
842 |
new_top = min(merged_box.top, next_box.top)
|
|
|
1446 |
merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
|
1447 |
merged_result.end = max(current_result.end, result.end) # Extend text range
|
1448 |
try:
|
1449 |
+
if current_result.entity_type != result.entity_type:
|
1450 |
+
merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
|
1451 |
+
else:
|
1452 |
+
merged_result.entity_type = current_result.entity_type
|
1453 |
except Exception as e:
|
1454 |
print("Unable to combine result entity types:", e)
|
1455 |
if current_text:
|