seanpedrickcase committed on
Commit
0d3554e
·
1 Parent(s): 11770c9

Fix bug so that all handwriting labels are identified. The entity_type labels of merged boxes are now concatenated only when the labels differ.

Browse files
doc_redaction_amplify_app ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 9585642e4d1f72fc49971789693d5584661084c8
tools/aws_textract.py CHANGED
@@ -145,8 +145,9 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
145
 
146
  # Extract text and bounding box for the line
147
  line_text = text_block.get('Text', '')
148
-
149
  words = []
 
 
150
  if 'Relationships' in text_block:
151
  for relationship in text_block['Relationships']:
152
  if relationship['Type'] == 'CHILD':
@@ -179,35 +180,56 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
179
  if text_type == "HANDWRITING":
180
  is_handwriting = True
181
  entity_name = "HANDWRITING"
182
- word_end = len(entity_name)
183
-
184
- recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= word_text, score= confidence, start=0, end=word_end, left=word_left, top=word_top, width=word_width_abs, height=word_height_abs)
185
-
186
- if recogniser_result not in handwriting:
187
- handwriting.append(recogniser_result)
188
- #print("Handwriting found:", handwriting[-1])
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  # If handwriting or signature, add to bounding box
191
 
192
  elif (text_block['BlockType'] == 'SIGNATURE'):
193
  line_text = "SIGNATURE"
194
-
195
  is_signature = True
196
  entity_name = "SIGNATURE"
197
- confidence = text_block['Confidence']
198
- word_end = len(entity_name)
199
-
200
- recogniser_result = CustomImageRecognizerResult(entity_type=entity_name, text= line_text, score= confidence, start=0, end=word_end, left=line_left, top=line_top, width=width_abs, height=height_abs)
201
-
202
- if recogniser_result not in signatures:
203
- signatures.append(recogniser_result)
204
- #print("Signature found:", signatures[-1])
205
-
206
- words = []
207
- words.append({
208
- 'text': line_text,
209
- 'bounding_box': (line_left, line_top, line_right, line_bottom)
210
- })
 
 
 
 
 
 
 
 
 
 
211
 
212
  ocr_results_with_children["text_line_" + str(i)] = {
213
  "line": i,
 
145
 
146
  # Extract text and bounding box for the line
147
  line_text = text_block.get('Text', '')
 
148
  words = []
149
+ current_line_handwriting_results = [] # Track handwriting results for this line
150
+
151
  if 'Relationships' in text_block:
152
  for relationship in text_block['Relationships']:
153
  if relationship['Type'] == 'CHILD':
 
180
  if text_type == "HANDWRITING":
181
  is_handwriting = True
182
  entity_name = "HANDWRITING"
183
+ word_end = len(word_text)
184
+
185
+ recogniser_result = CustomImageRecognizerResult(
186
+ entity_type=entity_name,
187
+ text=word_text,
188
+ score=confidence,
189
+ start=0,
190
+ end=word_end,
191
+ left=word_left,
192
+ top=word_top,
193
+ width=word_width_abs,
194
+ height=word_height_abs
195
+ )
196
+
197
+ # Add to handwriting collections immediately
198
+ handwriting.append(recogniser_result)
199
+ handwriting_recogniser_results.append(recogniser_result)
200
+ signature_or_handwriting_recogniser_results.append(recogniser_result)
201
+ current_line_handwriting_results.append(recogniser_result)
202
 
203
  # If handwriting or signature, add to bounding box
204
 
205
  elif (text_block['BlockType'] == 'SIGNATURE'):
206
  line_text = "SIGNATURE"
 
207
  is_signature = True
208
  entity_name = "SIGNATURE"
209
+ confidence = text_block.get('Confidence', 0)
210
+ word_end = len(line_text)
211
+
212
+ recogniser_result = CustomImageRecognizerResult(
213
+ entity_type=entity_name,
214
+ text=line_text,
215
+ score=confidence,
216
+ start=0,
217
+ end=word_end,
218
+ left=line_left,
219
+ top=line_top,
220
+ width=width_abs,
221
+ height=height_abs
222
+ )
223
+
224
+ # Add to signature collections immediately
225
+ signatures.append(recogniser_result)
226
+ signature_recogniser_results.append(recogniser_result)
227
+ signature_or_handwriting_recogniser_results.append(recogniser_result)
228
+
229
+ words = [{
230
+ 'text': line_text,
231
+ 'bounding_box': (line_left, line_top, line_right, line_bottom)
232
+ }]
233
 
234
  ocr_results_with_children["text_line_" + str(i)] = {
235
  "line": i,
tools/file_redaction.py CHANGED
@@ -832,7 +832,11 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
832
  for next_box in group[1:]:
833
  if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
834
  new_text = merged_box.text + " " + next_box.text
835
- new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
 
 
 
 
836
 
837
  new_left = min(merged_box.left, next_box.left)
838
  new_top = min(merged_box.top, next_box.top)
@@ -1442,7 +1446,10 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
1442
  merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
1443
  merged_result.end = max(current_result.end, result.end) # Extend text range
1444
  try:
1445
- merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
 
 
 
1446
  except Exception as e:
1447
  print("Unable to combine result entity types:", e)
1448
  if current_text:
 
832
  for next_box in group[1:]:
833
  if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
834
  new_text = merged_box.text + " " + next_box.text
835
+
836
+ if merged_box.entity_type != next_box.entity_type:
837
+ new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
838
+ else:
839
+ new_entity_type = merged_box.entity_type
840
 
841
  new_left = min(merged_box.left, next_box.left)
842
  new_top = min(merged_box.top, next_box.top)
 
1446
  merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
1447
  merged_result.end = max(current_result.end, result.end) # Extend text range
1448
  try:
1449
+ if current_result.entity_type != result.entity_type:
1450
+ merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
1451
+ else:
1452
+ merged_result.entity_type = current_result.entity_type
1453
  except Exception as e:
1454
  print("Unable to combine result entity types:", e)
1455
  if current_text: