Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

document_redaction / tools /custom_image_analyser_engine.py

seanpedrickcase

Added AWS Textract support. Allowed for OCR logs export.

e9c4101 5 months ago

raw

history blame

3.89 kB

	import pytesseract
	from PIL import Image
	import numpy as np
	from presidio_analyzer import AnalyzerEngine, RecognizerResult
	from typing import List, Dict, Optional, Union, Tuple
	from dataclasses import dataclass

	@dataclass
	class OCRResult:
	    """A piece of text recognised by OCR together with its bounding box.

	    Instances are produced by ``CustomImageAnalyzerEngine.perform_ocr`` from
	    the ``text``, ``left``, ``top``, ``width`` and ``height`` columns of the
	    OCR output.
	    """

	    # Recognised text, exactly as returned by the OCR engine
	    text: str
	    # Horizontal position of the box's left edge, as reported by the OCR engine
	    left: int
	    # Vertical position of the box's top edge, as reported by the OCR engine
	    top: int
	    # Box width, as reported by the OCR engine
	    width: int
	    # Box height, as reported by the OCR engine
	    height: int

	@dataclass
	class CustomImageRecognizerResult:
	    """A detected entity paired with the bounding box of one OCR word.

	    ``CustomImageAnalyzerEngine.map_analyzer_results_to_bounding_boxes``
	    creates one instance for every OCR word that overlaps an analyzer result.
	    """

	    # Entity type copied from the analyzer result
	    entity_type: str
	    # Start offset of the analyzer result within the combined OCR text
	    # (the span of the whole detected entity, not of this single word)
	    start: int
	    # End offset of the analyzer result within the combined OCR text
	    end: int
	    # Score copied from the analyzer result
	    score: float
	    # Bounding box of the overlapping OCR word
	    left: int
	    top: int
	    width: int
	    height: int
	    # Text of the overlapping OCR word
	    text: str

	class CustomImageAnalyzerEngine:
	    """Detect entities in an image by combining OCR with a text analyzer.

	    OCR output is joined into one space-separated string, that string is
	    passed to the analyzer engine, and every OCR word overlapping a detected
	    entity is returned together with its bounding box.
	    """

	    def __init__(
	        self,
	        analyzer_engine: Optional[AnalyzerEngine] = None,
	        tesseract_config: Optional[str] = None
	    ):
	        """Store the analyzer and the Tesseract configuration.

	        :param analyzer_engine: Analyzer used for the text; a default
	            ``AnalyzerEngine()`` is created when none is supplied.
	        :param tesseract_config: Config string handed to pytesseract;
	            falls back to ``'--oem 3 --psm 11'`` when empty or omitted.
	        """
	        if not analyzer_engine:
	            analyzer_engine = AnalyzerEngine()
	        self.analyzer_engine = analyzer_engine
	        self.tesseract_config = tesseract_config or '--oem 3 --psm 11'

	    def perform_ocr(self, image: Union[str, Image.Image, np.ndarray]) -> List[OCRResult]:
	        """Run OCR on ``image`` and return the usable text entries.

	        :param image: A file path, a PIL image, or a numpy array.
	        :return: One ``OCRResult`` per OCR entry whose text is non-blank and
	            whose confidence is greater than zero.
	        """
	        # Ensure image is a PIL Image. Remember whether the file was opened
	        # here so that the handle can be released once OCR is done; images
	        # supplied by the caller are left untouched.
	        opened_here = isinstance(image, str)
	        if opened_here:
	            image = Image.open(image)
	        elif isinstance(image, np.ndarray):
	            image = Image.fromarray(image)

	        try:
	            ocr_data = pytesseract.image_to_data(
	                image,
	                output_type=pytesseract.Output.DICT,
	                config=self.tesseract_config,
	            )
	        finally:
	            if opened_here:
	                image.close()

	        results = []
	        # .get() so that a result dict without a 'text' column yields no
	        # results instead of raising KeyError
	        for i, text in enumerate(ocr_data.get('text', [])):
	            # Filter out empty strings before looking at the confidence
	            if not text.strip():
	                continue
	            # float() rather than int(): the confidence may arrive as a
	            # decimal string (e.g. '96.5'), which int() would reject
	            if float(ocr_data['conf'][i]) > 0:
	                results.append(
	                    OCRResult(
	                        text=text,
	                        left=ocr_data['left'][i],
	                        top=ocr_data['top'][i],
	                        width=ocr_data['width'][i],
	                        height=ocr_data['height'][i],
	                    )
	                )
	        return results

	    def analyze_text(
	        self,
	        ocr_results: List[OCRResult],
	        **text_analyzer_kwargs
	    ) -> List[CustomImageRecognizerResult]:
	        """Analyze the combined OCR text and map detections back to words.

	        :param ocr_results: OCR entries, in the order their text is joined.
	        :param text_analyzer_kwargs: Extra keyword arguments forwarded to
	            ``analyzer_engine.analyze``. ``language`` defaults to ``"en"``;
	            ``allow_list``, if given, is also used to drop matching words
	            from the returned boxes.
	        :return: One result per OCR word that overlaps a detected entity.
	        """
	        # Nothing was recognised, so there is nothing to analyze or map
	        if not ocr_results:
	            return []

	        # Combine all OCR text
	        full_text = ' '.join(result.text for result in ocr_results)

	        # Define English as default language, if not specified
	        text_analyzer_kwargs.setdefault("language", "en")

	        analyzer_result = self.analyzer_engine.analyze(
	            text=full_text, **text_analyzer_kwargs
	        )

	        # `or []` also covers an explicit allow_list=None, which would
	        # otherwise break the membership test in the mapping step
	        allow_list = text_analyzer_kwargs.get('allow_list') or []

	        return self.map_analyzer_results_to_bounding_boxes(
	            analyzer_result, ocr_results, full_text, allow_list
	        )

	    @staticmethod
	    def map_analyzer_results_to_bounding_boxes(
	        text_analyzer_results: List[RecognizerResult],
	        ocr_results: List[OCRResult],
	        full_text: str,
	        allow_list: List[str],
	    ) -> List[CustomImageRecognizerResult]:
	        """Attach a bounding box to every OCR word covered by a detection.

	        Word offsets are reconstructed on the assumption that the analyzed
	        text is the OCR words joined by single spaces, in order.

	        :param text_analyzer_results: Detections with ``start``/``end``
	            offsets into the combined text.
	        :param ocr_results: OCR words in the order they were joined.
	        :param full_text: The combined text. Not used by the mapping itself;
	            kept so existing callers continue to work.
	        :param allow_list: Words that must never be reported; ``None`` is
	            treated as an empty list.
	        :return: At most one result per OCR word, built from the first
	            detection that overlaps it.
	        """
	        allow_list = allow_list or []
	        pii_bboxes = []
	        text_position = 0

	        for ocr_result in ocr_results:
	            word_end = text_position + len(ocr_result.text)

	            # The allow-list check does not depend on the detection, so do
	            # it once per word instead of once per (word, detection) pair
	            if ocr_result.text not in allow_list:
	                for result in text_analyzer_results:
	                    # Half-open intervals [text_position, word_end) and
	                    # [start, end) overlap when this holds
	                    if max(text_position, result.start) < min(word_end, result.end):
	                        pii_bboxes.append(
	                            CustomImageRecognizerResult(
	                                entity_type=result.entity_type,
	                                start=result.start,
	                                end=result.end,
	                                score=result.score,
	                                left=ocr_result.left,
	                                top=ocr_result.top,
	                                width=ocr_result.width,
	                                height=ocr_result.height,
	                                text=ocr_result.text
	                            )
	                        )
	                        # Report each word only once
	                        break

	            text_position = word_end + 1  # +1 for the space between words

	        return pii_bboxes