Spaces:

deprem-ml
/

deprem-ocr

Runtime error

App Files Files Community

deprem-ocr / ocr /utility.py

Goodsea

paddleocr

fc8c192 almost 2 years ago

raw

history blame

23.7 kB

	import argparse
	import math
	import os
	import platform

	import cv2
	import numpy as np
	import paddle
	from paddle import inference
	from PIL import Image, ImageDraw, ImageFont


	def str2bool(v):
	    """Interpret a command-line string as a boolean flag.

	    Args:
	        v: Text to interpret; the comparison is case-insensitive.

	    Returns:
	        True when ``v`` is "true", "t" or "1" (in any letter case),
	        False for every other string.
	    """
	    truthy = ("true", "t", "1")
	    normalized = v.lower()
	    return normalized in truthy


	def init_args():
	    """Build the command-line parser holding every OCR inference option.

	    Returns:
	        argparse.ArgumentParser: parser with the prediction-engine, detector,
	        recognizer, classifier, output and multi-process options registered.
	        Nothing is parsed here; call ``parse_args()`` on the result.
	    """

	    def _int_list(value):
	        # ``type=list`` would turn "8,16,32" into single characters
	        # (['8', ',', '1', ...]); parse a comma-separated list instead.
	        return [int(item) for item in value.strip("[]() ").split(",") if item.strip()]

	    def _str_list(value):
	        # Same reasoning as ``_int_list`` but the items stay strings.
	        return [item.strip() for item in value.strip("[]() ").split(",") if item.strip()]

	    parser = argparse.ArgumentParser()
	    # params for prediction engine
	    parser.add_argument("--use_gpu", type=str2bool, default=False)
	    parser.add_argument("--use_xpu", type=str2bool, default=False)
	    parser.add_argument("--ir_optim", type=str2bool, default=False)
	    parser.add_argument("--use_tensorrt", type=str2bool, default=False)
	    parser.add_argument("--min_subgraph_size", type=int, default=15)
	    parser.add_argument("--precision", type=str, default="fp32")
	    parser.add_argument("--gpu_mem", type=int, default=500)

	    # params for text detector
	    parser.add_argument("--image_dir", type=str)
	    parser.add_argument("--det_algorithm", type=str, default="DB")
	    parser.add_argument("--det_model_dir", type=str, default="./ocr/ch_PP-OCRv3_det_infer/")
	    parser.add_argument("--det_limit_side_len", type=float, default=960)
	    parser.add_argument("--det_limit_type", type=str, default="max")

	    # DB params
	    parser.add_argument("--det_db_thresh", type=float, default=0.1)
	    parser.add_argument("--det_db_box_thresh", type=float, default=0.1)
	    parser.add_argument("--det_db_unclip_ratio", type=float, default=1.7)
	    parser.add_argument("--max_batch_size", type=int, default=10)
	    parser.add_argument("--use_dilation", type=str2bool, default=True)
	    parser.add_argument("--det_db_score_mode", type=str, default="fast")

	    # EAST params
	    parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
	    parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)
	    parser.add_argument("--det_east_nms_thresh", type=float, default=0.2)

	    # SAST params
	    parser.add_argument("--det_sast_score_thresh", type=float, default=0.5)
	    parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2)
	    parser.add_argument("--det_sast_polygon", type=str2bool, default=False)

	    # PSE params
	    parser.add_argument("--det_pse_thresh", type=float, default=0)
	    parser.add_argument("--det_pse_box_thresh", type=float, default=0.85)
	    parser.add_argument("--det_pse_min_area", type=float, default=16)
	    parser.add_argument("--det_pse_box_type", type=str, default="quad")
	    parser.add_argument("--det_pse_scale", type=int, default=1)

	    # FCE params
	    parser.add_argument("--scales", type=_int_list, default=[8, 16, 32])
	    parser.add_argument("--alpha", type=float, default=1.0)
	    parser.add_argument("--beta", type=float, default=1.0)
	    parser.add_argument("--fourier_degree", type=int, default=5)
	    parser.add_argument("--det_fce_box_type", type=str, default="poly")

	    # params for text recognizer
	    parser.add_argument("--rec_algorithm", type=str, default="SVTR_LCNet")
	    parser.add_argument("--rec_model_dir", type=str, default="./ocr/ch_PP-OCRv3_rec_infer/")
	    parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320")
	    parser.add_argument("--rec_batch_num", type=int, default=6)
	    parser.add_argument("--max_text_length", type=int, default=25)
	    parser.add_argument(
	        "--rec_char_dict_path", type=str, default="./ocr/ppocr/ppocr_keys_v1.txt"
	    )
	    parser.add_argument("--use_space_char", type=str2bool, default=True)
	    parser.add_argument("--drop_score", type=float, default=0.5)

	    # params for text classifier
	    parser.add_argument("--use_angle_cls", type=str2bool, default=False)
	    parser.add_argument("--cls_model_dir", type=str)
	    parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192")
	    parser.add_argument("--label_list", type=_str_list, default=["0", "180"])
	    parser.add_argument("--cls_batch_num", type=int, default=6)
	    parser.add_argument("--cls_thresh", type=float, default=0.9)

	    parser.add_argument("--enable_mkldnn", type=str2bool, default=True)
	    parser.add_argument("--cpu_threads", type=int, default=10)
	    parser.add_argument("--use_pdserving", type=str2bool, default=False)
	    parser.add_argument("--warmup", type=str2bool, default=False)

	    # params for saving visualisations and cropped results
	    parser.add_argument("--draw_img_save_dir", type=str, default="./inference_results")
	    parser.add_argument("--save_crop_res", type=str2bool, default=False)
	    parser.add_argument("--crop_res_save_dir", type=str, default="./output")

	    # multi-process
	    parser.add_argument("--use_mp", type=str2bool, default=False)
	    parser.add_argument("--total_process_num", type=int, default=1)
	    parser.add_argument("--process_id", type=int, default=0)

	    parser.add_argument("--benchmark", type=str2bool, default=False)
	    parser.add_argument("--save_log_path", type=str, default="./log_output/")

	    parser.add_argument("--use_onnx", type=str2bool, default=False)
	    return parser


	def parse_args():
	    """Parse the process command line with the parser from ``init_args``.

	    Returns:
	        The namespace produced by ``ArgumentParser.parse_args()``.
	    """
	    return init_args().parse_args()


	def create_predictor(args, mode):
	    """Create the inference predictor for one OCR stage.

	    Args:
	        args: Parsed options (see ``init_args``). The model directory is read
	            from ``args.<mode>_model_dir``.
	        mode: Stage name such as "det", "rec" or "cls".

	    Returns:
	        A 4-tuple. With ``args.use_onnx`` it is
	        ``(session, first_input, None, None)``; otherwise it is
	        ``(predictor, input_tensor, output_tensors, config)`` where
	        ``input_tensor`` is the handle of the last input name reported by the
	        predictor (``None`` if it reports no inputs).

	    Raises:
	        ValueError: if no model directory is configured for ``mode`` or the
	            model / params files do not exist.
	    """
	    # Previously only "det" and "rec" were mapped, so any other mode failed
	    # with an UnboundLocalError even though the body handles "cls"/"table".
	    model_dir = getattr(args, "{}_model_dir".format(mode), None)
	    if model_dir is None:
	        raise ValueError("no model directory configured for mode {!r}".format(mode))

	    if args.use_onnx:
	        import onnxruntime as ort

	        # In ONNX mode the "model dir" option points at the model file itself.
	        model_file_path = model_dir
	        if not os.path.exists(model_file_path):
	            raise ValueError("not find model file path {}".format(model_file_path))
	        sess = ort.InferenceSession(model_file_path)
	        return sess, sess.get_inputs()[0], None, None

	    model_file_path = os.path.join(model_dir, "inference.pdmodel")
	    params_file_path = os.path.join(model_dir, "inference.pdiparams")
	    if not os.path.exists(model_file_path):
	        raise ValueError("not find model file path {}".format(model_file_path))
	    if not os.path.exists(params_file_path):
	        raise ValueError("not find params file path {}".format(params_file_path))

	    config = inference.Config(model_file_path, params_file_path)

	    # ``precision`` may be absent on hand-built namespaces; default to fp32.
	    precision_name = getattr(args, "precision", "fp32")
	    if precision_name == "fp16" and args.use_tensorrt:
	        precision = inference.PrecisionType.Half
	    elif precision_name == "int8":
	        precision = inference.PrecisionType.Int8
	    else:
	        precision = inference.PrecisionType.Float32

	    if args.use_gpu:
	        # Device index 0 is used as-is; the id looked up from the environment
	        # was never passed on, so that unused lookup has been dropped.
	        config.enable_use_gpu(args.gpu_mem, 0)
	        if args.use_tensorrt:
	            config.enable_tensorrt_engine(
	                workspace_size=1 << 30,
	                precision_mode=precision,
	                max_batch_size=args.max_batch_size,
	                min_subgraph_size=args.min_subgraph_size,
	            )
	            # skip the minimum trt subgraph
	        # NOTE(review): the original nesting of the dynamic-shape section was
	        # lost; it is kept at the ``use_gpu`` level here - confirm.
	        use_dynamic_shape = True
	        if mode == "det":
	            min_input_shape = {
	                "x": [1, 3, 50, 50],
	                "conv2d_92.tmp_0": [1, 120, 20, 20],
	                "conv2d_91.tmp_0": [1, 24, 10, 10],
	                "conv2d_59.tmp_0": [1, 96, 20, 20],
	                "nearest_interp_v2_1.tmp_0": [1, 256, 10, 10],
	                "nearest_interp_v2_2.tmp_0": [1, 256, 20, 20],
	                "conv2d_124.tmp_0": [1, 256, 20, 20],
	                "nearest_interp_v2_3.tmp_0": [1, 64, 20, 20],
	                "nearest_interp_v2_4.tmp_0": [1, 64, 20, 20],
	                "nearest_interp_v2_5.tmp_0": [1, 64, 20, 20],
	                "elementwise_add_7": [1, 56, 2, 2],
	                "nearest_interp_v2_0.tmp_0": [1, 256, 2, 2],
	            }
	            max_input_shape = {
	                "x": [1, 3, 1536, 1536],
	                "conv2d_92.tmp_0": [1, 120, 400, 400],
	                "conv2d_91.tmp_0": [1, 24, 200, 200],
	                "conv2d_59.tmp_0": [1, 96, 400, 400],
	                "nearest_interp_v2_1.tmp_0": [1, 256, 200, 200],
	                "conv2d_124.tmp_0": [1, 256, 400, 400],
	                "nearest_interp_v2_2.tmp_0": [1, 256, 400, 400],
	                "nearest_interp_v2_3.tmp_0": [1, 64, 400, 400],
	                "nearest_interp_v2_4.tmp_0": [1, 64, 400, 400],
	                "nearest_interp_v2_5.tmp_0": [1, 64, 400, 400],
	                "elementwise_add_7": [1, 56, 400, 400],
	                "nearest_interp_v2_0.tmp_0": [1, 256, 400, 400],
	            }
	            opt_input_shape = {
	                "x": [1, 3, 640, 640],
	                "conv2d_92.tmp_0": [1, 120, 160, 160],
	                "conv2d_91.tmp_0": [1, 24, 80, 80],
	                "conv2d_59.tmp_0": [1, 96, 160, 160],
	                "nearest_interp_v2_1.tmp_0": [1, 256, 80, 80],
	                "nearest_interp_v2_2.tmp_0": [1, 256, 160, 160],
	                "conv2d_124.tmp_0": [1, 256, 160, 160],
	                "nearest_interp_v2_3.tmp_0": [1, 64, 160, 160],
	                "nearest_interp_v2_4.tmp_0": [1, 64, 160, 160],
	                "nearest_interp_v2_5.tmp_0": [1, 64, 160, 160],
	                "elementwise_add_7": [1, 56, 40, 40],
	                "nearest_interp_v2_0.tmp_0": [1, 256, 40, 40],
	            }
	            min_pact_shape = {
	                "nearest_interp_v2_26.tmp_0": [1, 256, 20, 20],
	                "nearest_interp_v2_27.tmp_0": [1, 64, 20, 20],
	                "nearest_interp_v2_28.tmp_0": [1, 64, 20, 20],
	                "nearest_interp_v2_29.tmp_0": [1, 64, 20, 20],
	            }
	            max_pact_shape = {
	                "nearest_interp_v2_26.tmp_0": [1, 256, 400, 400],
	                "nearest_interp_v2_27.tmp_0": [1, 64, 400, 400],
	                "nearest_interp_v2_28.tmp_0": [1, 64, 400, 400],
	                "nearest_interp_v2_29.tmp_0": [1, 64, 400, 400],
	            }
	            opt_pact_shape = {
	                "nearest_interp_v2_26.tmp_0": [1, 256, 160, 160],
	                "nearest_interp_v2_27.tmp_0": [1, 64, 160, 160],
	                "nearest_interp_v2_28.tmp_0": [1, 64, 160, 160],
	                "nearest_interp_v2_29.tmp_0": [1, 64, 160, 160],
	            }
	            min_input_shape.update(min_pact_shape)
	            max_input_shape.update(max_pact_shape)
	            opt_input_shape.update(opt_pact_shape)
	        elif mode == "rec":
	            if args.rec_algorithm not in ["CRNN", "SVTR_LCNet"]:
	                use_dynamic_shape = False
	            # rec_image_shape is "C, H, W"; the second-to-last field is H.
	            imgH = int(args.rec_image_shape.split(",")[-2])
	            min_input_shape = {"x": [1, 3, imgH, 10]}
	            max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 2304]}
	            opt_input_shape = {"x": [args.rec_batch_num, 3, imgH, 320]}
	            config.exp_disable_tensorrt_ops(["transpose2"])
	        elif mode == "cls":
	            # NOTE(review): uses rec_batch_num rather than cls_batch_num,
	            # exactly as the original did - confirm this is intended.
	            min_input_shape = {"x": [1, 3, 48, 10]}
	            max_input_shape = {"x": [args.rec_batch_num, 3, 48, 1024]}
	            opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]}
	        else:
	            use_dynamic_shape = False
	        if use_dynamic_shape:
	            config.set_trt_dynamic_shape_info(
	                min_input_shape, max_input_shape, opt_input_shape
	            )

	    elif args.use_xpu:
	        config.enable_xpu(10 * 1024 * 1024)
	    else:
	        config.disable_gpu()
	        # default cpu threads as 10
	        config.set_cpu_math_library_num_threads(getattr(args, "cpu_threads", 10))
	        if args.enable_mkldnn:
	            # cache 10 different shapes for mkldnn to avoid memory leak
	            config.set_mkldnn_cache_capacity(10)
	            config.enable_mkldnn()
	            if precision_name == "fp16":
	                config.enable_mkldnn_bfloat16()
	    # enable memory optim
	    config.enable_memory_optim()
	    config.disable_glog_info()
	    config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
	    config.delete_pass("matmul_transpose_reshape_fuse_pass")
	    if mode == "table":
	        config.delete_pass("fc_fuse_pass")  # not supported for table
	    config.switch_use_feed_fetch_ops(False)
	    config.switch_ir_optim(True)

	    # create predictor
	    predictor = inference.create_predictor(config)
	    # Keep the handle of the last reported input; stays None when the model
	    # reports no inputs instead of failing with an unbound local.
	    input_tensor = None
	    for name in predictor.get_input_names():
	        input_tensor = predictor.get_input_handle(name)
	    output_tensors = get_output_tensors(args, mode, predictor)
	    return predictor, input_tensor, output_tensors, config


	def get_output_tensors(args, mode, predictor):
	    """Collect the output handles of ``predictor``.

	    Args:
	        args: Options object; ``rec_algorithm`` is read only when ``mode`` is
	            "rec".
	        mode: Stage name of the predictor.
	        predictor: Object exposing ``get_output_names()`` and
	            ``get_output_handle(name)``.

	    Returns:
	        list: for "rec" with a CRNN / SVTR_LCNet algorithm, only the handle of
	        "softmax_0.tmp_0" when that output exists; in every other case the
	        handles of all outputs in the order reported by the predictor.
	    """
	    names = predictor.get_output_names()
	    wants_softmax = mode == "rec" and args.rec_algorithm in ["CRNN", "SVTR_LCNet"]
	    softmax_name = "softmax_0.tmp_0"
	    if wants_softmax and softmax_name in names:
	        return [predictor.get_output_handle(softmax_name)]
	    return [predictor.get_output_handle(name) for name in names]


	def get_infer_gpuid():
	    """Return the first GPU id listed in the visible-devices environment variable.

	    On Windows the answer is always 0. Elsewhere ``CUDA_VISIBLE_DEVICES`` is
	    consulted, or ``HIP_VISIBLE_DEVICES`` when Paddle was built with ROCm.

	    Returns:
	        int: the first listed device id, or 0 when the variable is unset,
	        empty or does not start with a non-negative integer.
	    """
	    if platform.system() == "Windows":
	        return 0

	    # NOTE(review): ``paddle.fluid`` may be unavailable on newer Paddle
	    # releases - confirm against the installed version.
	    if not paddle.fluid.core.is_compiled_with_rocm():
	        env_name = "CUDA_VISIBLE_DEVICES"
	    else:
	        env_name = "HIP_VISIBLE_DEVICES"

	    # Read the variable directly instead of piping ``env`` through ``grep``:
	    # no shell is spawned and unrelated variables cannot match.
	    visible = os.environ.get(env_name)
	    if not visible:
	        return 0
	    # Use the whole first entry so multi-digit ids such as "12" are not
	    # truncated to their first character.
	    first = visible.split(",")[0].strip()
	    if not first.isdigit():
	        return 0
	    return int(first)


	def draw_e2e_res(dt_boxes, strs, img_path):
	    """Draw each box outline and its text on the image read from ``img_path``.

	    Args:
	        dt_boxes: Iterable of arrays supporting ``astype`` / ``reshape``; each
	            is reshaped to (-1, 1, 2) before drawing.
	        strs: Texts paired with ``dt_boxes`` by position.
	        img_path: Path passed to ``cv2.imread``.

	    Returns:
	        The image returned by ``cv2.imread`` with outlines and texts drawn.
	    """
	    canvas = cv2.imread(img_path)
	    for raw_box, text in zip(dt_boxes, strs):
	        polygon = raw_box.astype(np.int32).reshape((-1, 1, 2))
	        cv2.polylines(canvas, [polygon], True, color=(255, 255, 0), thickness=2)
	        # The label is anchored at the first vertex of the polygon.
	        anchor = (int(polygon[0, 0, 0]), int(polygon[0, 0, 1]))
	        cv2.putText(
	            canvas,
	            text,
	            org=anchor,
	            fontFace=cv2.FONT_HERSHEY_COMPLEX,
	            fontScale=0.7,
	            color=(0, 255, 0),
	            thickness=1,
	        )
	    return canvas


	def draw_text_det_res(dt_boxes, img_path):
	    """Draw the outline of every detected box on the image at ``img_path``.

	    Args:
	        dt_boxes: Iterable of boxes; each is converted to an int32 array of
	            shape (-1, 2).
	        img_path: Path passed to ``cv2.imread``.

	    Returns:
	        The image returned by ``cv2.imread`` with the outlines drawn.
	    """
	    canvas = cv2.imread(img_path)
	    for raw_box in dt_boxes:
	        polygon = np.array(raw_box).astype(np.int32).reshape(-1, 2)
	        cv2.polylines(canvas, [polygon], True, color=(255, 255, 0), thickness=2)
	    return canvas


	def resize_img(img, input_size=600):
	    """Scale ``img`` uniformly so that its longest side becomes ``input_size``.

	    Args:
	        img: Image convertible with ``np.array``.
	        input_size: Target length of the longer of the first two dimensions.

	    Returns:
	        The resized image as returned by ``cv2.resize``.
	    """
	    pixels = np.array(img)
	    longest_side = np.max(pixels.shape[0:2])
	    scale = float(input_size) / float(longest_side)
	    return cv2.resize(pixels, None, None, fx=scale, fy=scale)


	def draw_ocr(
	    image,
	    boxes,
	    txts=None,
	    scores=None,
	    drop_score=0.5,
	    font_path="./doc/fonts/simfang.ttf",
	):
	    """Visualize OCR detection and recognition results.

	    Args:
	        image: Image (PIL image or array) to draw on.
	        boxes: Sequence of boxes; each is reshaped to (-1, 1, 2).
	        txts: Recognized texts, or None to draw boxes only.
	        scores: Score per box; when None every box gets score 1.
	        drop_score: Boxes whose score is below this value (or NaN) are skipped.
	        font_path: Font file forwarded to ``text_visual``.

	    Returns:
	        When ``txts`` is None, the image with the kept boxes outlined.
	        Otherwise the outlined image resized to a longest side of 600 and
	        concatenated horizontally with the text panel.
	    """
	    if scores is None:
	        scores = [1] * len(boxes)
	    for position in range(len(boxes)):
	        score = scores[position]
	        if score < drop_score or math.isnan(score):
	            continue
	        polygon = np.reshape(np.array(boxes[position]), [-1, 1, 2]).astype(np.int64)
	        image = cv2.polylines(np.array(image), [polygon], True, (255, 0, 0), 2)
	    if txts is None:
	        return image
	    resized = np.array(resize_img(image, input_size=600))
	    text_panel = text_visual(
	        txts,
	        scores,
	        img_h=resized.shape[0],
	        img_w=600,
	        threshold=drop_score,
	        font_path=font_path,
	    )
	    return np.concatenate([np.array(resized), np.array(text_panel)], axis=1)


	def draw_ocr_box_txt(
	    image, boxes, txts, scores=None, drop_score=0.5, font_path="./doc/simfang.ttf"
	):
	    """Render boxes on the image and the recognized text on a blank twin.

	    The left half is ``image`` blended 50/50 with filled box polygons; the
	    right half is a white canvas of the same size with each box outlined and
	    its text drawn inside. Boxes more than twice as tall as wide get their
	    text drawn one character per line, top to bottom.

	    Args:
	        image: PIL image (``height``, ``width`` and ``copy`` are used).
	        boxes: Sequence of boxes, each with four (x, y) vertices.
	        txts: Texts paired with ``boxes`` by position.
	        scores: Optional score per box; None keeps every box.
	        drop_score: Boxes whose score is below this value are skipped.
	        font_path: TrueType font file used for the text.

	    Returns:
	        numpy array holding the two halves side by side (width ``2 * w``).
	    """
	    import random

	    h, w = image.height, image.width
	    img_left = image.copy()
	    img_right = Image.new("RGB", (w, h), (255, 255, 255))

	    # A private generator yields the same colour sequence as seeding the
	    # module-level one with 0, without resetting the process-wide RNG.
	    rng = random.Random(0)
	    draw_left = ImageDraw.Draw(img_left)
	    draw_right = ImageDraw.Draw(img_right)
	    for idx, (box, txt) in enumerate(zip(boxes, txts)):
	        if scores is not None and scores[idx] < drop_score:
	            continue
	        color = (rng.randint(0, 255), rng.randint(0, 255), rng.randint(0, 255))
	        draw_left.polygon(box, fill=color)
	        draw_right.polygon(
	            [
	                box[0][0],
	                box[0][1],
	                box[1][0],
	                box[1][1],
	                box[2][0],
	                box[2][1],
	                box[3][0],
	                box[3][1],
	            ],
	            outline=color,
	        )
	        # Edge lengths: vertex 0 -> 3 is the height, vertex 0 -> 1 the width.
	        box_height = math.hypot(box[0][0] - box[3][0], box[0][1] - box[3][1])
	        box_width = math.hypot(box[0][0] - box[1][0], box[0][1] - box[1][1])
	        if box_height > 2 * box_width:
	            font_size = max(int(box_width * 0.9), 10)
	            font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
	            cur_y = box[0][1]
	            for c in txt:
	                # ``getsize`` is gone from recent Pillow; the bottom edge of
	                # ``getbbox`` is the same value as the old height.
	                if hasattr(font, "getbbox"):
	                    char_height = font.getbbox(c)[3]
	                else:
	                    char_height = font.getsize(c)[1]
	                draw_right.text((box[0][0] + 3, cur_y), c, fill=(0, 0, 0), font=font)
	                cur_y += char_height
	        else:
	            font_size = max(int(box_height * 0.8), 10)
	            font = ImageFont.truetype(font_path, font_size, encoding="utf-8")
	            draw_right.text([box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font)
	    img_left = Image.blend(image, img_left, 0.5)
	    img_show = Image.new("RGB", (w * 2, h), (255, 255, 255))
	    img_show.paste(img_left, (0, 0, w, h))
	    img_show.paste(img_right, (w, 0, w * 2, h))
	    return np.array(img_show)


	def str_count(s):
	    """Measure the display length of ``s`` in full-width character units.

	    ASCII letters, digits and whitespace count as half a unit each (their
	    total is rounded up); every other character counts as one unit.

	    Args:
	        s (str): the input string.

	    Returns:
	        int: ``len(s)`` minus half the number of half-width characters,
	        rounded up.
	    """
	    import string

	    half_width = sum(
	        1 for ch in s if ch in string.ascii_letters or ch.isdigit() or ch.isspace()
	    )
	    return len(s) - math.ceil(half_width / 2)


	def text_visual(
	    texts, scores, img_h=400, img_w=600, threshold=0.0, font_path="./doc/simfang.ttf"
	):
	    """Create blank image(s) and draw the numbered texts on them.

	    Long texts are wrapped; when a panel is full a new one is started and all
	    panels are finally concatenated horizontally.

	    Args:
	        texts (list): the texts to draw.
	        scores (list|None): score of each text. When None, no text is
	            filtered out and no score is appended.
	        img_h (int): height of each blank panel.
	        img_w (int): width of each blank panel.
	        threshold (float): texts scoring below this value (or NaN) are skipped.
	        font_path: path of the font used to draw the text.

	    Returns:
	        numpy array with the rendered panel(s).
	    """
	    if scores is not None:
	        assert len(texts) == len(
	            scores
	        ), "The number of txts and corresponding scores must match"

	    def create_blank_img():
	        # uint8 avoids the int8 * 255 overflow that NumPy 2 rejects; the
	        # resulting RGB panel is white with a black right-most column.
	        blank_img = np.full((img_h, img_w), 255, dtype=np.uint8)
	        blank_img[:, img_w - 1 :] = 0
	        blank_img = Image.fromarray(blank_img).convert("RGB")
	        draw_txt = ImageDraw.Draw(blank_img)
	        return blank_img, draw_txt

	    blank_img, draw_txt = create_blank_img()

	    font_size = 20
	    txt_color = (0, 0, 0)
	    font = ImageFont.truetype(font_path, font_size, encoding="utf-8")

	    gap = font_size + 5
	    # Characters per wrapped line; at least 1 so narrow panels cannot make
	    # the wrap loop spin forever on an empty remainder.
	    line_chars = max(img_w // font_size - 4, 1)
	    last_row = img_h // gap - 1
	    txt_img_list = []
	    count, index = 1, 0
	    for idx, txt in enumerate(texts):
	        score = None if scores is None else scores[idx]
	        if score is not None and (score < threshold or math.isnan(score)):
	            continue
	        # ``index`` numbers only the texts that are actually drawn.
	        index += 1
	        first_line = True
	        while str_count(txt) >= line_chars:
	            tmp = txt
	            txt = tmp[:line_chars]
	            if first_line:
	                new_txt = str(index) + ": " + txt
	                first_line = False
	            else:
	                new_txt = " " + txt
	            draw_txt.text((0, gap * count), new_txt, txt_color, font=font)
	            txt = tmp[line_chars:]
	            if count >= last_row:
	                txt_img_list.append(np.array(blank_img))
	                blank_img, draw_txt = create_blank_img()
	                count = 0
	            count += 1
	        score_suffix = "" if score is None else " " + "%.3f" % (score)
	        if first_line:
	            new_txt = str(index) + ": " + txt + score_suffix
	        else:
	            new_txt = " " + txt + score_suffix
	        draw_txt.text((0, gap * count), new_txt, txt_color, font=font)
	        # whether add new blank img or not
	        if count >= last_row and idx + 1 < len(texts):
	            txt_img_list.append(np.array(blank_img))
	            blank_img, draw_txt = create_blank_img()
	            count = 0
	        count += 1
	    txt_img_list.append(np.array(blank_img))
	    if len(txt_img_list) == 1:
	        blank_img = np.array(txt_img_list[0])
	    else:
	        blank_img = np.concatenate(txt_img_list, axis=1)
	    return np.array(blank_img)


	def base64_to_cv2(b64str):
	    """Decode a base64 string into an image via ``cv2.imdecode``.

	    Args:
	        b64str (str): base64 text of an encoded image file.

	    Returns:
	        The result of ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.
	    """
	    import base64

	    raw_bytes = base64.b64decode(b64str.encode("utf8"))
	    byte_array = np.frombuffer(raw_bytes, np.uint8)
	    return cv2.imdecode(byte_array, cv2.IMREAD_COLOR)


	def draw_boxes(image, boxes, scores=None, drop_score=0.5):
	    """Outline every box whose score is not below ``drop_score``.

	    Args:
	        image: Image (PIL image or array) to draw on.
	        boxes: Sequence of boxes; each is reshaped to (-1, 1, 2).
	        scores: Score per box; when None every box gets score 1.
	        drop_score: Boxes scoring below this value are skipped.

	    Returns:
	        ``image`` unchanged when nothing is drawn, otherwise the array
	        returned by the last ``cv2.polylines`` call.
	    """
	    box_scores = [1] * len(boxes) if scores is None else scores
	    result = image
	    for raw_box, box_score in zip(boxes, box_scores):
	        if box_score < drop_score:
	            continue
	        polygon = np.reshape(np.array(raw_box), [-1, 1, 2]).astype(np.int64)
	        result = cv2.polylines(np.array(result), [polygon], True, (255, 0, 0), 2)
	    return result


	def get_rotate_crop_image(img, points):
	    """Cut the quadrilateral ``points`` out of ``img`` and rectify it.

	    The four vertices are mapped, in order, onto the corners (0, 0), (w, 0),
	    (w, h) and (0, h) of an upright rectangle, where ``w`` and ``h`` are the
	    longer of each pair of opposite edges. A result at least 1.5 times as
	    tall as wide is rotated with ``np.rot90``.

	    Args:
	        img: Source image passed to ``cv2.warpPerspective``.
	        points: Array of four 2-D vertices.
	            NOTE(review): presumably float32, as required by
	            ``cv2.getPerspectiveTransform`` - confirm against callers.

	    Returns:
	        The rectified (and possibly rotated) crop.
	    """
	    assert len(points) == 4, "shape of points must be 4*2"
	    edge_01 = np.linalg.norm(points[0] - points[1])
	    edge_23 = np.linalg.norm(points[2] - points[3])
	    edge_03 = np.linalg.norm(points[0] - points[3])
	    edge_12 = np.linalg.norm(points[1] - points[2])
	    img_crop_width = int(max(edge_01, edge_23))
	    img_crop_height = int(max(edge_03, edge_12))
	    target_corners = np.float32(
	        [
	            [0, 0],
	            [img_crop_width, 0],
	            [img_crop_width, img_crop_height],
	            [0, img_crop_height],
	        ]
	    )
	    transform = cv2.getPerspectiveTransform(points, target_corners)
	    cropped = cv2.warpPerspective(
	        img,
	        transform,
	        (img_crop_width, img_crop_height),
	        borderMode=cv2.BORDER_REPLICATE,
	        flags=cv2.INTER_CUBIC,
	    )
	    crop_height, crop_width = cropped.shape[0:2]
	    if crop_height * 1.0 / crop_width >= 1.5:
	        cropped = np.rot90(cropped)
	    return cropped


	def check_gpu(use_gpu):
	    """Downgrade a GPU request when Paddle was built without CUDA.

	    Args:
	        use_gpu: Requested GPU flag.

	    Returns:
	        False when ``use_gpu`` is truthy but
	        ``paddle.is_compiled_with_cuda()`` is false; otherwise ``use_gpu``
	        unchanged.
	    """
	    if not use_gpu:
	        return use_gpu
	    return use_gpu if paddle.is_compiled_with_cuda() else False