import argparse
import sys

import cv2
import numpy as np
from PIL import Image

sys.path.append("/home/wcx/wcx/GroundingDINO/LVLM/mmocr")

# MMOCR
from mmocr.apis.inferencers import MMOCRInferencer  # noqa: E402


def arg_parse():
    parser = argparse.ArgumentParser(description='MMOCR demo for gradio app')
    parser.add_argument(
        '--rec_config',
        type=str,
        default='/home/wcx/wcx/GroundingDINO/LVLM/mmocr/configs/textrecog/maerec/maerec_b_union14m.py',  # noqa
        help='The recognition config file.')
    parser.add_argument(
        '--rec_weight',
        type=str,
        default='/newdisk3/wcx/ocr_model/maerec_b.pth',
        help='The recognition weight file.')
    parser.add_argument(
        '--det_config',
        type=str,
        default='/home/wcx/wcx/GroundingDINO/LVLM/mmocr/configs/textdet/dbnetpp/dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015.py',  # noqa
        help='The detection config file.')
    parser.add_argument(
        '--det_weight',
        type=str,
        default='/newdisk3/wcx/ocr_model/dbnetpp.pth',
        help='The detection weight file.')
    parser.add_argument(
        '--device',
        type=str,
        default='cuda:0',
        help='The device used for inference.')
    args = parser.parse_args()
    return args


args = arg_parse()

# Build the MMOCR inferencer (DBNet++ detector + MAERec recognizer)
mmocr_inferencer = MMOCRInferencer(
    args.det_config,
    args.det_weight,
    args.rec_config,
    args.rec_weight,
    device=args.device)


def run_mmocr(image_path, use_detector=False):
    """Run MMOCR on an image.

    Args:
        image_path (str): Path to the input image.
        use_detector (bool, optional): Whether to run the text detector
            before recognition. Defaults to False (recognition only).
    """
    data = Image.open(image_path).convert("RGB")
    img = np.array(data)
    # Select the inferencer mode: detection + recognition, or recognition only
    mode = 'det_rec' if use_detector else 'rec'
    mmocr_inferencer.mode = mode
    result = mmocr_inferencer(img, return_vis=True)
    visualization = result['visualization'][0]
    result = result['predictions'][0]
    if mode == 'det_rec':
        rec_texts = result['rec_texts']
        det_polygons = result['det_polygons']
        det_results = []
        for rec_text, det_polygon in zip(rec_texts, det_polygons):
            det_polygon = np.array(det_polygon).astype(np.int32).tolist()
            det_results.append(f'{rec_text}: {det_polygon}')
        out_results = '\n'.join(det_results)
        # The visualization is RGB; convert to BGR before saving with OpenCV
        visualization = cv2.cvtColor(
            np.array(visualization), cv2.COLOR_RGB2BGR)
        cv2.imwrite(
            "/home/wcx/wcx/Union14M/results/{}".format(
                image_path.split("/")[-1]), visualization)
        visualization = "Done"
    else:
        rec_text = result['rec_texts'][0]
        rec_score = result['rec_scores'][0]
        out_results = f'pred: {rec_text} \n score: {rec_score:.2f}'
        visualization = None
    return visualization, out_results


image_path = "/home/wcx/wcx/Union14M/image/temp.jpg"
vis, res = run_mmocr(image_path)
print(vis)
print(res)
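
# A small helper, not part of the original demo: parse the 'det_rec'
# string produced by run_mmocr back into (text, polygon) pairs. It only
# relies on the "text: [x1, y1, ...]" line format built above;
# `parse_det_results` is a name introduced here for illustration.
import ast  # noqa: E402


def parse_det_results(out_results):
    """Split each "text: [coords]" line into a (text, polygon) tuple."""
    pairs = []
    for line in out_results.splitlines():
        # rpartition splits on the last ': ', so any ': ' inside the
        # recognized text stays intact (the polygon repr contains none)
        text, _, polygon = line.rpartition(': ')
        pairs.append((text, ast.literal_eval(polygon)))
    return pairs


# Example: parse_det_results("STOP: [10, 20, 90, 20, 90, 60, 10, 60]")
# -> [('STOP', [10, 20, 90, 20, 90, 60, 10, 60])]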
# Original gradio app version of this demo, kept commented out
# (it additionally requires `import gradio as gr`):
# if __name__ == '__main__':
#     args = arg_parse()
#     mmocr_inferencer = MMOCRInferencer(
#         args.det_config,
#         args.det_weight,
#         args.rec_config,
#         args.rec_weight,
#         device=args.device)
#     with gr.Blocks() as demo:
#         with gr.Row():
#             with gr.Column(scale=1):
#                 gr.HTML("""
#                     MAERec: A MAE-pretrained Scene Text Recognizer
#
#                     [arXiv] [Code]
#
#                     MAERec is a scene text recognition model composed of a
#                     ViT backbone and a Transformer decoder in
#                     auto-regressive style. It shows outstanding performance
#                     in scene text recognition, especially when pre-trained
#                     on Union14M-U through MAE.
#
#                     In this demo, we combine MAERec with DBNet++ to build
#                     an end-to-end scene text recognition model.
# """) # gr.Image('github/maerec.png') # with gr.Column(scale=1): # input_image = gr.Image(label='Input Image') # output_image = gr.Image(label='Output Image') # use_detector = gr.Checkbox( # label= # 'Use Scene Text Detector or Not (Disabled for Recognition Only)', # default=True) # det_results = gr.Textbox(label='Detection Results') # mmocr = gr.Button('Run MMOCR') # gr.Markdown("## Image Examples") # with gr.Row(): # gr.Examples( # examples=[ # 'github/author.jpg', 'github/gradio1.jpeg', # 'github/Art_Curve_178.jpg', 'github/cute_3.jpg', # 'github/cute_168.jpg', 'github/hiercurve_2229.jpg', # 'github/ic15_52.jpg', 'github/ic15_698.jpg', # 'github/Art_Curve_352.jpg' # ], # inputs=input_image, # ) # mmocr.click( # fn=run_mmocr, # inputs=[input_image, use_detector], # outputs=[output_image, det_results]) # demo.launch(debug=True)