sunnychenxiwang's picture
Upload 1595 files
0b4516f verified
raw
history blame
1.8 kB
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os.path as osp
from mmocr.utils import dump_ocr_data
def convert_annotations(root_path, split):
"""Convert original annotations to mmocr format.
The annotation format of this dataset is as the following:
word_1.png, "flying"
word_2.png, "today"
word_3.png, "means"
See the format of converted annotation in mmocr.utils.dump_ocr_data.
Args:
root_path (str): The root path of the dataset
split (str): The split of dataset. Namely: Train or Test
"""
assert isinstance(root_path, str)
assert isinstance(split, str)
img_info = []
with open(
osp.join(root_path, 'annotations',
f'Challenge1_{split}_Task3_GT.txt'),
encoding='"utf-8-sig') as f:
annos = f.readlines()
for anno in annos:
# text may contain comma ','
dst_img_name, word = anno.split(', "')
word = word.replace('"\n', '')
img_info.append({
'file_name': dst_img_name,
'anno_info': [{
'text': word
}]
})
return img_info
def parse_args():
parser = argparse.ArgumentParser(
description='Generate training and test set of IC11')
parser.add_argument('root_path', help='Root dir path of IC11')
args = parser.parse_args()
return args
def main():
args = parse_args()
root_path = args.root_path
for split in ['Train', 'Test']:
img_info = convert_annotations(root_path, split)
dump_ocr_data(img_info,
osp.join(root_path, f'{split.lower()}_label.json'),
'textrecog')
print(f'{split} split converted.')
if __name__ == '__main__':
main()