sunnychenxiwang's picture
update all
24c4def
raw
history blame
5.59 kB
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os.path as osp
import time
import warnings
from mmengine import Config
from mmocr.datasets.preparers import DatasetPreparer
def parse_args(argv=None):
    """Parse the command-line options of the dataset preparation script.

    Args:
        argv (list[str], optional): Argument strings to parse instead of the
            process command line. Defaults to None, in which case argparse
            reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: Parsed options with the attributes ``datasets``,
        ``nproc``, ``task``, ``splits``, ``lmdb``, ``overwrite_cfg`` and
        ``dataset_zoo_path``.
    """
    parser = argparse.ArgumentParser(
        description='Preparing datasets used in MMOCR.')
    parser.add_argument(
        'datasets',
        help='A list of the dataset names that would like to prepare.',
        nargs='+')
    parser.add_argument(
        '--nproc', help='Number of processes to run', default=4, type=int)
    parser.add_argument(
        '--task',
        default='textdet',
        choices=['textdet', 'textrecog', 'textspotting', 'kie'],
        help='Task type. Options are "textdet", "textrecog", "textspotting"'
        ' and "kie".')
    parser.add_argument(
        '--splits',
        default=['train', 'test', 'val'],
        help='A list of the split that would like to prepare.',
        nargs='+')
    parser.add_argument(
        '--lmdb',
        action='store_true',
        default=False,
        help='Whether to dump the textrecog dataset to LMDB format, It\'s a '
        'shortcut to force the dataset to be dumped in lmdb format. '
        'Applicable when --task=textrecog')
    parser.add_argument(
        '--overwrite-cfg',
        action='store_true',
        default=False,
        help='Whether to overwrite the dataset config file if it already'
        ' exists. If not specified, Dataset Preparer will not generate'
        ' new config for datasets whose configs are already in base.')
    parser.add_argument(
        '--dataset-zoo-path',
        default='./dataset_zoo',
        help='Path to dataset zoo config files.')
    # Passing None keeps the original behaviour of reading sys.argv[1:].
    args = parser.parse_args(argv)
    return args
def parse_meta(task: str, meta_path: str) -> None:
    """Validate the task against a dataset meta file and show its license.

    When the meta file does not exist the function returns without printing
    anything. Otherwise it checks that ``task`` is listed in the meta file,
    prints the license details if a license type is given, prints a
    disclaimer and then counts down five seconds so that the user can abort
    with ctrl+c.

    Args:
        task (str): Task name that must appear in ``meta['Data']['Tasks']``.
        meta_path (str): Path to meta file.

    Raises:
        AssertionError: If ``task`` is not listed in the meta file.
    """
    try:
        meta = Config.fromfile(meta_path)
    except FileNotFoundError:
        # No meta file: nothing to validate and no license to display.
        return
    # Raised explicitly instead of using ``assert`` so that the check is not
    # stripped when Python runs with ``-O``; the exception type is unchanged.
    if task not in meta['Data']['Tasks']:
        raise AssertionError(f'Task {task} not supported!')
    # License related
    if meta['Data']['License']['Type']:
        print(f"\033[1;33;40mDataset Name: {meta['Name']}")
        print(f"License Type: {meta['Data']['License']['Type']}")
        print(f"License Link: {meta['Data']['License']['Link']}")
        print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m")
    print('\033[1;31;43mMMOCR does not own the dataset. Using this '
          'dataset you must accept the license provided by the owners, '
          'and cite the corresponding papers appropriately.')
    print('If you do not agree with the above license, please cancel '
          'the progress immediately by pressing ctrl+c. Otherwise, '
          'you are deemed to accept the terms and conditions.\033[0m')
    # Give the user five seconds to cancel before preparation continues.
    for i in range(5):
        print(f'{5-i}...')
        time.sleep(1)
def force_lmdb(cfg):
    """Rewrite a dataset config in place so that it dumps to LMDB.

    Every present ``<split>_preparer`` gets its dumper type replaced by
    ``TextRecogLMDBDumper``, the config generator's dataset name gets an
    ``_lmdb`` suffix, and the annotation file entries of the config
    generator are switched to the ``.lmdb`` extension.

    Args:
        cfg (Config): Config object.

    Returns:
        Config: The same config object, modified in place.

    Raises:
        ValueError: If a present split preparer has no dumper.
    """
    for split_name in ('train', 'val', 'test'):
        split_preparer = cfg.get(f'{split_name}_preparer')
        if not split_preparer:
            continue
        if split_preparer.get('dumper') is None:
            raise ValueError(
                f'{split_name} split does not come with a dumper, '
                'so most likely the annotations are MMOCR-ready and do '
                'not need any adaptation, and it '
                'cannot be dumped in LMDB format.')
        split_preparer.dumper['type'] = 'TextRecogLMDBDumper'

    lmdb_dataset_name = f'{cfg.dataset_name}_lmdb'
    generator_cfg = cfg.config_generator
    generator_cfg['dataset_name'] = lmdb_dataset_name

    # Annotation files used for keys that the config does not define at all.
    fallback_files = {
        'train_anns': 'textrecog_train.lmdb',
        'test_anns': 'textrecog_test.lmdb',
    }
    for anns_key in ('train_anns', 'val_anns', 'test_anns'):
        if anns_key not in generator_cfg:
            fallback = fallback_files.get(anns_key)
            if fallback is None:
                generator_cfg[anns_key] = []
            else:
                generator_cfg[anns_key] = [dict(ann_file=fallback)]
            continue
        # It can be None when users want to clear out the default value,
        # in which case the entry is left untouched.
        if not generator_cfg[anns_key]:
            continue
        anns = generator_cfg[anns_key]
        for ann in anns:
            stem, _ = osp.splitext(ann['ann_file'])
            ann['ann_file'] = stem + '.lmdb'
        generator_cfg[anns_key] = anns

    return cfg
def main():
    """Prepare every dataset named on the command line.

    For each requested dataset that has a directory in the dataset zoo, the
    meta file is checked, the task config is loaded and adjusted with the
    command-line options, and the dataset preparer is run on the requested
    splits. Datasets without a directory are skipped with a warning.

    Raises:
        ValueError: If ``--lmdb`` is combined with a task other than
            ``textrecog``.
    """
    args = parse_args()
    if args.task != 'textrecog' and args.lmdb:
        raise ValueError('--lmdb only works with --task=textrecog')

    for dataset in args.datasets:
        dataset_dir = osp.join(args.dataset_zoo_path, dataset)
        if not osp.isdir(dataset_dir):
            warnings.warn(f'{dataset} is not supported yet. Please check '
                          'dataset zoo for supported datasets.')
            continue

        # Validate the task and show the license notice before loading.
        parse_meta(args.task, osp.join(dataset_dir, 'metafile.yml'))

        cfg = Config.fromfile(osp.join(dataset_dir, f'{args.task}.py'))
        if args.overwrite_cfg:
            # Only configs that define a config generator can be told to
            # overwrite an existing dataset config.
            if cfg.get('config_generator', None) is not None:
                cfg.config_generator.overwrite_cfg = args.overwrite_cfg

        # Forward the command-line choices to the preparer via the config.
        cfg.nproc = args.nproc
        cfg.task = args.task
        cfg.dataset_name = dataset
        if args.lmdb:
            cfg = force_lmdb(cfg)

        DatasetPreparer.from_file(cfg).run(args.splits)
# Script entry point: run the preparation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()