Spaces:
Sleeping
Sleeping
File size: 5,589 Bytes
0b4516f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os.path as osp
import time
import warnings
from mmengine import Config
from mmocr.datasets.preparers import DatasetPreparer
def parse_args():
    """Define the command-line interface and parse the process arguments.

    Returns:
        argparse.Namespace: Parsed options exposing ``datasets``, ``nproc``,
        ``task``, ``splits``, ``lmdb``, ``overwrite_cfg`` and
        ``dataset_zoo_path``.
    """
    cli = argparse.ArgumentParser(
        description='Preparing datasets used in MMOCR.')

    # Positional: one or more dataset names.
    cli.add_argument(
        'datasets',
        nargs='+',
        help='A list of the dataset names that would like to prepare.')

    cli.add_argument(
        '--nproc', type=int, default=4, help='Number of processes to run')

    cli.add_argument(
        '--task',
        choices=['textdet', 'textrecog', 'textspotting', 'kie'],
        default='textdet',
        help='Task type. Options are "textdet", "textrecog", "textspotting"'
        ' and "kie".')

    cli.add_argument(
        '--splits',
        nargs='+',
        default=['train', 'test', 'val'],
        help='A list of the split that would like to prepare.')

    # ``store_true`` flags already default to False, so no explicit default
    # is needed for the two switches below.
    cli.add_argument(
        '--lmdb',
        action='store_true',
        help="Whether to dump the textrecog dataset to LMDB format, It's a "
        'shortcut to force the dataset to be dumped in lmdb format. '
        'Applicable when --task=textrecog')

    cli.add_argument(
        '--overwrite-cfg',
        action='store_true',
        help='Whether to overwrite the dataset config file if it already'
        ' exists. If not specified, Dataset Preparer will not generate'
        ' new config for datasets whose configs are already in base.')

    cli.add_argument(
        '--dataset-zoo-path',
        default='./dataset_zoo',
        help='Path to dataset zoo config files.')

    return cli.parse_args()
def parse_meta(task: str, meta_path: str) -> None:
    """Display the license notice described by a dataset meta file.

    Returns silently when the meta file does not exist. Otherwise verifies
    that ``task`` is listed among the dataset's tasks, prints the license
    details when a license type is recorded, prints a usage disclaimer and
    counts down five seconds so the user can abort with ctrl+c.

    Args:
        task (str): Task type that is about to be prepared.
        meta_path (str): Path to meta file.
    """
    try:
        meta = Config.fromfile(meta_path)
    except FileNotFoundError:
        # Nothing to show when the meta file is absent.
        return

    data_info = meta['Data']
    assert task in data_info['Tasks'], f'Task {task} not supported!'

    # License related
    license_info = data_info['License']
    if license_info['Type']:
        print(f"\033[1;33;40mDataset Name: {meta['Name']}")
        print(f"License Type: {license_info['Type']}")
        print(f"License Link: {license_info['Link']}")
        print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m")

    print('\033[1;31;43mMMOCR does not own the dataset. Using this '
          'dataset you must accept the license provided by the owners, '
          'and cite the corresponding papers appropriately.')
    print('If you do not agree with the above license, please cancel '
          'the progress immediately by pressing ctrl+c. Otherwise, '
          'you are deemed to accept the terms and conditions.\033[0m')

    # Five-second countdown: 5, 4, 3, 2, 1.
    for seconds_left in range(5, 0, -1):
        print(f'{seconds_left}...')
        time.sleep(1)
def force_lmdb(cfg):
    """Rewrite a dataset config so that its output is dumped as LMDB.

    Every configured split preparer has its dumper type switched to
    ``TextRecogLMDBDumper``, the generated dataset name gets an ``_lmdb``
    suffix, and the annotation files listed in the config generator are
    pointed at ``.lmdb`` files. The config is modified in place.

    Args:
        cfg (Config): Config object.

    Returns:
        Config: The same config object, after modification.

    Raises:
        ValueError: If a configured split preparer has no dumper.
    """
    for split_name in ('train', 'val', 'test'):
        split_preparer = cfg.get(f'{split_name}_preparer')
        if not split_preparer:
            continue
        if split_preparer.get('dumper') is None:
            raise ValueError(
                f'{split_name} split does not come with a dumper, '
                'so most likely the annotations are MMOCR-ready and do '
                'not need any adaptation, and it '
                'cannot be dumped in LMDB format.')
        split_preparer.dumper['type'] = 'TextRecogLMDBDumper'

    lmdb_dataset_name = f'{cfg.dataset_name}_lmdb'
    generator_cfg = cfg.config_generator
    generator_cfg['dataset_name'] = lmdb_dataset_name

    # Annotation files used when a split key is absent from the generator
    # config; keys without an entry here fall back to an empty list.
    fallback_files = {
        'train_anns': 'textrecog_train.lmdb',
        'test_anns': 'textrecog_test.lmdb',
    }

    for anns_key in ('train_anns', 'val_anns', 'test_anns'):
        if anns_key in generator_cfg:
            anns = generator_cfg[anns_key]
            # It can be None when users want to clear out the default
            # value
            if not anns:
                continue
            for ann in anns:
                stem = osp.splitext(ann['ann_file'])[0]
                ann['ann_file'] = stem + '.lmdb'
        else:
            fallback = fallback_files.get(anns_key)
            anns = [dict(ann_file=fallback)] if fallback else []
        generator_cfg[anns_key] = anns

    return cfg
def main():
    """Prepare every dataset requested on the command line.

    For each dataset name: skip it with a warning when it has no directory
    under the dataset zoo path, show its license notice, load the config
    for the chosen task, apply the command-line overrides and run the
    preparer on the requested splits.

    Raises:
        ValueError: If ``--lmdb`` is given with a task other than
            ``textrecog``.
    """
    args = parse_args()
    if args.lmdb and args.task != 'textrecog':
        raise ValueError('--lmdb only works with --task=textrecog')

    for dataset in args.datasets:
        dataset_dir = osp.join(args.dataset_zoo_path, dataset)
        if not osp.isdir(dataset_dir):
            warnings.warn(f'{dataset} is not supported yet. Please check '
                          'dataset zoo for supported datasets.')
            continue

        parse_meta(args.task, osp.join(dataset_dir, 'metafile.yml'))

        cfg = Config.fromfile(osp.join(dataset_dir, f'{args.task}.py'))

        # Only touch the config generator when the flag was given and the
        # config actually defines one.
        if args.overwrite_cfg:
            if cfg.get('config_generator', None) is not None:
                cfg.config_generator.overwrite_cfg = args.overwrite_cfg

        cfg.nproc = args.nproc
        cfg.task = args.task
        cfg.dataset_name = dataset

        if args.lmdb:
            cfg = force_lmdb(cfg)

        DatasetPreparer.from_file(cfg).run(args.splits)
# Run the preparation only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
|