File size: 5,589 Bytes
0b4516f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os.path as osp
import time
import warnings

from mmengine import Config

from mmocr.datasets.preparers import DatasetPreparer


def parse_args():
    """Build the command-line interface and parse the process arguments.

    Returns:
        argparse.Namespace: Parsed options exposing ``datasets``, ``nproc``,
        ``task``, ``splits``, ``lmdb``, ``overwrite_cfg`` and
        ``dataset_zoo_path``.
    """
    cli = argparse.ArgumentParser(
        description='Preparing datasets used in MMOCR.')

    # Each entry is (flags, keyword options for add_argument). Entries are
    # registered in the listed order, which is also the order of the help text.
    option_table = [
        (('datasets', ),
         dict(
             nargs='+',
             help='A list of the dataset names that would like to prepare.')),
        (('--nproc', ),
         dict(type=int, default=4, help='Number of processes to run')),
        (('--task', ),
         dict(
             choices=['textdet', 'textrecog', 'textspotting', 'kie'],
             default='textdet',
             help='Task type. Options are "textdet", "textrecog", "textspotting"'
             ' and "kie".')),
        (('--splits', ),
         dict(
             nargs='+',
             default=['train', 'test', 'val'],
             help='A list of the split that would like to prepare.')),
        (('--lmdb', ),
         dict(
             action='store_true',
             default=False,
             help='Whether to dump the textrecog dataset to LMDB format, It\'s a '
             'shortcut to force the dataset to be dumped in lmdb format. '
             'Applicable when --task=textrecog')),
        (('--overwrite-cfg', ),
         dict(
             action='store_true',
             default=False,
             help='Whether to overwrite the dataset config file if it already'
             ' exists. If not specified, Dataset Preparer will not generate'
             ' new config for datasets whose configs are already in base.')),
        (('--dataset-zoo-path', ),
         dict(
             default='./dataset_zoo',
             help='Path to dataset zoo config files.')),
    ]
    for flags, options in option_table:
        cli.add_argument(*flags, **options)

    return cli.parse_args()


def parse_meta(task: str, meta_path: str) -> None:
    """Validate the task against a dataset metafile and show its license.

    The metafile is loaded with ``Config.fromfile``. When the file does not
    exist the function returns without doing anything. Otherwise it checks
    that ``task`` is listed under ``Data.Tasks`` and, if a license type is
    declared, prints the license/citation notice followed by a five-second
    countdown so the user can abort with ctrl+c.

    Args:
        task (str): Task name that must appear in the metafile's
            ``Data.Tasks`` entry.
        meta_path (str): Path to meta file.

    Raises:
        AssertionError: If ``task`` is not listed in the metafile.
    """
    try:
        meta = Config.fromfile(meta_path)
    except FileNotFoundError:
        # No metafile: nothing to validate and no license notice to show.
        return

    # Raised explicitly instead of via ``assert`` so the check still runs
    # under ``python -O``; the exception type is kept for compatibility.
    if task not in meta['Data']['Tasks']:
        raise AssertionError(f'Task {task} not supported!')

    # License related
    license_info = meta['Data']['License']
    if license_info['Type']:
        print(f"\033[1;33;40mDataset Name: {meta['Name']}")
        print(f"License Type: {license_info['Type']}")
        print(f"License Link: {license_info['Link']}")
        print(f"BibTeX: {meta['Paper']['BibTeX']}\033[0m")
        print('\033[1;31;43mMMOCR does not own the dataset. Using this '
              'dataset you must accept the license provided by the owners, '
              'and cite the corresponding papers appropriately.')
        print('If you do not agree with the above license, please cancel '
              'the progress immediately by pressing ctrl+c. Otherwise, '
              'you are deemed to accept the terms and conditions.\033[0m')
        # Grace period giving the user a chance to interrupt.
        for i in range(5):
            print(f'{5-i}...')
            time.sleep(1)


def force_lmdb(cfg):
    """Rewrite a dataset config in place so that it is dumped as LMDB.

    Every existing ``{split}_preparer`` gets its dumper type switched to
    ``TextRecogLMDBDumper``, the generated dataset name receives an ``_lmdb``
    suffix, and the annotation entries of the config generator are pointed
    at ``.lmdb`` files.

    Args:
        cfg (Config): Config object.

    Returns:
        Config: The same config object, modified in place.

    Raises:
        ValueError: If a configured split preparer has no dumper.
    """
    for split_name in ('train', 'val', 'test'):
        split_preparer = cfg.get(f'{split_name}_preparer')
        if not split_preparer:
            continue
        if split_preparer.get('dumper') is None:
            raise ValueError(
                f'{split_name} split does not come with a dumper, '
                'so most likely the annotations are MMOCR-ready and do '
                'not need any adaptation, and it '
                'cannot be dumped in LMDB format.')
        split_preparer.dumper['type'] = 'TextRecogLMDBDumper'

    lmdb_dataset_name = f'{cfg.dataset_name}_lmdb'
    generator_cfg = cfg.config_generator
    generator_cfg['dataset_name'] = lmdb_dataset_name

    # Annotation file used when the config generator does not mention the
    # key at all; keys without an entry here fall back to an empty list.
    fallback_files = {
        'train_anns': 'textrecog_train.lmdb',
        'test_anns': 'textrecog_test.lmdb',
    }
    for ann_key in ('train_anns', 'val_anns', 'test_anns'):
        if ann_key not in generator_cfg:
            fallback = fallback_files.get(ann_key)
            anns = [dict(ann_file=fallback)] if fallback else []
        else:
            anns = generator_cfg[ann_key]
            # It can be None when users want to clear out the default
            # value, in which case it is left as is.
            if not anns:
                continue
            for ann in anns:
                stem = osp.splitext(ann['ann_file'])[0]
                ann['ann_file'] = stem + '.lmdb'
        generator_cfg[ann_key] = anns

    return cfg


def main():
    """Prepare every dataset named on the command line.

    For each dataset with a directory under the dataset zoo path, the
    metafile is checked through ``parse_meta``, the task config is loaded,
    the command-line options are copied onto it, and a ``DatasetPreparer``
    is run on the requested splits. Datasets without a directory are skipped
    with a warning.

    Raises:
        ValueError: If ``--lmdb`` is given with a task other than
            ``textrecog``.
    """
    args = parse_args()
    if args.lmdb and args.task != 'textrecog':
        raise ValueError('--lmdb only works with --task=textrecog')

    for dataset in args.datasets:
        dataset_dir = osp.join(args.dataset_zoo_path, dataset)
        if not osp.isdir(dataset_dir):
            warnings.warn(f'{dataset} is not supported yet. Please check '
                          'dataset zoo for supported datasets.')
            continue

        parse_meta(args.task, osp.join(dataset_dir, 'metafile.yml'))

        cfg = Config.fromfile(osp.join(dataset_dir, args.task + '.py'))
        if args.overwrite_cfg:
            # Only propagate the flag when the config has a generator section.
            if cfg.get('config_generator', None) is not None:
                cfg.config_generator.overwrite_cfg = args.overwrite_cfg

        # Copy the command-line settings onto the loaded config.
        cfg.nproc = args.nproc
        cfg.task = args.task
        cfg.dataset_name = dataset
        if args.lmdb:
            cfg = force_lmdb(cfg)

        DatasetPreparer.from_file(cfg).run(args.splits)


# Entry point: call main() only when this file is executed as a script.
if __name__ == '__main__':
    main()