nguyendu392 tomofi committed on
Commit
e198e1c
0 Parent(s):

Duplicate from tomofi/MMOCR

Browse files

Co-authored-by: Tomofumi Inoue <[email protected]>

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +29 -0
  2. CITATION.cff +9 -0
  3. MANIFEST.in +4 -0
  4. README.md +14 -0
  5. README_zh-CN.md +183 -0
  6. app.py +36 -0
  7. configs/_base_/default_runtime.py +19 -0
  8. configs/_base_/det_datasets/ctw1500.py +18 -0
  9. configs/_base_/det_datasets/icdar2015.py +18 -0
  10. configs/_base_/det_datasets/icdar2017.py +18 -0
  11. configs/_base_/det_datasets/toy_data.py +39 -0
  12. configs/_base_/det_models/dbnet_r18_fpnc.py +21 -0
  13. configs/_base_/det_models/dbnet_r50dcnv2_fpnc.py +23 -0
  14. configs/_base_/det_models/drrg_r50_fpn_unet.py +21 -0
  15. configs/_base_/det_models/fcenet_r50_fpn.py +33 -0
  16. configs/_base_/det_models/fcenet_r50dcnv2_fpn.py +35 -0
  17. configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py +126 -0
  18. configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem_poly.py +126 -0
  19. configs/_base_/det_models/panet_r18_fpem_ffm.py +43 -0
  20. configs/_base_/det_models/panet_r50_fpem_ffm.py +21 -0
  21. configs/_base_/det_models/psenet_r50_fpnf.py +51 -0
  22. configs/_base_/det_models/textsnake_r50_fpn_unet.py +22 -0
  23. configs/_base_/det_pipelines/dbnet_pipeline.py +88 -0
  24. configs/_base_/det_pipelines/drrg_pipeline.py +60 -0
  25. configs/_base_/det_pipelines/fcenet_pipeline.py +118 -0
  26. configs/_base_/det_pipelines/maskrcnn_pipeline.py +57 -0
  27. configs/_base_/det_pipelines/panet_pipeline.py +156 -0
  28. configs/_base_/det_pipelines/psenet_pipeline.py +70 -0
  29. configs/_base_/det_pipelines/textsnake_pipeline.py +65 -0
  30. configs/_base_/recog_datasets/MJ_train.py +24 -0
  31. configs/_base_/recog_datasets/ST_MJ_alphanumeric_train.py +34 -0
  32. configs/_base_/recog_datasets/ST_MJ_train.py +32 -0
  33. configs/_base_/recog_datasets/ST_SA_MJ_real_train.py +79 -0
  34. configs/_base_/recog_datasets/ST_charbox_train.py +22 -0
  35. configs/_base_/recog_datasets/academic_test.py +56 -0
  36. configs/_base_/recog_datasets/seg_toy_data.py +32 -0
  37. configs/_base_/recog_datasets/toy_data.py +56 -0
  38. configs/_base_/recog_models/abinet.py +70 -0
  39. configs/_base_/recog_models/crnn.py +12 -0
  40. configs/_base_/recog_models/crnn_tps.py +18 -0
  41. configs/_base_/recog_models/nrtr_modality_transform.py +11 -0
  42. configs/_base_/recog_models/robust_scanner.py +24 -0
  43. configs/_base_/recog_models/sar.py +24 -0
  44. configs/_base_/recog_models/satrn.py +11 -0
  45. configs/_base_/recog_models/seg.py +21 -0
  46. configs/_base_/recog_pipelines/abinet_pipeline.py +96 -0
  47. configs/_base_/recog_pipelines/crnn_pipeline.py +35 -0
  48. configs/_base_/recog_pipelines/crnn_tps_pipeline.py +37 -0
  49. configs/_base_/recog_pipelines/nrtr_pipeline.py +38 -0
  50. configs/_base_/recog_pipelines/sar_pipeline.py +43 -0
.gitattributes ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ *.png filter=lfs diff=lfs merge=lfs -text
29
+ *.jpg filter=lfs diff=lfs merge=lfs -text
CITATION.cff ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ cff-version: 1.2.0
2
+ message: "If you use this software, please cite it as below."
3
+ title: "OpenMMLab Text Detection, Recognition and Understanding Toolbox"
4
+ authors:
5
+ - name: "MMOCR Contributors"
6
+ version: 0.3.0
7
+ date-released: 2020-08-15
8
+ repository-code: "https://github.com/open-mmlab/mmocr"
9
+ license: Apache-2.0
MANIFEST.in ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ include requirements/*.txt
2
+ include mmocr/.mim/model-index.yml
3
+ recursive-include mmocr/.mim/configs *.py *.yml
4
+ recursive-include mmocr/.mim/tools *.sh *.py
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MMOCR
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 2.8.11
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ duplicated_from: tomofi/MMOCR
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
README_zh-CN.md ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <img src="resources/mmocr-logo.png" width="500px"/>
3
+ <div>&nbsp;</div>
4
+ <div align="center">
5
+ <b><font size="5">OpenMMLab 官网</font></b>
6
+ <sup>
7
+ <a href="https://openmmlab.com">
8
+ <i><font size="4">HOT</font></i>
9
+ </a>
10
+ </sup>
11
+ &nbsp;&nbsp;&nbsp;&nbsp;
12
+ <b><font size="5">OpenMMLab 开放平台</font></b>
13
+ <sup>
14
+ <a href="https://platform.openmmlab.com">
15
+ <i><font size="4">TRY IT OUT</font></i>
16
+ </a>
17
+ </sup>
18
+ </div>
19
+ <div>&nbsp;</div>
20
+ </div>
21
+
22
+ ## 简介
23
+
24
+ [English](/README.md) | 简体中文
25
+
26
+ [![build](https://github.com/open-mmlab/mmocr/workflows/build/badge.svg)](https://github.com/open-mmlab/mmocr/actions)
27
+ [![docs](https://readthedocs.org/projects/mmocr/badge/?version=latest)](https://mmocr.readthedocs.io/en/latest/?badge=latest)
28
+ [![codecov](https://codecov.io/gh/open-mmlab/mmocr/branch/main/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmocr)
29
+ [![license](https://img.shields.io/github/license/open-mmlab/mmocr.svg)](https://github.com/open-mmlab/mmocr/blob/main/LICENSE)
30
+ [![PyPI](https://badge.fury.io/py/mmocr.svg)](https://pypi.org/project/mmocr/)
31
+ [![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmocr.svg)](https://github.com/open-mmlab/mmocr/issues)
32
+ [![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmocr.svg)](https://github.com/open-mmlab/mmocr/issues)
33
+
34
+ MMOCR 是基于 PyTorch 和 mmdetection 的开源工具箱,专注于文本检测,文本识别以及相应的下游任务,如关键信息提取。 它是 OpenMMLab 项目的一部分。
35
+
36
+ 主分支目前支持 **PyTorch 1.6 以上**的版本。
37
+
38
+ 文档:https://mmocr.readthedocs.io/zh_CN/latest/
39
+
40
+ <div align="left">
41
+ <img src="resources/illustration.jpg"/>
42
+ </div>
43
+
44
+ ### 主要特性
45
+
46
+ - **全流程**
47
+
48
+ 该工具箱不仅支持文本检测和文本识别,还支持其下游任务,例如关键信息提取。
49
+
50
+ - **多种模型**
51
+
52
+ 该工具箱支持用于文本检测,文本识别和关键信息提取的各种最新模型。
53
+
54
+ - **模块化设计**
55
+
56
+ MMOCR 的模块化设计使用户可以定义自己的优化器,数据预处理器,模型组件如主干模块,颈部模块和头部模块,以及损失函数。有关如何构建自定义模型的信息,
57
+ 请参考[快速入门](https://mmocr.readthedocs.io/zh_CN/latest/getting_started.html)。
58
+
59
+ - **众多实用工具**
60
+
61
+ 该工具箱提供了一套全面的实用程序,可以帮助用户评估模型的性能。它包括可对图像,标注的真值以及预测结果进行可视化的可视化工具,以及用于在训练过程中评估模型的验证工具。它还包括数据转换器,演示了如何将用户自建的标注数据转换为 MMOCR 支持的标注文件。
62
+ ## [模型库](https://mmocr.readthedocs.io/en/latest/modelzoo.html)
63
+
64
+ 支持的算法:
65
+
66
+ <details open>
67
+ <summary>文字检测</summary>
68
+
69
+ - [x] [DBNet](configs/textdet/dbnet/README.md) (AAAI'2020)
70
+ - [x] [Mask R-CNN](configs/textdet/maskrcnn/README.md) (ICCV'2017)
71
+ - [x] [PANet](configs/textdet/panet/README.md) (ICCV'2019)
72
+ - [x] [PSENet](configs/textdet/psenet/README.md) (CVPR'2019)
73
+ - [x] [TextSnake](configs/textdet/textsnake/README.md) (ECCV'2018)
74
+ - [x] [DRRG](configs/textdet/drrg/README.md) (CVPR'2020)
75
+ - [x] [FCENet](configs/textdet/fcenet/README.md) (CVPR'2021)
76
+
77
+ </details>
78
+
79
+ <details open>
80
+ <summary>文字识别</summary>
81
+
82
+ - [x] [ABINet](configs/textrecog/abinet/README.md) (CVPR'2021)
83
+ - [x] [CRNN](configs/textrecog/crnn/README.md) (TPAMI'2016)
84
+ - [x] [NRTR](configs/textrecog/nrtr/README.md) (ICDAR'2019)
85
+ - [x] [RobustScanner](configs/textrecog/robust_scanner/README.md) (ECCV'2020)
86
+ - [x] [SAR](configs/textrecog/sar/README.md) (AAAI'2019)
87
+ - [x] [SATRN](configs/textrecog/satrn/README.md) (CVPR'2020 Workshop on Text and Documents in the Deep Learning Era)
88
+ - [x] [SegOCR](configs/textrecog/seg/README.md) (Manuscript'2021)
89
+
90
+ </details>
91
+
92
+ <details open>
93
+ <summary>关键信息提取</summary>
94
+
95
+ - [x] [SDMG-R](configs/kie/sdmgr/README.md) (ArXiv'2021)
96
+
97
+ </details>
98
+
99
+ <details open>
100
+ <summary>命名实体识别</summary>
101
+
102
+ - [x] [Bert-Softmax](configs/ner/bert_softmax/README.md) (NAACL'2019)
103
+
104
+ </details>
105
+
106
+ 请点击[模型库](https://mmocr.readthedocs.io/en/latest/modelzoo.html)查看更多关于上述算法的详细信息。
107
+
108
+ ## 开源许可证
109
+
110
+ 该项目采用 [Apache 2.0 license](LICENSE) 开源许可证。
111
+
112
+ ## 引用
113
+
114
+ 如果您发现此项目对您的研究有用,请考虑引用:
115
+
116
+ ```bibtex
117
+ @article{mmocr2021,
118
+ title={MMOCR: A Comprehensive Toolbox for Text Detection, Recognition and Understanding},
119
+ author={Kuang, Zhanghui and Sun, Hongbin and Li, Zhizhong and Yue, Xiaoyu and Lin, Tsui Hin and Chen, Jianyong and Wei, Huaqiang and Zhu, Yiqin and Gao, Tong and Zhang, Wenwei and Chen, Kai and Zhang, Wayne and Lin, Dahua},
120
+ journal= {arXiv preprint arXiv:2108.06543},
121
+ year={2021}
122
+ }
123
+ ```
124
+
125
+ ## 更新日志
126
+
127
+ 最新的月度版本 v0.4.1 在 2022.01.27 发布。
128
+
129
+ ## 安装
130
+
131
+ 请参考[安装文档](https://mmocr.readthedocs.io/zh_CN/latest/install.html)进行安装。
132
+
133
+ ## 快速入门
134
+
135
+ 请参考[快速入门](https://mmocr.readthedocs.io/zh_CN/latest/getting_started.html)文档学习 MMOCR 的基本使用。
136
+
137
+ ## 贡献指南
138
+
139
+ 我们感谢所有的贡献者为改进和提升 MMOCR 所作出的努力。请参考[贡献指南](.github/CONTRIBUTING.md)来了解参与项目贡献的相关指引。
140
+
141
+ ## 致谢
142
+ MMOCR 是一款由来自不同高校和企业的研发人员共同参与贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。 我们希望此工具箱可以帮助大家来复现已有的方法和开发新的方法,从而为研究社区贡献力量。
143
+
144
+ ## OpenMMLab 的其他项目
145
+
146
+
147
+ - [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMLab 项目、算法、模型的统一入口
148
+ - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱
149
+ - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱
150
+ - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台
151
+ - [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准
152
+ - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱
153
+ - [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱
154
+ - [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱
155
+ - [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准
156
+ - [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准
157
+ - [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准
158
+ - [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准
159
+ - [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱
160
+ - [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台
161
+ - [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准
162
+ - [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱
163
+ - [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱
164
+ - [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架
165
+
166
+ ## 欢迎加入 OpenMMLab 社区
167
+
168
+ 扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=aCvMxdr3)
169
+
170
+ <div align="center">
171
+ <img src="resources/zhihu_qrcode.jpg" height="400" /> <img src="resources/qq_group_qrcode.jpg" height="400" />
172
+ </div>
173
+
174
+ 我们会在 OpenMMLab 社区为大家
175
+
176
+ - 📢 分享 AI 框架的前沿核心技术
177
+ - 💻 解读 PyTorch 常用模块源码
178
+ - 📰 发布 OpenMMLab 的相关新闻
179
+ - 🚀 介绍 OpenMMLab 开发的前沿算法
180
+ - 🏃 获取更高效的问题答疑和意见反馈
181
+ - 🔥 提供与各行各业开发者充分交流的平台
182
+
183
+ 干货满满 📘,等你来撩 💗,OpenMMLab 社区期待您的加入 👬
app.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+
4
+ print(torch.__version__)
5
+ torch_ver, cuda_ver = torch.__version__.split('+')
6
+ os.system('pip list')
7
+ os.system(f'pip install opencv-contrib-python==4.5.5.62 --no-cache-dir')
8
+ os.system('pip list')
9
+ os.system(f'pip install pycocotools==2.0.0 mmdet mmcv-full==1.5.0 -f https://download.openmmlab.com/mmcv/dist/{cuda_ver}/torch1.10.0/index.html --no-cache-dir')
10
+ os.system('wget -nv -c https://download.openmmlab.com/mmocr/data/wildreceipt.tar; mkdir -p data; tar -xf wildreceipt.tar --directory data; rm -f wildreceipt.tar')
11
+
12
+ import datetime
13
+ import gradio as gr
14
+ import pandas as pd
15
+ from mmocr.utils.ocr import MMOCR
16
+
17
+ def inference(img):
18
+ print(datetime.datetime.now(), 'start')
19
+ ocr = MMOCR(det='PS_CTW', recog='SAR', kie='SDMGR')
20
+ print(datetime.datetime.now(), 'start read')
21
+ results = ocr.readtext(img.name, details=True, output='result.png')
22
+ print(datetime.datetime.now(), results)
23
+ return ['result.png', pd.DataFrame(results[0]['result']).iloc[: , 2:]]
24
+
25
+ description = 'Gradio demo for MMOCR. MMOCR is an open-source toolbox based on PyTorch and mmdetection for text detection, text recognition, and the corresponding downstream tasks including key information extraction. To use it, simply upload your image or click one of the examples to load them. Read more at the links below.'
26
+ article = "<p style='text-align: center'><a href='https://mmocr.readthedocs.io/en/latest/'>MMOCR is an open-source toolbox based on PyTorch and mmdetection for text detection, text recognition, and the corresponding downstream tasks including key information extraction.</a> | <a href='https://github.com/open-mmlab/mmocr'>Github Repo</a></p>"
27
+ gr.Interface(inference,
28
+ gr.inputs.Image(type='file', label='Input'),
29
+ [gr.outputs.Image(type='file', label='Output'), gr.outputs.Dataframe(headers=['text', 'text_score', 'label', 'label_score'])],
30
+ title='MMOCR',
31
+ description=description,
32
+ article=article,
33
+ examples=['demo/demo_kie.jpeg', 'demo/demo_text_ocr.jpg', 'demo/demo_text_det.jpg', 'demo/demo_densetext_det.jpg'],
34
+ css=".output_image, .input_image {height: 40rem !important; width: 100% !important;}",
35
+ enable_queue=True
36
+ ).launch(debug=True)
configs/_base_/default_runtime.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ checkpoint_config = dict(interval=1)
2
+ # yapf:disable
3
+ log_config = dict(
4
+ interval=5,
5
+ hooks=[
6
+ dict(type='TextLoggerHook')
7
+
8
+ ])
9
+ # yapf:enable
10
+ dist_params = dict(backend='nccl')
11
+ log_level = 'INFO'
12
+ load_from = None
13
+ resume_from = None
14
+ workflow = [('train', 1)]
15
+
16
+ # disable opencv multithreading to avoid system being overloaded
17
+ opencv_num_threads = 0
18
+ # set multi-process start method as `fork` to speed up the training
19
+ mp_start_method = 'fork'
configs/_base_/det_datasets/ctw1500.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_type = 'IcdarDataset'
2
+ data_root = 'data/ctw1500'
3
+
4
+ train = dict(
5
+ type=dataset_type,
6
+ ann_file=f'{data_root}/instances_training.json',
7
+ img_prefix=f'{data_root}/imgs',
8
+ pipeline=None)
9
+
10
+ test = dict(
11
+ type=dataset_type,
12
+ ann_file=f'{data_root}/instances_test.json',
13
+ img_prefix=f'{data_root}/imgs',
14
+ pipeline=None)
15
+
16
+ train_list = [train]
17
+
18
+ test_list = [test]
configs/_base_/det_datasets/icdar2015.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_type = 'IcdarDataset'
2
+ data_root = 'data/icdar2015'
3
+
4
+ train = dict(
5
+ type=dataset_type,
6
+ ann_file=f'{data_root}/instances_training.json',
7
+ img_prefix=f'{data_root}/imgs',
8
+ pipeline=None)
9
+
10
+ test = dict(
11
+ type=dataset_type,
12
+ ann_file=f'{data_root}/instances_test.json',
13
+ img_prefix=f'{data_root}/imgs',
14
+ pipeline=None)
15
+
16
+ train_list = [train]
17
+
18
+ test_list = [test]
configs/_base_/det_datasets/icdar2017.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_type = 'IcdarDataset'
2
+ data_root = 'data/icdar2017'
3
+
4
+ train = dict(
5
+ type=dataset_type,
6
+ ann_file=f'{data_root}/instances_training.json',
7
+ img_prefix=f'{data_root}/imgs',
8
+ pipeline=None)
9
+
10
+ test = dict(
11
+ type=dataset_type,
12
+ ann_file=f'{data_root}/instances_val.json',
13
+ img_prefix=f'{data_root}/imgs',
14
+ pipeline=None)
15
+
16
+ train_list = [train]
17
+
18
+ test_list = [test]
configs/_base_/det_datasets/toy_data.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ root = 'tests/data/toy_dataset'
2
+
3
+ # dataset with type='TextDetDataset'
4
+ train1 = dict(
5
+ type='TextDetDataset',
6
+ img_prefix=f'{root}/imgs',
7
+ ann_file=f'{root}/instances_test.txt',
8
+ loader=dict(
9
+ type='HardDiskLoader',
10
+ repeat=4,
11
+ parser=dict(
12
+ type='LineJsonParser',
13
+ keys=['file_name', 'height', 'width', 'annotations'])),
14
+ pipeline=None,
15
+ test_mode=False)
16
+
17
+ # dataset with type='IcdarDataset'
18
+ train2 = dict(
19
+ type='IcdarDataset',
20
+ ann_file=f'{root}/instances_test.json',
21
+ img_prefix=f'{root}/imgs',
22
+ pipeline=None)
23
+
24
+ test = dict(
25
+ type='TextDetDataset',
26
+ img_prefix=f'{root}/imgs',
27
+ ann_file=f'{root}/instances_test.txt',
28
+ loader=dict(
29
+ type='HardDiskLoader',
30
+ repeat=1,
31
+ parser=dict(
32
+ type='LineJsonParser',
33
+ keys=['file_name', 'height', 'width', 'annotations'])),
34
+ pipeline=None,
35
+ test_mode=True)
36
+
37
+ train_list = [train1, train2]
38
+
39
+ test_list = [test]
configs/_base_/det_models/dbnet_r18_fpnc.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model = dict(
2
+ type='DBNet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=18,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
11
+ norm_eval=False,
12
+ style='caffe'),
13
+ neck=dict(
14
+ type='FPNC', in_channels=[64, 128, 256, 512], lateral_channels=256),
15
+ bbox_head=dict(
16
+ type='DBHead',
17
+ in_channels=256,
18
+ loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True),
19
+ postprocessor=dict(type='DBPostprocessor', text_repr_type='quad')),
20
+ train_cfg=None,
21
+ test_cfg=None)
configs/_base_/det_models/dbnet_r50dcnv2_fpnc.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model = dict(
2
+ type='DBNet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ norm_eval=False,
11
+ style='pytorch',
12
+ dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
13
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
14
+ stage_with_dcn=(False, True, True, True)),
15
+ neck=dict(
16
+ type='FPNC', in_channels=[256, 512, 1024, 2048], lateral_channels=256),
17
+ bbox_head=dict(
18
+ type='DBHead',
19
+ in_channels=256,
20
+ loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True),
21
+ postprocessor=dict(type='DBPostprocessor', text_repr_type='quad')),
22
+ train_cfg=None,
23
+ test_cfg=None)
configs/_base_/det_models/drrg_r50_fpn_unet.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model = dict(
2
+ type='DRRG',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
11
+ norm_eval=True,
12
+ style='caffe'),
13
+ neck=dict(
14
+ type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32),
15
+ bbox_head=dict(
16
+ type='DRRGHead',
17
+ in_channels=32,
18
+ text_region_thr=0.3,
19
+ center_region_thr=0.4,
20
+ loss=dict(type='DRRGLoss'),
21
+ postprocessor=dict(type='DRRGPostprocessor', link_thr=0.80)))
configs/_base_/det_models/fcenet_r50_fpn.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model = dict(
2
+ type='FCENet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
11
+ norm_eval=False,
12
+ style='pytorch'),
13
+ neck=dict(
14
+ type='mmdet.FPN',
15
+ in_channels=[512, 1024, 2048],
16
+ out_channels=256,
17
+ add_extra_convs='on_output',
18
+ num_outs=3,
19
+ relu_before_extra_convs=True,
20
+ act_cfg=None),
21
+ bbox_head=dict(
22
+ type='FCEHead',
23
+ in_channels=256,
24
+ scales=(8, 16, 32),
25
+ fourier_degree=5,
26
+ loss=dict(type='FCELoss', num_sample=50),
27
+ postprocessor=dict(
28
+ type='FCEPostprocessor',
29
+ text_repr_type='quad',
30
+ num_reconstr_points=50,
31
+ alpha=1.2,
32
+ beta=1.0,
33
+ score_thr=0.3)))
configs/_base_/det_models/fcenet_r50dcnv2_fpn.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model = dict(
2
+ type='FCENet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ norm_eval=True,
11
+ style='pytorch',
12
+ dcn=dict(type='DCNv2', deform_groups=2, fallback_on_stride=False),
13
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
14
+ stage_with_dcn=(False, True, True, True)),
15
+ neck=dict(
16
+ type='mmdet.FPN',
17
+ in_channels=[512, 1024, 2048],
18
+ out_channels=256,
19
+ add_extra_convs='on_output',
20
+ num_outs=3,
21
+ relu_before_extra_convs=True,
22
+ act_cfg=None),
23
+ bbox_head=dict(
24
+ type='FCEHead',
25
+ in_channels=256,
26
+ scales=(8, 16, 32),
27
+ fourier_degree=5,
28
+ loss=dict(type='FCELoss', num_sample=50),
29
+ postprocessor=dict(
30
+ type='FCEPostprocessor',
31
+ text_repr_type='poly',
32
+ num_reconstr_points=50,
33
+ alpha=1.0,
34
+ beta=2.0,
35
+ score_thr=0.3)))
configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ model = dict(
3
+ type='OCRMaskRCNN',
4
+ backbone=dict(
5
+ type='mmdet.ResNet',
6
+ depth=50,
7
+ num_stages=4,
8
+ out_indices=(0, 1, 2, 3),
9
+ frozen_stages=1,
10
+ norm_cfg=dict(type='BN', requires_grad=True),
11
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
12
+ norm_eval=True,
13
+ style='pytorch'),
14
+ neck=dict(
15
+ type='mmdet.FPN',
16
+ in_channels=[256, 512, 1024, 2048],
17
+ out_channels=256,
18
+ num_outs=5),
19
+ rpn_head=dict(
20
+ type='RPNHead',
21
+ in_channels=256,
22
+ feat_channels=256,
23
+ anchor_generator=dict(
24
+ type='AnchorGenerator',
25
+ scales=[4],
26
+ ratios=[0.17, 0.44, 1.13, 2.90, 7.46],
27
+ strides=[4, 8, 16, 32, 64]),
28
+ bbox_coder=dict(
29
+ type='DeltaXYWHBBoxCoder',
30
+ target_means=[.0, .0, .0, .0],
31
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
32
+ loss_cls=dict(
33
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
34
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
35
+ roi_head=dict(
36
+ type='StandardRoIHead',
37
+ bbox_roi_extractor=dict(
38
+ type='SingleRoIExtractor',
39
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
40
+ out_channels=256,
41
+ featmap_strides=[4, 8, 16, 32]),
42
+ bbox_head=dict(
43
+ type='Shared2FCBBoxHead',
44
+ in_channels=256,
45
+ fc_out_channels=1024,
46
+ roi_feat_size=7,
47
+ num_classes=1,
48
+ bbox_coder=dict(
49
+ type='DeltaXYWHBBoxCoder',
50
+ target_means=[0., 0., 0., 0.],
51
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
52
+ reg_class_agnostic=False,
53
+ loss_cls=dict(
54
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
55
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
56
+ mask_roi_extractor=dict(
57
+ type='SingleRoIExtractor',
58
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
59
+ out_channels=256,
60
+ featmap_strides=[4, 8, 16, 32]),
61
+ mask_head=dict(
62
+ type='FCNMaskHead',
63
+ num_convs=4,
64
+ in_channels=256,
65
+ conv_out_channels=256,
66
+ num_classes=1,
67
+ loss_mask=dict(
68
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
69
+
70
+ # model training and testing settings
71
+ train_cfg=dict(
72
+ rpn=dict(
73
+ assigner=dict(
74
+ type='MaxIoUAssigner',
75
+ pos_iou_thr=0.7,
76
+ neg_iou_thr=0.3,
77
+ min_pos_iou=0.3,
78
+ match_low_quality=True,
79
+ ignore_iof_thr=-1,
80
+ gpu_assign_thr=50),
81
+ sampler=dict(
82
+ type='RandomSampler',
83
+ num=256,
84
+ pos_fraction=0.5,
85
+ neg_pos_ub=-1,
86
+ add_gt_as_proposals=False),
87
+ allowed_border=-1,
88
+ pos_weight=-1,
89
+ debug=False),
90
+ rpn_proposal=dict(
91
+ nms_across_levels=False,
92
+ nms_pre=2000,
93
+ nms_post=1000,
94
+ max_per_img=1000,
95
+ nms=dict(type='nms', iou_threshold=0.7),
96
+ min_bbox_size=0),
97
+ rcnn=dict(
98
+ assigner=dict(
99
+ type='MaxIoUAssigner',
100
+ pos_iou_thr=0.5,
101
+ neg_iou_thr=0.5,
102
+ min_pos_iou=0.5,
103
+ match_low_quality=True,
104
+ ignore_iof_thr=-1),
105
+ sampler=dict(
106
+ type='OHEMSampler',
107
+ num=512,
108
+ pos_fraction=0.25,
109
+ neg_pos_ub=-1,
110
+ add_gt_as_proposals=True),
111
+ mask_size=28,
112
+ pos_weight=-1,
113
+ debug=False)),
114
+ test_cfg=dict(
115
+ rpn=dict(
116
+ nms_across_levels=False,
117
+ nms_pre=1000,
118
+ nms_post=1000,
119
+ max_per_img=1000,
120
+ nms=dict(type='nms', iou_threshold=0.7),
121
+ min_bbox_size=0),
122
+ rcnn=dict(
123
+ score_thr=0.05,
124
+ nms=dict(type='nms', iou_threshold=0.5),
125
+ max_per_img=100,
126
+ mask_thr_binary=0.5)))
configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem_poly.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ model = dict(
3
+ type='OCRMaskRCNN',
4
+ text_repr_type='poly',
5
+ backbone=dict(
6
+ type='mmdet.ResNet',
7
+ depth=50,
8
+ num_stages=4,
9
+ out_indices=(0, 1, 2, 3),
10
+ frozen_stages=1,
11
+ norm_cfg=dict(type='BN', requires_grad=True),
12
+ norm_eval=True,
13
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
14
+ style='pytorch'),
15
+ neck=dict(
16
+ type='mmdet.FPN',
17
+ in_channels=[256, 512, 1024, 2048],
18
+ out_channels=256,
19
+ num_outs=5),
20
+ rpn_head=dict(
21
+ type='RPNHead',
22
+ in_channels=256,
23
+ feat_channels=256,
24
+ anchor_generator=dict(
25
+ type='AnchorGenerator',
26
+ scales=[4],
27
+ ratios=[0.17, 0.44, 1.13, 2.90, 7.46],
28
+ strides=[4, 8, 16, 32, 64]),
29
+ bbox_coder=dict(
30
+ type='DeltaXYWHBBoxCoder',
31
+ target_means=[.0, .0, .0, .0],
32
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
33
+ loss_cls=dict(
34
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
35
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
36
+ roi_head=dict(
37
+ type='StandardRoIHead',
38
+ bbox_roi_extractor=dict(
39
+ type='SingleRoIExtractor',
40
+ roi_layer=dict(type='RoIAlign', output_size=7, sample_num=0),
41
+ out_channels=256,
42
+ featmap_strides=[4, 8, 16, 32]),
43
+ bbox_head=dict(
44
+ type='Shared2FCBBoxHead',
45
+ in_channels=256,
46
+ fc_out_channels=1024,
47
+ roi_feat_size=7,
48
+ num_classes=80,
49
+ bbox_coder=dict(
50
+ type='DeltaXYWHBBoxCoder',
51
+ target_means=[0., 0., 0., 0.],
52
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
53
+ reg_class_agnostic=False,
54
+ loss_cls=dict(
55
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
56
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
57
+ mask_roi_extractor=dict(
58
+ type='SingleRoIExtractor',
59
+ roi_layer=dict(type='RoIAlign', output_size=14, sample_num=0),
60
+ out_channels=256,
61
+ featmap_strides=[4, 8, 16, 32]),
62
+ mask_head=dict(
63
+ type='FCNMaskHead',
64
+ num_convs=4,
65
+ in_channels=256,
66
+ conv_out_channels=256,
67
+ num_classes=80,
68
+ loss_mask=dict(
69
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
70
+ # model training and testing settings
71
+ train_cfg=dict(
72
+ rpn=dict(
73
+ assigner=dict(
74
+ type='MaxIoUAssigner',
75
+ pos_iou_thr=0.7,
76
+ neg_iou_thr=0.3,
77
+ min_pos_iou=0.3,
78
+ match_low_quality=True,
79
+ ignore_iof_thr=-1),
80
+ sampler=dict(
81
+ type='RandomSampler',
82
+ num=256,
83
+ pos_fraction=0.5,
84
+ neg_pos_ub=-1,
85
+ add_gt_as_proposals=False),
86
+ allowed_border=-1,
87
+ pos_weight=-1,
88
+ debug=False),
89
+ rpn_proposal=dict(
90
+ nms_across_levels=False,
91
+ nms_pre=2000,
92
+ nms_post=1000,
93
+ max_per_img=1000,
94
+ nms=dict(type='nms', iou_threshold=0.7),
95
+ min_bbox_size=0),
96
+ rcnn=dict(
97
+ assigner=dict(
98
+ type='MaxIoUAssigner',
99
+ pos_iou_thr=0.5,
100
+ neg_iou_thr=0.5,
101
+ min_pos_iou=0.5,
102
+ match_low_quality=True,
103
+ ignore_iof_thr=-1,
104
+ gpu_assign_thr=50),
105
+ sampler=dict(
106
+ type='OHEMSampler',
107
+ num=512,
108
+ pos_fraction=0.25,
109
+ neg_pos_ub=-1,
110
+ add_gt_as_proposals=True),
111
+ mask_size=28,
112
+ pos_weight=-1,
113
+ debug=False)),
114
+ test_cfg=dict(
115
+ rpn=dict(
116
+ nms_across_levels=False,
117
+ nms_pre=1000,
118
+ nms_post=1000,
119
+ max_per_img=1000,
120
+ nms=dict(type='nms', iou_threshold=0.7),
121
+ min_bbox_size=0),
122
+ rcnn=dict(
123
+ score_thr=0.05,
124
+ nms=dict(type='nms', iou_threshold=0.5),
125
+ max_per_img=100,
126
+ mask_thr_binary=0.5)))
configs/_base_/det_models/panet_r18_fpem_ffm.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_poly = dict(
2
+ type='PANet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=18,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
11
+ norm_eval=True,
12
+ style='caffe'),
13
+ neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]),
14
+ bbox_head=dict(
15
+ type='PANHead',
16
+ in_channels=[128, 128, 128, 128],
17
+ out_channels=6,
18
+ loss=dict(type='PANLoss'),
19
+ postprocessor=dict(type='PANPostprocessor', text_repr_type='poly')),
20
+ train_cfg=None,
21
+ test_cfg=None)
22
+
23
+ model_quad = dict(
24
+ type='PANet',
25
+ backbone=dict(
26
+ type='mmdet.ResNet',
27
+ depth=18,
28
+ num_stages=4,
29
+ out_indices=(0, 1, 2, 3),
30
+ frozen_stages=-1,
31
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
32
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
33
+ norm_eval=True,
34
+ style='caffe'),
35
+ neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]),
36
+ bbox_head=dict(
37
+ type='PANHead',
38
+ in_channels=[128, 128, 128, 128],
39
+ out_channels=6,
40
+ loss=dict(type='PANLoss'),
41
+ postprocessor=dict(type='PANPostprocessor', text_repr_type='quad')),
42
+ train_cfg=None,
43
+ test_cfg=None)
configs/_base_/det_models/panet_r50_fpem_ffm.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model = dict(
2
+ type='PANet',
3
+ pretrained='torchvision://resnet50',
4
+ backbone=dict(
5
+ type='mmdet.ResNet',
6
+ depth=50,
7
+ num_stages=4,
8
+ out_indices=(0, 1, 2, 3),
9
+ frozen_stages=1,
10
+ norm_cfg=dict(type='BN', requires_grad=True),
11
+ norm_eval=True,
12
+ style='caffe'),
13
+ neck=dict(type='FPEM_FFM', in_channels=[256, 512, 1024, 2048]),
14
+ bbox_head=dict(
15
+ type='PANHead',
16
+ in_channels=[128, 128, 128, 128],
17
+ out_channels=6,
18
+ loss=dict(type='PANLoss', speedup_bbox_thr=32),
19
+ postprocessor=dict(type='PANPostprocessor', text_repr_type='poly')),
20
+ train_cfg=None,
21
+ test_cfg=None)
configs/_base_/det_models/psenet_r50_fpnf.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_poly = dict(
2
+ type='PSENet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
11
+ norm_eval=True,
12
+ style='caffe'),
13
+ neck=dict(
14
+ type='FPNF',
15
+ in_channels=[256, 512, 1024, 2048],
16
+ out_channels=256,
17
+ fusion_type='concat'),
18
+ bbox_head=dict(
19
+ type='PSEHead',
20
+ in_channels=[256],
21
+ out_channels=7,
22
+ loss=dict(type='PSELoss'),
23
+ postprocessor=dict(type='PSEPostprocessor', text_repr_type='poly')),
24
+ train_cfg=None,
25
+ test_cfg=None)
26
+
27
+ model_quad = dict(
28
+ type='PSENet',
29
+ backbone=dict(
30
+ type='mmdet.ResNet',
31
+ depth=50,
32
+ num_stages=4,
33
+ out_indices=(0, 1, 2, 3),
34
+ frozen_stages=-1,
35
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
36
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
37
+ norm_eval=True,
38
+ style='caffe'),
39
+ neck=dict(
40
+ type='FPNF',
41
+ in_channels=[256, 512, 1024, 2048],
42
+ out_channels=256,
43
+ fusion_type='concat'),
44
+ bbox_head=dict(
45
+ type='PSEHead',
46
+ in_channels=[256],
47
+ out_channels=7,
48
+ loss=dict(type='PSELoss'),
49
+ postprocessor=dict(type='PSEPostprocessor', text_repr_type='quad')),
50
+ train_cfg=None,
51
+ test_cfg=None)
configs/_base_/det_models/textsnake_r50_fpn_unet.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model = dict(
2
+ type='TextSnake',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
11
+ norm_eval=True,
12
+ style='caffe'),
13
+ neck=dict(
14
+ type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32),
15
+ bbox_head=dict(
16
+ type='TextSnakeHead',
17
+ in_channels=32,
18
+ loss=dict(type='TextSnakeLoss'),
19
+ postprocessor=dict(
20
+ type='TextSnakePostprocessor', text_repr_type='poly')),
21
+ train_cfg=None,
22
+ test_cfg=None)
configs/_base_/det_pipelines/dbnet_pipeline.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ train_pipeline_r18 = [
5
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
6
+ dict(
7
+ type='LoadTextAnnotations',
8
+ with_bbox=True,
9
+ with_mask=True,
10
+ poly2mask=False),
11
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
12
+ dict(type='Normalize', **img_norm_cfg),
13
+ dict(
14
+ type='ImgAug',
15
+ args=[['Fliplr', 0.5],
16
+ dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
17
+ dict(type='EastRandomCrop', target_size=(640, 640)),
18
+ dict(type='DBNetTargets', shrink_ratio=0.4),
19
+ dict(type='Pad', size_divisor=32),
20
+ dict(
21
+ type='CustomFormatBundle',
22
+ keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'],
23
+ visualize=dict(flag=False, boundary_key='gt_shrink')),
24
+ dict(
25
+ type='Collect',
26
+ keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'])
27
+ ]
28
+
29
+ test_pipeline_1333_736 = [
30
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
31
+ dict(
32
+ type='MultiScaleFlipAug',
33
+ img_scale=(1333, 736),
34
+ flip=False,
35
+ transforms=[
36
+ dict(type='Resize', img_scale=(2944, 736), keep_ratio=True),
37
+ dict(type='Normalize', **img_norm_cfg),
38
+ dict(type='Pad', size_divisor=32),
39
+ dict(type='ImageToTensor', keys=['img']),
40
+ dict(type='Collect', keys=['img']),
41
+ ])
42
+ ]
43
+
44
+ # for dbnet_r50dcnv2_fpnc
45
+ img_norm_cfg_r50dcnv2 = dict(
46
+ mean=[122.67891434, 116.66876762, 104.00698793],
47
+ std=[58.395, 57.12, 57.375],
48
+ to_rgb=True)
49
+
50
+ train_pipeline_r50dcnv2 = [
51
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
52
+ dict(
53
+ type='LoadTextAnnotations',
54
+ with_bbox=True,
55
+ with_mask=True,
56
+ poly2mask=False),
57
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
58
+ dict(type='Normalize', **img_norm_cfg_r50dcnv2),
59
+ dict(
60
+ type='ImgAug',
61
+ args=[['Fliplr', 0.5],
62
+ dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
63
+ dict(type='EastRandomCrop', target_size=(640, 640)),
64
+ dict(type='DBNetTargets', shrink_ratio=0.4),
65
+ dict(type='Pad', size_divisor=32),
66
+ dict(
67
+ type='CustomFormatBundle',
68
+ keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'],
69
+ visualize=dict(flag=False, boundary_key='gt_shrink')),
70
+ dict(
71
+ type='Collect',
72
+ keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'])
73
+ ]
74
+
75
+ test_pipeline_4068_1024 = [
76
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
77
+ dict(
78
+ type='MultiScaleFlipAug',
79
+ img_scale=(4068, 1024),
80
+ flip=False,
81
+ transforms=[
82
+ dict(type='Resize', img_scale=(2944, 736), keep_ratio=True),
83
+ dict(type='Normalize', **img_norm_cfg_r50dcnv2),
84
+ dict(type='Pad', size_divisor=32),
85
+ dict(type='ImageToTensor', keys=['img']),
86
+ dict(type='Collect', keys=['img']),
87
+ ])
88
+ ]
configs/_base_/det_pipelines/drrg_pipeline.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ train_pipeline = [
5
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
6
+ dict(
7
+ type='LoadTextAnnotations',
8
+ with_bbox=True,
9
+ with_mask=True,
10
+ poly2mask=False),
11
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
12
+ dict(type='Normalize', **img_norm_cfg),
13
+ dict(type='RandomScaling', size=800, scale=(0.75, 2.5)),
14
+ dict(
15
+ type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
16
+ dict(
17
+ type='RandomCropPolyInstances',
18
+ instance_key='gt_masks',
19
+ crop_ratio=0.8,
20
+ min_side_ratio=0.3),
21
+ dict(
22
+ type='RandomRotatePolyInstances',
23
+ rotate_ratio=0.5,
24
+ max_angle=60,
25
+ pad_with_fixed_color=False),
26
+ dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
27
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
28
+ dict(type='DRRGTargets'),
29
+ dict(type='Pad', size_divisor=32),
30
+ dict(
31
+ type='CustomFormatBundle',
32
+ keys=[
33
+ 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
34
+ 'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map',
35
+ 'gt_cos_map', 'gt_comp_attribs'
36
+ ],
37
+ visualize=dict(flag=False, boundary_key='gt_text_mask')),
38
+ dict(
39
+ type='Collect',
40
+ keys=[
41
+ 'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
42
+ 'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map',
43
+ 'gt_cos_map', 'gt_comp_attribs'
44
+ ])
45
+ ]
46
+
47
+ test_pipeline = [
48
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
49
+ dict(
50
+ type='MultiScaleFlipAug',
51
+ img_scale=(1024, 640),
52
+ flip=False,
53
+ transforms=[
54
+ dict(type='Resize', img_scale=(1024, 640), keep_ratio=True),
55
+ dict(type='Normalize', **img_norm_cfg),
56
+ dict(type='Pad', size_divisor=32),
57
+ dict(type='ImageToTensor', keys=['img']),
58
+ dict(type='Collect', keys=['img']),
59
+ ])
60
+ ]
configs/_base_/det_pipelines/fcenet_pipeline.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ # for icdar2015
5
+ leval_prop_range_icdar2015 = ((0, 0.4), (0.3, 0.7), (0.6, 1.0))
6
+ train_pipeline_icdar2015 = [
7
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
8
+ dict(
9
+ type='LoadTextAnnotations',
10
+ with_bbox=True,
11
+ with_mask=True,
12
+ poly2mask=False),
13
+ dict(
14
+ type='ColorJitter',
15
+ brightness=32.0 / 255,
16
+ saturation=0.5,
17
+ contrast=0.5),
18
+ dict(type='Normalize', **img_norm_cfg),
19
+ dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)),
20
+ dict(
21
+ type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
22
+ dict(
23
+ type='RandomCropPolyInstances',
24
+ instance_key='gt_masks',
25
+ crop_ratio=0.8,
26
+ min_side_ratio=0.3),
27
+ dict(
28
+ type='RandomRotatePolyInstances',
29
+ rotate_ratio=0.5,
30
+ max_angle=30,
31
+ pad_with_fixed_color=False),
32
+ dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
33
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
34
+ dict(type='Pad', size_divisor=32),
35
+ dict(
36
+ type='FCENetTargets',
37
+ fourier_degree=5,
38
+ level_proportion_range=leval_prop_range_icdar2015),
39
+ dict(
40
+ type='CustomFormatBundle',
41
+ keys=['p3_maps', 'p4_maps', 'p5_maps'],
42
+ visualize=dict(flag=False, boundary_key=None)),
43
+ dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps'])
44
+ ]
45
+
46
+ img_scale_icdar2015 = (2260, 2260)
47
+ test_pipeline_icdar2015 = [
48
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
49
+ dict(
50
+ type='MultiScaleFlipAug',
51
+ img_scale=img_scale_icdar2015,
52
+ flip=False,
53
+ transforms=[
54
+ dict(type='Resize', img_scale=(1280, 800), keep_ratio=True),
55
+ dict(type='Normalize', **img_norm_cfg),
56
+ dict(type='Pad', size_divisor=32),
57
+ dict(type='ImageToTensor', keys=['img']),
58
+ dict(type='Collect', keys=['img']),
59
+ ])
60
+ ]
61
+
62
+ # for ctw1500
63
+ leval_prop_range_ctw1500 = ((0, 0.25), (0.2, 0.65), (0.55, 1.0))
64
+ train_pipeline_ctw1500 = [
65
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
66
+ dict(
67
+ type='LoadTextAnnotations',
68
+ with_bbox=True,
69
+ with_mask=True,
70
+ poly2mask=False),
71
+ dict(
72
+ type='ColorJitter',
73
+ brightness=32.0 / 255,
74
+ saturation=0.5,
75
+ contrast=0.5),
76
+ dict(type='Normalize', **img_norm_cfg),
77
+ dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)),
78
+ dict(
79
+ type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
80
+ dict(
81
+ type='RandomCropPolyInstances',
82
+ instance_key='gt_masks',
83
+ crop_ratio=0.8,
84
+ min_side_ratio=0.3),
85
+ dict(
86
+ type='RandomRotatePolyInstances',
87
+ rotate_ratio=0.5,
88
+ max_angle=30,
89
+ pad_with_fixed_color=False),
90
+ dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
91
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
92
+ dict(type='Pad', size_divisor=32),
93
+ dict(
94
+ type='FCENetTargets',
95
+ fourier_degree=5,
96
+ level_proportion_range=leval_prop_range_ctw1500),
97
+ dict(
98
+ type='CustomFormatBundle',
99
+ keys=['p3_maps', 'p4_maps', 'p5_maps'],
100
+ visualize=dict(flag=False, boundary_key=None)),
101
+ dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps'])
102
+ ]
103
+
104
+ img_scale_ctw1500 = (1080, 736)
105
+ test_pipeline_ctw1500 = [
106
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
107
+ dict(
108
+ type='MultiScaleFlipAug',
109
+ img_scale=img_scale_ctw1500,
110
+ flip=False,
111
+ transforms=[
112
+ dict(type='Resize', img_scale=(1280, 800), keep_ratio=True),
113
+ dict(type='Normalize', **img_norm_cfg),
114
+ dict(type='Pad', size_divisor=32),
115
+ dict(type='ImageToTensor', keys=['img']),
116
+ dict(type='Collect', keys=['img']),
117
+ ])
118
+ ]
configs/_base_/det_pipelines/maskrcnn_pipeline.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ train_pipeline = [
5
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
6
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
7
+ dict(
8
+ type='ScaleAspectJitter',
9
+ img_scale=None,
10
+ keep_ratio=False,
11
+ resize_type='indep_sample_in_range',
12
+ scale_range=(640, 2560)),
13
+ dict(type='RandomFlip', flip_ratio=0.5),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(
16
+ type='RandomCropInstances',
17
+ target_size=(640, 640),
18
+ mask_type='union_all',
19
+ instance_key='gt_masks'),
20
+ dict(type='Pad', size_divisor=32),
21
+ dict(type='DefaultFormatBundle'),
22
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
23
+ ]
24
+
25
+ # for ctw1500
26
+ img_scale_ctw1500 = (1600, 1600)
27
+ test_pipeline_ctw1500 = [
28
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
29
+ dict(
30
+ type='MultiScaleFlipAug',
31
+ img_scale=img_scale_ctw1500,
32
+ flip=False,
33
+ transforms=[
34
+ dict(type='Resize', keep_ratio=True),
35
+ dict(type='RandomFlip'),
36
+ dict(type='Normalize', **img_norm_cfg),
37
+ dict(type='ImageToTensor', keys=['img']),
38
+ dict(type='Collect', keys=['img']),
39
+ ])
40
+ ]
41
+
42
+ # for icdar2015
43
+ img_scale_icdar2015 = (1920, 1920)
44
+ test_pipeline_icdar2015 = [
45
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
46
+ dict(
47
+ type='MultiScaleFlipAug',
48
+ img_scale=img_scale_icdar2015,
49
+ flip=False,
50
+ transforms=[
51
+ dict(type='Resize', keep_ratio=True),
52
+ dict(type='RandomFlip'),
53
+ dict(type='Normalize', **img_norm_cfg),
54
+ dict(type='ImageToTensor', keys=['img']),
55
+ dict(type='Collect', keys=['img']),
56
+ ])
57
+ ]
configs/_base_/det_pipelines/panet_pipeline.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ # for ctw1500
5
+ img_scale_train_ctw1500 = [(3000, 640)]
6
+ shrink_ratio_train_ctw1500 = (1.0, 0.7)
7
+ target_size_train_ctw1500 = (640, 640)
8
+ train_pipeline_ctw1500 = [
9
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
10
+ dict(
11
+ type='LoadTextAnnotations',
12
+ with_bbox=True,
13
+ with_mask=True,
14
+ poly2mask=False),
15
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
16
+ dict(type='Normalize', **img_norm_cfg),
17
+ dict(
18
+ type='ScaleAspectJitter',
19
+ img_scale=img_scale_train_ctw1500,
20
+ ratio_range=(0.7, 1.3),
21
+ aspect_ratio_range=(0.9, 1.1),
22
+ multiscale_mode='value',
23
+ keep_ratio=False),
24
+ # shrink_ratio is from big to small. The 1st must be 1.0
25
+ dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_ctw1500),
26
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
27
+ dict(type='RandomRotateTextDet'),
28
+ dict(
29
+ type='RandomCropInstances',
30
+ target_size=target_size_train_ctw1500,
31
+ instance_key='gt_kernels'),
32
+ dict(type='Pad', size_divisor=32),
33
+ dict(
34
+ type='CustomFormatBundle',
35
+ keys=['gt_kernels', 'gt_mask'],
36
+ visualize=dict(flag=False, boundary_key='gt_kernels')),
37
+ dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
38
+ ]
39
+
40
+ img_scale_test_ctw1500 = (3000, 640)
41
+ test_pipeline_ctw1500 = [
42
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
43
+ dict(
44
+ type='MultiScaleFlipAug',
45
+ img_scale=img_scale_test_ctw1500,
46
+ flip=False,
47
+ transforms=[
48
+ dict(type='Resize', img_scale=(3000, 640), keep_ratio=True),
49
+ dict(type='Normalize', **img_norm_cfg),
50
+ dict(type='Pad', size_divisor=32),
51
+ dict(type='ImageToTensor', keys=['img']),
52
+ dict(type='Collect', keys=['img']),
53
+ ])
54
+ ]
55
+
56
+ # for icdar2015
57
+ img_scale_train_icdar2015 = [(3000, 736)]
58
+ shrink_ratio_train_icdar2015 = (1.0, 0.5)
59
+ target_size_train_icdar2015 = (736, 736)
60
+ train_pipeline_icdar2015 = [
61
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
62
+ dict(
63
+ type='LoadTextAnnotations',
64
+ with_bbox=True,
65
+ with_mask=True,
66
+ poly2mask=False),
67
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
68
+ dict(type='Normalize', **img_norm_cfg),
69
+ dict(
70
+ type='ScaleAspectJitter',
71
+ img_scale=img_scale_train_icdar2015,
72
+ ratio_range=(0.7, 1.3),
73
+ aspect_ratio_range=(0.9, 1.1),
74
+ multiscale_mode='value',
75
+ keep_ratio=False),
76
+ dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_icdar2015),
77
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
78
+ dict(type='RandomRotateTextDet'),
79
+ dict(
80
+ type='RandomCropInstances',
81
+ target_size=target_size_train_icdar2015,
82
+ instance_key='gt_kernels'),
83
+ dict(type='Pad', size_divisor=32),
84
+ dict(
85
+ type='CustomFormatBundle',
86
+ keys=['gt_kernels', 'gt_mask'],
87
+ visualize=dict(flag=False, boundary_key='gt_kernels')),
88
+ dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
89
+ ]
90
+
91
+ img_scale_test_icdar2015 = (1333, 736)
92
+ test_pipeline_icdar2015 = [
93
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
94
+ dict(
95
+ type='MultiScaleFlipAug',
96
+ img_scale=img_scale_test_icdar2015,
97
+ flip=False,
98
+ transforms=[
99
+ dict(type='Resize', img_scale=(3000, 640), keep_ratio=True),
100
+ dict(type='Normalize', **img_norm_cfg),
101
+ dict(type='Pad', size_divisor=32),
102
+ dict(type='ImageToTensor', keys=['img']),
103
+ dict(type='Collect', keys=['img']),
104
+ ])
105
+ ]
106
+
107
+ # for icdar2017
108
+ img_scale_train_icdar2017 = [(3000, 800)]
109
+ shrink_ratio_train_icdar2017 = (1.0, 0.5)
110
+ target_size_train_icdar2017 = (800, 800)
111
+ train_pipeline_icdar2017 = [
112
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
113
+ dict(
114
+ type='LoadTextAnnotations',
115
+ with_bbox=True,
116
+ with_mask=True,
117
+ poly2mask=False),
118
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
119
+ dict(type='Normalize', **img_norm_cfg),
120
+ dict(
121
+ type='ScaleAspectJitter',
122
+ img_scale=img_scale_train_icdar2017,
123
+ ratio_range=(0.7, 1.3),
124
+ aspect_ratio_range=(0.9, 1.1),
125
+ multiscale_mode='value',
126
+ keep_ratio=False),
127
+ dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_icdar2017),
128
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
129
+ dict(type='RandomRotateTextDet'),
130
+ dict(
131
+ type='RandomCropInstances',
132
+ target_size=target_size_train_icdar2017,
133
+ instance_key='gt_kernels'),
134
+ dict(type='Pad', size_divisor=32),
135
+ dict(
136
+ type='CustomFormatBundle',
137
+ keys=['gt_kernels', 'gt_mask'],
138
+ visualize=dict(flag=False, boundary_key='gt_kernels')),
139
+ dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
140
+ ]
141
+
142
+ img_scale_test_icdar2017 = (1333, 800)
143
+ test_pipeline_icdar2017 = [
144
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
145
+ dict(
146
+ type='MultiScaleFlipAug',
147
+ img_scale=img_scale_test_icdar2017,
148
+ flip=False,
149
+ transforms=[
150
+ dict(type='Resize', img_scale=(3000, 640), keep_ratio=True),
151
+ dict(type='Normalize', **img_norm_cfg),
152
+ dict(type='Pad', size_divisor=32),
153
+ dict(type='ImageToTensor', keys=['img']),
154
+ dict(type='Collect', keys=['img']),
155
+ ])
156
+ ]
configs/_base_/det_pipelines/psenet_pipeline.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ train_pipeline = [
5
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
6
+ dict(
7
+ type='LoadTextAnnotations',
8
+ with_bbox=True,
9
+ with_mask=True,
10
+ poly2mask=False),
11
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
12
+ dict(type='Normalize', **img_norm_cfg),
13
+ dict(
14
+ type='ScaleAspectJitter',
15
+ img_scale=[(3000, 736)],
16
+ ratio_range=(0.5, 3),
17
+ aspect_ratio_range=(1, 1),
18
+ multiscale_mode='value',
19
+ long_size_bound=1280,
20
+ short_size_bound=640,
21
+ resize_type='long_short_bound',
22
+ keep_ratio=False),
23
+ dict(type='PSENetTargets'),
24
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
25
+ dict(type='RandomRotateTextDet'),
26
+ dict(
27
+ type='RandomCropInstances',
28
+ target_size=(640, 640),
29
+ instance_key='gt_kernels'),
30
+ dict(type='Pad', size_divisor=32),
31
+ dict(
32
+ type='CustomFormatBundle',
33
+ keys=['gt_kernels', 'gt_mask'],
34
+ visualize=dict(flag=False, boundary_key='gt_kernels')),
35
+ dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
36
+ ]
37
+
38
+ # for ctw1500
39
+ img_scale_test_ctw1500 = (1280, 1280)
40
+ test_pipeline_ctw1500 = [
41
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
42
+ dict(
43
+ type='MultiScaleFlipAug',
44
+ img_scale=img_scale_test_ctw1500,
45
+ flip=False,
46
+ transforms=[
47
+ dict(type='Resize', img_scale=(1280, 1280), keep_ratio=True),
48
+ dict(type='Normalize', **img_norm_cfg),
49
+ dict(type='Pad', size_divisor=32),
50
+ dict(type='ImageToTensor', keys=['img']),
51
+ dict(type='Collect', keys=['img']),
52
+ ])
53
+ ]
54
+
55
+ # for icdar2015
56
+ img_scale_test_icdar2015 = (2240, 2240)
57
+ test_pipeline_icdar2015 = [
58
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
59
+ dict(
60
+ type='MultiScaleFlipAug',
61
+ img_scale=img_scale_test_icdar2015,
62
+ flip=False,
63
+ transforms=[
64
+ dict(type='Resize', img_scale=(1280, 1280), keep_ratio=True),
65
+ dict(type='Normalize', **img_norm_cfg),
66
+ dict(type='Pad', size_divisor=32),
67
+ dict(type='ImageToTensor', keys=['img']),
68
+ dict(type='Collect', keys=['img']),
69
+ ])
70
+ ]
configs/_base_/det_pipelines/textsnake_pipeline.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ train_pipeline = [
5
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
6
+ dict(
7
+ type='LoadTextAnnotations',
8
+ with_bbox=True,
9
+ with_mask=True,
10
+ poly2mask=False),
11
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
12
+ dict(type='Normalize', **img_norm_cfg),
13
+ dict(
14
+ type='RandomCropPolyInstances',
15
+ instance_key='gt_masks',
16
+ crop_ratio=0.65,
17
+ min_side_ratio=0.3),
18
+ dict(
19
+ type='RandomRotatePolyInstances',
20
+ rotate_ratio=0.5,
21
+ max_angle=20,
22
+ pad_with_fixed_color=False),
23
+ dict(
24
+ type='ScaleAspectJitter',
25
+ img_scale=[(3000, 736)], # unused
26
+ ratio_range=(0.7, 1.3),
27
+ aspect_ratio_range=(0.9, 1.1),
28
+ multiscale_mode='value',
29
+ long_size_bound=800,
30
+ short_size_bound=480,
31
+ resize_type='long_short_bound',
32
+ keep_ratio=False),
33
+ dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
34
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
35
+ dict(type='TextSnakeTargets'),
36
+ dict(type='Pad', size_divisor=32),
37
+ dict(
38
+ type='CustomFormatBundle',
39
+ keys=[
40
+ 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
41
+ 'gt_radius_map', 'gt_sin_map', 'gt_cos_map'
42
+ ],
43
+ visualize=dict(flag=False, boundary_key='gt_text_mask')),
44
+ dict(
45
+ type='Collect',
46
+ keys=[
47
+ 'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
48
+ 'gt_radius_map', 'gt_sin_map', 'gt_cos_map'
49
+ ])
50
+ ]
51
+
52
+ test_pipeline = [
53
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
54
+ dict(
55
+ type='MultiScaleFlipAug',
56
+ img_scale=(1333, 736),
57
+ flip=False,
58
+ transforms=[
59
+ dict(type='Resize', img_scale=(1333, 736), keep_ratio=True),
60
+ dict(type='Normalize', **img_norm_cfg),
61
+ dict(type='Pad', size_divisor=32),
62
+ dict(type='ImageToTensor', keys=['img']),
63
+ dict(type='Collect', keys=['img']),
64
+ ])
65
+ ]
configs/_base_/recog_datasets/MJ_train.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text Recognition Training set, including:
2
+ # Synthetic Datasets: Syn90k
3
+
4
+ train_root = 'data/mixture/Syn90k'
5
+
6
+ train_img_prefix = f'{train_root}/mnt/ramdisk/max/90kDICT32px'
7
+ train_ann_file = f'{train_root}/label.lmdb'
8
+
9
+ train = dict(
10
+ type='OCRDataset',
11
+ img_prefix=train_img_prefix,
12
+ ann_file=train_ann_file,
13
+ loader=dict(
14
+ type='LmdbLoader',
15
+ repeat=1,
16
+ parser=dict(
17
+ type='LineStrParser',
18
+ keys=['filename', 'text'],
19
+ keys_idx=[0, 1],
20
+ separator=' ')),
21
+ pipeline=None,
22
+ test_mode=False)
23
+
24
+ train_list = [train]
configs/_base_/recog_datasets/ST_MJ_alphanumeric_train.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text Recognition Training set, including:
2
+ # Synthetic Datasets: SynthText, Syn90k
3
+ # Both annotations are filtered so that
4
+ # only alphanumeric terms are left
5
+
6
+ train_root = 'data/mixture'
7
+
8
+ train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
9
+ train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
10
+
11
+ train1 = dict(
12
+ type='OCRDataset',
13
+ img_prefix=train_img_prefix1,
14
+ ann_file=train_ann_file1,
15
+ loader=dict(
16
+ type='LmdbLoader',
17
+ repeat=1,
18
+ parser=dict(
19
+ type='LineStrParser',
20
+ keys=['filename', 'text'],
21
+ keys_idx=[0, 1],
22
+ separator=' ')),
23
+ pipeline=None,
24
+ test_mode=False)
25
+
26
+ train_img_prefix2 = f'{train_root}/SynthText/' + \
27
+ 'synthtext/SynthText_patch_horizontal'
28
+ train_ann_file2 = f'{train_root}/SynthText/alphanumeric_label.lmdb'
29
+
30
+ train2 = {key: value for key, value in train1.items()}
31
+ train2['img_prefix'] = train_img_prefix2
32
+ train2['ann_file'] = train_ann_file2
33
+
34
+ train_list = [train1, train2]
configs/_base_/recog_datasets/ST_MJ_train.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text Recognition Training set, including:
2
+ # Synthetic Datasets: SynthText, Syn90k
3
+
4
+ train_root = 'data/mixture'
5
+
6
+ train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
7
+ train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
8
+
9
+ train1 = dict(
10
+ type='OCRDataset',
11
+ img_prefix=train_img_prefix1,
12
+ ann_file=train_ann_file1,
13
+ loader=dict(
14
+ type='LmdbLoader',
15
+ repeat=1,
16
+ parser=dict(
17
+ type='LineStrParser',
18
+ keys=['filename', 'text'],
19
+ keys_idx=[0, 1],
20
+ separator=' ')),
21
+ pipeline=None,
22
+ test_mode=False)
23
+
24
+ train_img_prefix2 = f'{train_root}/SynthText/' + \
25
+ 'synthtext/SynthText_patch_horizontal'
26
+ train_ann_file2 = f'{train_root}/SynthText/label.lmdb'
27
+
28
+ train2 = {key: value for key, value in train1.items()}
29
+ train2['img_prefix'] = train_img_prefix2
30
+ train2['ann_file'] = train_ann_file2
31
+
32
+ train_list = [train1, train2]
configs/_base_/recog_datasets/ST_SA_MJ_real_train.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text Recognition Training set, including:
2
+ # Synthetic Datasets: SynthText, SynthAdd, Syn90k
3
+ # Real Dataset: IC11, IC13, IC15, COCO-Text, IIIT5k
4
+
5
+ train_prefix = 'data/mixture'
6
+
7
+ train_img_prefix1 = f'{train_prefix}/icdar_2011'
8
+ train_img_prefix2 = f'{train_prefix}/icdar_2013'
9
+ train_img_prefix3 = f'{train_prefix}/icdar_2015'
10
+ train_img_prefix4 = f'{train_prefix}/coco_text'
11
+ train_img_prefix5 = f'{train_prefix}/IIIT5K'
12
+ train_img_prefix6 = f'{train_prefix}/SynthText_Add'
13
+ train_img_prefix7 = f'{train_prefix}/SynthText'
14
+ train_img_prefix8 = f'{train_prefix}/Syn90k'
15
+
16
+ train_ann_file1 = f'{train_prefix}/icdar_2011/train_label.txt'
17
+ train_ann_file2 = f'{train_prefix}/icdar_2013/train_label.txt'
18
+ train_ann_file3 = f'{train_prefix}/icdar_2015/train_label.txt'
19
+ train_ann_file4 = f'{train_prefix}/coco_text/train_label.txt'
20
+ train_ann_file5 = f'{train_prefix}/IIIT5K/train_label.txt'
21
+ train_ann_file6 = f'{train_prefix}/SynthText_Add/label.txt'
22
+ train_ann_file7 = f'{train_prefix}/SynthText/shuffle_labels.txt'
23
+ train_ann_file8 = f'{train_prefix}/Syn90k/shuffle_labels.txt'
24
+
25
+ train1 = dict(
26
+ type='OCRDataset',
27
+ img_prefix=train_img_prefix1,
28
+ ann_file=train_ann_file1,
29
+ loader=dict(
30
+ type='HardDiskLoader',
31
+ repeat=20,
32
+ parser=dict(
33
+ type='LineStrParser',
34
+ keys=['filename', 'text'],
35
+ keys_idx=[0, 1],
36
+ separator=' ')),
37
+ pipeline=None,
38
+ test_mode=False)
39
+
40
+ train2 = {key: value for key, value in train1.items()}
41
+ train2['img_prefix'] = train_img_prefix2
42
+ train2['ann_file'] = train_ann_file2
43
+
44
+ train3 = {key: value for key, value in train1.items()}
45
+ train3['img_prefix'] = train_img_prefix3
46
+ train3['ann_file'] = train_ann_file3
47
+
48
+ train4 = {key: value for key, value in train1.items()}
49
+ train4['img_prefix'] = train_img_prefix4
50
+ train4['ann_file'] = train_ann_file4
51
+
52
+ train5 = {key: value for key, value in train1.items()}
53
+ train5['img_prefix'] = train_img_prefix5
54
+ train5['ann_file'] = train_ann_file5
55
+
56
+ train6 = dict(
57
+ type='OCRDataset',
58
+ img_prefix=train_img_prefix6,
59
+ ann_file=train_ann_file6,
60
+ loader=dict(
61
+ type='HardDiskLoader',
62
+ repeat=1,
63
+ parser=dict(
64
+ type='LineStrParser',
65
+ keys=['filename', 'text'],
66
+ keys_idx=[0, 1],
67
+ separator=' ')),
68
+ pipeline=None,
69
+ test_mode=False)
70
+
71
+ train7 = {key: value for key, value in train6.items()}
72
+ train7['img_prefix'] = train_img_prefix7
73
+ train7['ann_file'] = train_ann_file7
74
+
75
+ train8 = {key: value for key, value in train6.items()}
76
+ train8['img_prefix'] = train_img_prefix8
77
+ train8['ann_file'] = train_ann_file8
78
+
79
+ train_list = [train1, train2, train3, train4, train5, train6, train7, train8]
configs/_base_/recog_datasets/ST_charbox_train.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text Recognition Training set, including:
2
+ # Synthetic Datasets: SynthText (with character level boxes)
3
+
4
+ train_img_root = 'data/mixture'
5
+
6
+ train_img_prefix = f'{train_img_root}/SynthText'
7
+
8
+ train_ann_file = f'{train_img_root}/SynthText/instances_train.txt'
9
+
10
+ train = dict(
11
+ type='OCRSegDataset',
12
+ img_prefix=train_img_prefix,
13
+ ann_file=train_ann_file,
14
+ loader=dict(
15
+ type='HardDiskLoader',
16
+ repeat=1,
17
+ parser=dict(
18
+ type='LineJsonParser', keys=['file_name', 'annotations', 'text'])),
19
+ pipeline=None,
20
+ test_mode=False)
21
+
22
+ train_list = [train]
configs/_base_/recog_datasets/academic_test.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Text Recognition Testing set, including:
# Regular Datasets: IIIT5K, SVT, IC13
# Irregular Datasets: IC15, SVTP, CT80

test_root = 'data/mixture'

# Image folders, one per benchmark.
test_img_prefix1 = f'{test_root}/IIIT5K/'
test_img_prefix2 = f'{test_root}/svt/'
test_img_prefix3 = f'{test_root}/icdar_2013/'
test_img_prefix4 = f'{test_root}/icdar_2015/'
test_img_prefix5 = f'{test_root}/svtp/'
test_img_prefix6 = f'{test_root}/ct80/'

# Annotation files, index-aligned with the image folders above.
test_ann_file1 = f'{test_root}/IIIT5K/test_label.txt'
test_ann_file2 = f'{test_root}/svt/test_label.txt'
test_ann_file3 = f'{test_root}/icdar_2013/test_label_1015.txt'
test_ann_file4 = f'{test_root}/icdar_2015/test_label.txt'
test_ann_file5 = f'{test_root}/svtp/test_label.txt'
test_ann_file6 = f'{test_root}/ct80/test_label.txt'

# Every dataset dict is built from scratch instead of shallow-copying the
# first one.  A shallow copy would make all six entries share a single nested
# ``loader``/``parser`` dict, so editing one dataset's loader in place would
# silently change the others.  The comprehension variables are local to the
# comprehension and do not become module-level config keys.
test_list = [
    dict(
        type='OCRDataset',
        img_prefix=img_prefix,
        ann_file=ann_file,
        loader=dict(
            type='HardDiskLoader',
            repeat=1,
            parser=dict(
                type='LineStrParser',
                keys=['filename', 'text'],
                keys_idx=[0, 1],
                separator=' ')),
        pipeline=None,
        test_mode=True)
    for img_prefix, ann_file in (
        (test_img_prefix1, test_ann_file1),
        (test_img_prefix2, test_ann_file2),
        (test_img_prefix3, test_ann_file3),
        (test_img_prefix4, test_ann_file4),
        (test_img_prefix5, test_ann_file5),
        (test_img_prefix6, test_ann_file6),
    )
]

# Keep the individual names available for configs that reference them.
test1, test2, test3, test4, test5, test6 = test_list
configs/_base_/recog_datasets/seg_toy_data.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Toy train/test datasets stored under tests/data/ocr_char_ann_toy_dataset.
#
# ``prefix`` already ends with a path separator, so the paths below are built
# by appending directly to it.  Adding another '/' would produce
# '.../ocr_char_ann_toy_dataset//imgs'.
prefix = 'tests/data/ocr_char_ann_toy_dataset/'

train = dict(
    type='OCRSegDataset',
    img_prefix=f'{prefix}imgs',
    ann_file=f'{prefix}instances_train.txt',
    loader=dict(
        type='HardDiskLoader',
        repeat=100,
        parser=dict(
            type='LineJsonParser', keys=['file_name', 'annotations', 'text'])),
    pipeline=None,
    # NOTE(review): the training split is declared with test_mode=True.
    # Kept unchanged because the intent is not visible here; confirm.
    test_mode=True)

test = dict(
    type='OCRDataset',
    img_prefix=f'{prefix}imgs',
    ann_file=f'{prefix}instances_test.txt',
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=None,
    test_mode=True)

train_list = [train]

test_list = [test]
configs/_base_/recog_datasets/toy_data.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Toy recognition datasets under tests/data/ocr_toy_dataset: two training
# entries (plain-text labels and an LMDB file) and one LMDB test entry.
dataset_type = 'OCRDataset'

root = 'tests/data/ocr_toy_dataset'
img_prefix = f'{root}/imgs'
train_anno_file1 = f'{root}/label.txt'

# Training entry 1: label.txt read with HardDiskLoader.
train1 = {
    'type': dataset_type,
    'img_prefix': img_prefix,
    'ann_file': train_anno_file1,
    'loader': {
        'type': 'HardDiskLoader',
        'repeat': 100,
        'parser': {
            'type': 'LineStrParser',
            'keys': ['filename', 'text'],
            'keys_idx': [0, 1],
            'separator': ' ',
        },
    },
    'pipeline': None,
    'test_mode': False,
}

# Training entry 2: label.lmdb read with LmdbLoader.
train_anno_file2 = f'{root}/label.lmdb'
train2 = {
    'type': dataset_type,
    'img_prefix': img_prefix,
    'ann_file': train_anno_file2,
    'loader': {
        'type': 'LmdbLoader',
        'repeat': 100,
        'parser': {
            'type': 'LineStrParser',
            'keys': ['filename', 'text'],
            'keys_idx': [0, 1],
            'separator': ' ',
        },
    },
    'pipeline': None,
    'test_mode': False,
}

# Test entry: the same LMDB file, repeat=1, test_mode=True.
test_anno_file1 = f'{root}/label.lmdb'
test = {
    'type': dataset_type,
    'img_prefix': img_prefix,
    'ann_file': test_anno_file1,
    'loader': {
        'type': 'LmdbLoader',
        'repeat': 1,
        'parser': {
            'type': 'LineStrParser',
            'keys': ['filename', 'text'],
            'keys_idx': [0, 1],
            'separator': ' ',
        },
    },
    'pipeline': None,
    'test_mode': True,
}

train_list = [train1, train2]

test_list = [test]
configs/_base_/recog_models/abinet.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# num_chars must agree with the label_convertor settings below: the
# dictionary holds 36 characters plus 1 shared BOS/EOS entry.
# TODO: Automatically update num_chars based on the configuration of
# label_convertor
num_chars = 37
max_seq_len = 26

label_convertor = {
    'type': 'ABIConvertor',
    'dict_type': 'DICT36',
    'with_unknown': False,
    'with_padding': False,
    'lower': True,
}

model = {
    'type': 'ABINet',
    'backbone': {'type': 'ResNetABI'},
    # Vision model: a transformer encoder followed by the vision decoder.
    'encoder': {
        'type': 'ABIVisionModel',
        'encoder': {
            'type': 'TransformerEncoder',
            'n_layers': 3,
            'n_head': 8,
            'd_model': 512,
            'd_inner': 2048,
            'dropout': 0.1,
            # presumably attn_height * attn_width of the decoder - confirm
            'max_len': 8 * 32,
        },
        'decoder': {
            'type': 'ABIVisionDecoder',
            'in_channels': 512,
            'num_channels': 64,
            'attn_height': 8,
            'attn_width': 32,
            'attn_mode': 'nearest',
            'use_result': 'feature',
            'num_chars': num_chars,
            'max_seq_len': max_seq_len,
            'init_cfg': {'type': 'Xavier', 'layer': 'Conv2d'},
        },
    },
    # Language model.
    'decoder': {
        'type': 'ABILanguageDecoder',
        'd_model': 512,
        'n_head': 8,
        'd_inner': 2048,
        'n_layers': 4,
        'dropout': 0.1,
        'detach_tokens': True,
        'use_self_attn': False,
        # Padding uses the last index of the dictionary.
        'pad_idx': num_chars - 1,
        'num_chars': num_chars,
        'max_seq_len': max_seq_len,
        'init_cfg': None,
    },
    'fuser': {
        'type': 'ABIFuser',
        'd_model': 512,
        'num_chars': num_chars,
        'init_cfg': None,
        'max_seq_len': max_seq_len,
    },
    # The three loss terms are weighted equally.
    'loss': {
        'type': 'ABILoss',
        'enc_weight': 1.0,
        'dec_weight': 1.0,
        'fusion_weight': 1.0,
        'num_classes': num_chars,
    },
    'label_convertor': label_convertor,
    'max_seq_len': max_seq_len,
    'iter_size': 3,
}
configs/_base_/recog_models/crnn.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CRNN recognizer without a preprocessor, trained with CTC loss.
label_convertor = {
    'type': 'CTCConvertor',
    'dict_type': 'DICT36',
    'with_unknown': False,
    'lower': True,
}

model = {
    'type': 'CRNNNet',
    'preprocessor': None,
    # The backbone is configured for single-channel input.
    'backbone': {
        'type': 'VeryDeepVgg',
        'leaky_relu': False,
        'input_channels': 1,
    },
    'encoder': None,
    'decoder': {
        'type': 'CRNNDecoder',
        'in_channels': 512,
        'rnn_flag': True,
    },
    'loss': {'type': 'CTCLoss'},
    'label_convertor': label_convertor,
    'pretrained': None,
}
configs/_base_/recog_models/crnn_tps.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CRNN recognizer preceded by a TPS preprocessor.
label_convertor = {
    'type': 'CTCConvertor',
    'dict_type': 'DICT36',
    'with_unknown': False,
    'lower': True,
}

model = {
    'type': 'CRNNNet',
    # Input and rectified sizes are both (32, 100), single image channel.
    'preprocessor': {
        'type': 'TPSPreprocessor',
        'num_fiducial': 20,
        'img_size': (32, 100),
        'rectified_img_size': (32, 100),
        'num_img_channel': 1,
    },
    'backbone': {
        'type': 'VeryDeepVgg',
        'leaky_relu': False,
        'input_channels': 1,
    },
    'encoder': None,
    'decoder': {
        'type': 'CRNNDecoder',
        'in_channels': 512,
        'rnn_flag': True,
    },
    'loss': {'type': 'CTCLoss'},
    'label_convertor': label_convertor,
    'pretrained': None,
}
configs/_base_/recog_models/nrtr_modality_transform.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# NRTR recognizer using the NRTRModalityTransform backbone.
label_convertor = {
    'type': 'AttnConvertor',
    'dict_type': 'DICT36',
    'with_unknown': True,
    'lower': True,
}

model = {
    'type': 'NRTR',
    'backbone': {'type': 'NRTRModalityTransform'},
    'encoder': {'type': 'NRTREncoder', 'n_layers': 12},
    'decoder': {'type': 'NRTRDecoder'},
    'loss': {'type': 'TFLoss'},
    'label_convertor': label_convertor,
    'max_seq_len': 40,
}
configs/_base_/recog_models/robust_scanner.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# RobustScanner recognizer: ResNet31OCR backbone, channel-reduction encoder
# and a decoder that combines a hybrid and a position sub-decoder.
label_convertor = {
    'type': 'AttnConvertor',
    'dict_type': 'DICT90',
    'with_unknown': True,
}

# The two sub-decoders referenced by the RobustScannerDecoder below.
hybrid_decoder = {'type': 'SequenceAttentionDecoder'}

position_decoder = {'type': 'PositionAttentionDecoder'}

model = {
    'type': 'RobustScanner',
    'backbone': {'type': 'ResNet31OCR'},
    'encoder': {
        'type': 'ChannelReductionEncoder',
        'in_channels': 512,
        'out_channels': 128,
    },
    'decoder': {
        'type': 'RobustScannerDecoder',
        'dim_input': 512,
        'dim_model': 128,
        'hybrid_decoder': hybrid_decoder,
        'position_decoder': position_decoder,
    },
    'loss': {'type': 'SARLoss'},
    'label_convertor': label_convertor,
    'max_seq_len': 30,
}
configs/_base_/recog_models/sar.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# SAR recognizer: ResNet31OCR backbone, SAR encoder and the parallel SAR
# decoder.
label_convertor = {
    'type': 'AttnConvertor',
    'dict_type': 'DICT90',
    'with_unknown': True,
}

model = {
    'type': 'SARNet',
    'backbone': {'type': 'ResNet31OCR'},
    'encoder': {
        'type': 'SAREncoder',
        'enc_bi_rnn': False,
        'enc_do_rnn': 0.1,
        'enc_gru': False,
    },
    'decoder': {
        'type': 'ParallelSARDecoder',
        # Same value as the encoder's enc_bi_rnn setting above.
        'enc_bi_rnn': False,
        'dec_bi_rnn': False,
        'dec_do_rnn': 0,
        'dec_gru': False,
        'pred_dropout': 0.1,
        'd_k': 512,
        'pred_concat': True,
    },
    'loss': {'type': 'SARLoss'},
    'label_convertor': label_convertor,
    'max_seq_len': 30,
}
configs/_base_/recog_models/satrn.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# SATRN recognizer: ShallowCNN backbone, SatrnEncoder and TFDecoder.
label_convertor = {
    'type': 'AttnConvertor',
    'dict_type': 'DICT36',
    'with_unknown': True,
    'lower': True,
}

model = {
    'type': 'SATRN',
    'backbone': {'type': 'ShallowCNN'},
    'encoder': {'type': 'SatrnEncoder'},
    'decoder': {'type': 'TFDecoder'},
    'loss': {'type': 'TFLoss'},
    'label_convertor': label_convertor,
    'max_seq_len': 40,
}
configs/_base_/recog_models/seg.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Segmentation-based recognizer: ResNet31OCR backbone, FPNOCR neck and a
# SegHead trained with SegLoss.
label_convertor = {
    'type': 'SegConvertor',
    'dict_type': 'DICT36',
    'with_unknown': True,
    'lower': True,
}

model = {
    'type': 'SegRecognizer',
    'backbone': {
        'type': 'ResNet31OCR',
        'layers': [1, 2, 5, 3],
        'channels': [32, 64, 128, 256, 512, 512],
        'out_indices': [0, 1, 2, 3],
        'stage4_pool_cfg': {'kernel_size': 2, 'stride': 2},
        'last_stage_pool': True,
    },
    'neck': {
        'type': 'FPNOCR',
        'in_channels': [128, 256, 512, 512],
        'out_channels': 256,
    },
    # The head's in_channels equals the neck's out_channels (256).
    'head': {
        'type': 'SegHead',
        'in_channels': 256,
        'upsample_param': {'scale_factor': 2.0, 'mode': 'nearest'},
    },
    'loss': {
        'type': 'SegLoss',
        'seg_downsample_ratio': 1.0,
        'seg_with_loss_weight': True,
    },
    'label_convertor': label_convertor,
}
configs/_base_/recog_pipelines/abinet_pipeline.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Per-channel mean/std handed to NormalizeOCR in both pipelines.
img_norm_cfg = {
    'mean': [0.485, 0.456, 0.406],
    'std': [0.229, 0.224, 0.225],
}

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    # Resize settings: height 32, min and max width both 128, aspect ratio
    # not kept.
    {
        'type': 'ResizeOCR',
        'height': 32,
        'min_width': 128,
        'max_width': 128,
        'keep_aspect_ratio': False,
        'width_downsample_ratio': 0.25,
    },
    # Geometric augmentation: a RandomWrapper (p=0.5) around a OneOfWrapper
    # listing rotation, affine and perspective transforms.
    {
        'type': 'RandomWrapper',
        'p': 0.5,
        'transforms': [
            {
                'type': 'OneOfWrapper',
                'transforms': [
                    {'type': 'RandomRotateTextDet', 'max_angle': 15},
                    {
                        'type': 'TorchVisionWrapper',
                        'op': 'RandomAffine',
                        'degrees': 15,
                        'translate': (0.3, 0.3),
                        'scale': (0.5, 2.0),
                        'shear': (-45, 45),
                    },
                    {
                        'type': 'TorchVisionWrapper',
                        'op': 'RandomPerspective',
                        'distortion_scale': 0.5,
                        'p': 1,
                    },
                ],
            },
        ],
    },
    # Quality degradation: a RandomWrapper (p=0.25) around PyramidRescale and
    # Albu noise/blur.
    {
        'type': 'RandomWrapper',
        'p': 0.25,
        'transforms': [
            {'type': 'PyramidRescale'},
            {
                'type': 'Albu',
                'transforms': [
                    {'type': 'GaussNoise', 'var_limit': (20, 20), 'p': 0.5},
                    {'type': 'MotionBlur', 'blur_limit': 6, 'p': 0.5},
                ],
            },
        ],
    },
    # Colour augmentation: a RandomWrapper (p=0.25) around ColorJitter.
    {
        'type': 'RandomWrapper',
        'p': 0.25,
        'transforms': [
            {
                'type': 'TorchVisionWrapper',
                'op': 'ColorJitter',
                'brightness': 0.5,
                'saturation': 0.5,
                'contrast': 0.5,
                'hue': 0.1,
            },
        ],
    },
    {'type': 'ToTensorOCR'},
    {'type': 'NormalizeOCR', **img_norm_cfg},
    {
        'type': 'Collect',
        'keys': ['img'],
        'meta_keys': [
            'filename', 'ori_shape', 'img_shape', 'text', 'valid_ratio',
            'resize_shape',
        ],
    },
]

test_pipeline = [
    {'type': 'LoadImageFromFile'},
    # MultiRotateAugOCR with rotate_degrees 0, 90 and 270 wrapping the
    # resize / tensor / normalize / collect steps.
    {
        'type': 'MultiRotateAugOCR',
        'rotate_degrees': [0, 90, 270],
        'transforms': [
            {
                'type': 'ResizeOCR',
                'height': 32,
                'min_width': 128,
                'max_width': 128,
                'keep_aspect_ratio': False,
                'width_downsample_ratio': 0.25,
            },
            {'type': 'ToTensorOCR'},
            {'type': 'NormalizeOCR', **img_norm_cfg},
            {
                'type': 'Collect',
                'keys': ['img'],
                'meta_keys': [
                    'filename', 'ori_shape', 'img_shape', 'valid_ratio',
                    'resize_shape', 'img_norm_cfg', 'ori_filename',
                ],
            },
        ],
    },
]
configs/_base_/recog_pipelines/crnn_pipeline.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Single-channel mean/std handed to Normalize in both pipelines.
img_norm_cfg = {'mean': [127], 'std': [127]}

train_pipeline = [
    {'type': 'LoadImageFromFile', 'color_type': 'grayscale'},
    # Training resize: height 32, min and max width both 100, aspect ratio
    # not kept.
    {
        'type': 'ResizeOCR',
        'height': 32,
        'min_width': 100,
        'max_width': 100,
        'keep_aspect_ratio': False,
    },
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'DefaultFormatBundle'},
    {
        'type': 'Collect',
        'keys': ['img'],
        'meta_keys': ['filename', 'resize_shape', 'text', 'valid_ratio'],
    },
]

test_pipeline = [
    {'type': 'LoadImageFromFile', 'color_type': 'grayscale'},
    # Test resize: aspect ratio kept, min width 32, no max width.
    {
        'type': 'ResizeOCR',
        'height': 32,
        'min_width': 32,
        'max_width': None,
        'keep_aspect_ratio': True,
    },
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'DefaultFormatBundle'},
    {
        'type': 'Collect',
        'keys': ['img'],
        'meta_keys': [
            'filename', 'resize_shape', 'valid_ratio', 'img_norm_cfg',
            'ori_filename', 'img_shape', 'ori_shape',
        ],
    },
]
configs/_base_/recog_pipelines/crnn_tps_pipeline.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Single-channel mean/std handed to NormalizeOCR in both pipelines.
img_norm_cfg = {'mean': [0.5], 'std': [0.5]}

train_pipeline = [
    {'type': 'LoadImageFromFile', 'color_type': 'grayscale'},
    # Training resize: height 32, min and max width both 100, aspect ratio
    # not kept.
    {
        'type': 'ResizeOCR',
        'height': 32,
        'min_width': 100,
        'max_width': 100,
        'keep_aspect_ratio': False,
    },
    {'type': 'ToTensorOCR'},
    {'type': 'NormalizeOCR', **img_norm_cfg},
    {
        'type': 'Collect',
        'keys': ['img'],
        'meta_keys': [
            'filename', 'ori_shape', 'resize_shape', 'text', 'valid_ratio',
        ],
    },
]

test_pipeline = [
    {'type': 'LoadImageFromFile', 'color_type': 'grayscale'},
    # Test resize: min width 32, max width 100, aspect ratio not kept.
    {
        'type': 'ResizeOCR',
        'height': 32,
        'min_width': 32,
        'max_width': 100,
        'keep_aspect_ratio': False,
    },
    {'type': 'ToTensorOCR'},
    {'type': 'NormalizeOCR', **img_norm_cfg},
    {
        'type': 'Collect',
        'keys': ['img'],
        'meta_keys': [
            'filename', 'ori_shape', 'resize_shape', 'valid_ratio',
            'img_norm_cfg', 'ori_filename', 'img_shape',
        ],
    },
]
configs/_base_/recog_pipelines/nrtr_pipeline.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Per-channel mean/std handed to NormalizeOCR in both pipelines.
img_norm_cfg = {
    'mean': [0.485, 0.456, 0.406],
    'std': [0.229, 0.224, 0.225],
}

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    # Aspect ratio kept; width bounds are 32 and 160.
    {
        'type': 'ResizeOCR',
        'height': 32,
        'min_width': 32,
        'max_width': 160,
        'keep_aspect_ratio': True,
        'width_downsample_ratio': 0.25,
    },
    {'type': 'ToTensorOCR'},
    {'type': 'NormalizeOCR', **img_norm_cfg},
    {
        'type': 'Collect',
        'keys': ['img'],
        'meta_keys': [
            'filename', 'ori_shape', 'resize_shape', 'text', 'valid_ratio',
        ],
    },
]

test_pipeline = [
    {'type': 'LoadImageFromFile'},
    # NOTE(review): unlike the training resize, no width_downsample_ratio is
    # passed here - confirm this difference is intentional.
    {
        'type': 'ResizeOCR',
        'height': 32,
        'min_width': 32,
        'max_width': 160,
        'keep_aspect_ratio': True,
    },
    {'type': 'ToTensorOCR'},
    {'type': 'NormalizeOCR', **img_norm_cfg},
    {
        'type': 'Collect',
        'keys': ['img'],
        'meta_keys': [
            'filename', 'ori_shape', 'resize_shape', 'valid_ratio',
            'img_norm_cfg', 'ori_filename', 'img_shape',
        ],
    },
]
configs/_base_/recog_pipelines/sar_pipeline.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Per-channel mean/std handed to NormalizeOCR in both pipelines.
img_norm_cfg = {'mean': [0.5, 0.5, 0.5], 'std': [0.5, 0.5, 0.5]}

train_pipeline = [
    {'type': 'LoadImageFromFile'},
    # Height 48, aspect ratio kept; width bounds are 48 and 160.
    {
        'type': 'ResizeOCR',
        'height': 48,
        'min_width': 48,
        'max_width': 160,
        'keep_aspect_ratio': True,
        'width_downsample_ratio': 0.25,
    },
    {'type': 'ToTensorOCR'},
    {'type': 'NormalizeOCR', **img_norm_cfg},
    {
        'type': 'Collect',
        'keys': ['img'],
        'meta_keys': [
            'filename', 'ori_shape', 'resize_shape', 'text', 'valid_ratio',
        ],
    },
]

test_pipeline = [
    {'type': 'LoadImageFromFile'},
    # MultiRotateAugOCR with rotate_degrees 0, 90 and 270 wrapping the same
    # resize settings used for training.
    {
        'type': 'MultiRotateAugOCR',
        'rotate_degrees': [0, 90, 270],
        'transforms': [
            {
                'type': 'ResizeOCR',
                'height': 48,
                'min_width': 48,
                'max_width': 160,
                'keep_aspect_ratio': True,
                'width_downsample_ratio': 0.25,
            },
            {'type': 'ToTensorOCR'},
            {'type': 'NormalizeOCR', **img_norm_cfg},
            {
                'type': 'Collect',
                'keys': ['img'],
                'meta_keys': [
                    'filename', 'ori_shape', 'resize_shape', 'valid_ratio',
                    'img_norm_cfg', 'ori_filename', 'img_shape',
                ],
            },
        ],
    },
]