NguyenPhong2612 commited on
Commit
ab576ba
·
1 Parent(s): 4185961

first commit

Browse files
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ /Test.ipynb/
2
+ /Test image/
3
+ /parseq/__pycache__/
4
+ /wpodnet/__pycache__/
5
+ /wpodnet/__init__/
6
+ /__init__/
7
+ /flagged/
8
+ /weights/
Test.ipynb ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 89,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import torch \n",
10
+ "import torch.nn as nn\n",
11
+ "from parseq.system import System\n",
12
+ "import yaml\n",
13
+ "import cv2\n",
14
+ "from parseq.augmentation import trans\n",
15
+ "import PIL\n",
16
+ "import imgaug\n",
17
+ "import torchvision\n",
18
+ "from wpodnet.lib_detection import load_model_wpod, detect_lp\n",
19
+ "import numpy as np\n",
20
+ "import gradio as gr \n",
21
+ "import tensorflow as tf\n",
22
+ "from tensorflow import keras\n",
23
+ "import timm\n",
24
+ "import pytorch_lightning as pl"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 91,
30
+ "metadata": {},
31
+ "outputs": [
32
+ {
33
+ "name": "stdout",
34
+ "output_type": "stream",
35
+ "text": [
36
+ "tensorflow==2.13.1\n",
37
+ "torch==2.4.1+cu118\n",
38
+ "gradio==4.44.1\n",
39
+ "timm==1.0.9\n",
40
+ "PIL==10.2.0\n",
41
+ "imgaug==0.4.0\n",
42
+ "opencv-python==4.10.0\n",
43
+ "torchvision==0.19.1+cu118\n"
44
+ ]
45
+ }
46
+ ],
47
+ "source": [
48
+ "print(f'tensorflow=={tf.__version__}')\n",
49
+ "print(f'torch=={torch.__version__}')\n",
50
+ "print(f'gradio=={gr.__version__}')\n",
51
+ "print(f'timm=={timm.__version__}')\n",
52
+ "print(f'PIL=={PIL.__version__}')\n",
53
+ "print(f'imgaug=={imgaug.__version__}')\n",
54
+ "print(f'opencv-python=={cv2.__version__}')\n",
55
+ "print(f'torchvision=={torchvision.__version__}')"
56
+ ]
57
+ }
58
+ ],
59
+ "metadata": {
60
+ "kernelspec": {
61
+ "display_name": "virtual",
62
+ "language": "python",
63
+ "name": "python3"
64
+ },
65
+ "language_info": {
66
+ "codemirror_mode": {
67
+ "name": "ipython",
68
+ "version": 3
69
+ },
70
+ "file_extension": ".py",
71
+ "mimetype": "text/x-python",
72
+ "name": "python",
73
+ "nbconvert_exporter": "python",
74
+ "pygments_lexer": "ipython3",
75
+ "version": "3.10.0"
76
+ }
77
+ },
78
+ "nbformat": 4,
79
+ "nbformat_minor": 2
80
+ }
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from parseq.system import System
4
+ import yaml
5
+ import cv2
6
+ from parseq.augmentation import trans
7
+ from PIL import Image
8
+ from wpodnet.lib_detection import load_model_wpod, detect_lp
9
+ import numpy as np
10
+ import gradio as gr
11
+
12
+
13
+
14
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

checkpoint_path = 'weights/parseq.ckpt'
config_path = 'parseq/config.yaml'
wpod_path = 'weights/wpod-net.h5'

# Plate detector, loaded once at start-up and reused by every request.
wpod_net = load_model_wpod(wpod_path)

with open(config_path, 'r') as data:
    config = yaml.safe_load(data)

system = System(config)
# Map the checkpoint onto the selected device; a hard-coded 'cuda' target
# makes torch.load fail on CPU-only machines.
checkpoint = torch.load(checkpoint_path, map_location=device)
system.load_state_dict(checkpoint['state_dict'])
system.to(device)
# Modules start in training mode; switch to eval so the dropout layers are
# disabled while serving predictions.
system.eval()
28
+
29
def predict(image):
    """Detect a license plate in ``image`` and return the recognized text.

    Args:
        image: Either a path to an image file or an already-decoded image
            array, passed straight to ``detect_lp``.

    Returns:
        The decoded string for the first plate crop returned by ``detect_lp``.

    Raises:
        ValueError: If no image is given, the path cannot be read, or
            ``detect_lp`` returns no plate crops.
    """
    if image is None:
        raise ValueError("No image provided")
    if isinstance(image, str):
        path = image
        # cv2.imread signals failure by returning None instead of raising.
        image = cv2.imread(path)
        if image is None:
            raise ValueError(f"Could not read image file: {path}")
    # NOTE(review): cv2.imread yields BGR while an array coming from the UI is
    # presumably RGB -- confirm which channel order detect_lp expects.
    _, warped_plates, _, _ = detect_lp(wpod_net, image, 0.5)
    # assumes detect_lp returns a sized sequence of crops -- TODO confirm
    if len(warped_plates) == 0:
        raise ValueError("No license plate detected in the image")
    # The crop is scaled by 255 before the uint8 cast, i.e. it is treated as
    # holding values in [0, 1].
    plate = (warped_plates[0] * 255).astype(np.uint8)
    plate = Image.fromarray(plate).convert("RGB")
    batch = trans(plate).unsqueeze(0)
    with torch.no_grad():
        pred = system(batch).softmax(-1)
        generated_text, _ = system.tokenizer.decode(pred)
    return generated_text[0]
40
+
41
# Gradio front-end: a single image input wired to `predict`, with the
# recognized plate shown in a two-line text box.
image_input = gr.components.Image()
plate_output = gr.components.Textbox(label="License plate", lines=2)
interface = gr.Interface(fn=predict, inputs=[image_input], outputs=[plate_output])
interface.launch(share=True, debug=True)
parseq/augmentation.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image, ImageFilter
2
+ from timm.data.auto_augment import _LEVEL_DENOM, LEVEL_TO_ARG, NAME_TO_OP, _randomly_negate, rotate
3
+ from functools import partial
4
+ from timm.data import auto_augment
5
+ import imgaug.augmenters as iaa
6
+ from torchvision import transforms as T
7
+ import numpy as np
8
+
9
# Output size handed to T.Resize when the `trans` pipeline is built.
image_size = [224, 224]
10
+
11
def rotate_expand(img, degrees, **kwargs):
    """Call timm's ``rotate`` with ``expand=True`` forced into the keyword arguments."""
    options = dict(kwargs, expand=True)
    return rotate(img, degrees, **options)
14
+
15
+
16
def _level_to_arg(level, hparams, key, default):
    """Scale ``level`` by the magnitude stored under ``key`` and randomly flip its sign.

    The magnitude is ``hparams[key]`` when present, otherwise ``default``.
    Returns a one-element tuple.
    """
    scaled = (level / _LEVEL_DENOM) * hparams.get(key, default)
    return (_randomly_negate(scaled),)
21
+
22
+
23
def apply():
    """Patch timm's op tables: expanding Rotate plus hparam-driven level mappers."""
    NAME_TO_OP['Rotate'] = rotate_expand
    # (op name, hparams key, fallback magnitude)
    overrides = (
        ('Rotate', 'rotate_deg', 30.0),
        ('ShearX', 'shear_x_pct', 0.3),
        ('ShearY', 'shear_y_pct', 0.3),
        ('TranslateXRel', 'translate_x_pct', 0.45),
        ('TranslateYRel', 'translate_y_pct', 0.45),
    )
    LEVEL_TO_ARG.update({
        op_name: partial(_level_to_arg, key=hparam_key, default=fallback)
        for op_name, hparam_key, fallback in overrides
    })


apply()
36
+
37
+ _OP_CACHE = {}
38
+
39
+ def _get_op(key, factory):
40
+ try:
41
+ op = _OP_CACHE[key]
42
+ except KeyError:
43
+ op = factory()
44
+ _OP_CACHE[key] = op
45
+ return op
46
+
47
+
48
+ def _get_param(level, img, max_dim_factor, min_level=1):
49
+ max_level = max(min_level, max_dim_factor * max(img.size))
50
+ return round(min(level, max_level))
51
+
52
+
53
def gaussian_blur(img, radius, **__):
    """Apply a PIL Gaussian blur to ``img`` with a size-capped, rounded radius."""
    radius = _get_param(radius, img, 0.02)
    blur = _get_op(f'gaussian_blur_{radius}', lambda: ImageFilter.GaussianBlur(radius))
    return img.filter(blur)
58
+
59
+
60
def motion_blur(img, k, **__):
    """Apply imgaug ``MotionBlur`` to ``img`` and return the result as a PIL image."""
    # `| 1` sets the lowest bit so the capped value is always odd.
    k = _get_param(k, img, 0.08, 3) | 1
    blur = _get_op(f'motion_blur_{k}', lambda: iaa.MotionBlur(k))
    pixels = np.asarray(img)
    return Image.fromarray(blur(image=pixels))
65
+
66
+
67
def gaussian_noise(img, scale, **_):
    """Add imgaug ``AdditiveGaussianNoise`` to ``img`` and return a PIL image."""
    # `| 1` sets the lowest bit so the capped value is always odd.
    scale = _get_param(scale, img, 0.25) | 1
    noise = _get_op(f'gaussian_noise_{scale}', lambda: iaa.AdditiveGaussianNoise(scale=scale))
    pixels = np.asarray(img)
    return Image.fromarray(noise(image=pixels))
72
+
73
+
74
def poisson_noise(img, lam, **_):
    """Add imgaug ``AdditivePoissonNoise`` to ``img`` and return a PIL image."""
    # `| 1` sets the lowest bit so the capped value is always odd.
    lam = _get_param(lam, img, 0.2) | 1
    noise = _get_op(f'poisson_noise_{lam}', lambda: iaa.AdditivePoissonNoise(lam))
    pixels = np.asarray(img)
    return Image.fromarray(noise(image=pixels))
79
+
80
+
81
def _level_to_arg(level, _hparams, max):
    """Map ``level`` linearly onto ``[0, max]`` and return it as a one-element tuple.

    ``max`` keeps its name because it is bound by keyword through ``partial``.
    """
    return (max * level / auto_augment._LEVEL_DENOM,)
84
+
85
+
86
# Start from timm's "increasing" op list, drop sharpening and add two of the
# custom ops registered below.
_RAND_TRANSFORMS = list(auto_augment._RAND_INCREASING_TRANSFORMS)
_RAND_TRANSFORMS.remove('SharpnessIncreasing')  # remove, interferes with *blur ops
_RAND_TRANSFORMS += ['GaussianBlur', 'PoissonNoise']

# Level-to-argument mappers for the custom ops: op name -> upper bound.
auto_augment.LEVEL_TO_ARG.update({
    op_name: partial(_level_to_arg, max=upper)
    for op_name, upper in (
        ('GaussianBlur', 4),
        ('MotionBlur', 20),
        ('GaussianNoise', 0.1 * 255),
        ('PoissonNoise', 40),
    )
})

# Implementations of the custom ops.
auto_augment.NAME_TO_OP.update(
    GaussianBlur=gaussian_blur,
    MotionBlur=motion_blur,
    GaussianNoise=gaussian_noise,
    PoissonNoise=poisson_noise,
)
104
+
105
+
106
def rand_augment_transform(magnitude=5, num_layers=3):
    """Build a timm ``RandAugment`` over ``_RAND_TRANSFORMS`` with uniform op weights.

    Args:
        magnitude: Magnitude passed to ``rand_augment_ops``.
        num_layers: Layer count passed to ``RandAugment``.
    """
    hparams = dict(
        rotate_deg=30,
        shear_x_pct=0.9,
        shear_y_pct=0.2,
        translate_x_pct=0.10,
        translate_y_pct=0.30,
    )
    ops = auto_augment.rand_augment_ops(magnitude, hparams=hparams, transforms=_RAND_TRANSFORMS)
    # Every op is equally likely to be picked.
    uniform_weights = [1.0 / len(ops) for _ in ops]
    return auto_augment.RandAugment(ops, num_layers, uniform_weights)
117
+
118
+
119
+
120
# Image -> tensor pipeline: random augmentation, a zero-degree expanding
# rotate, bicubic resize to `image_size`, tensor conversion and normalization
# with mean 0.5 / std 0.5.
# NOTE(review): the random augmentation runs on every call, including when
# this pipeline is reused for inference -- confirm that is intended.
trans = T.Compose([
    rand_augment_transform(),
    lambda img: img.rotate(0, expand=True),
    T.Resize(image_size, T.InterpolationMode.BICUBIC),
    T.ToTensor(),
    T.Normalize(0.5, 0.5),
])
parseq/config.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ image_size : [224, 224]
3
+ patch_size : [16, 16]
4
+ max_len : 25
5
+ d_model : 384
6
+ enc_num_heads : 6
7
+ enc_mlp_ratio : 4
8
+ enc_depth : 12
9
+ dec_num_heads : 12
10
+ dec_mlp_ratio : 4
11
+ dec_depth : 1
12
+ perm_num : 8
13
+ perm_forward : true
14
+ perm_mirrored : true
15
+ decode_ar : true
16
+ refine_iter : 2
17
+ num_tokens : 97
18
+ pretrained : false
19
+ train_charset : 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~
20
+ weight_url : https://github.com/baudm/parseq/releases/download/v1.0.0/parseq_small_patch16_224-fcf06f5a.pt
21
+ trainer:
22
+ lr : 3e-4
23
+ batch_size : 4
24
+ weight_decay : 0.0
25
+ warm_pct : 0.075
parseq/module.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from timm.models.vision_transformer import PatchEmbed, VisionTransformer
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import math
6
+
7
# Device that Encoder.forward moves its input to.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class Encoder(VisionTransformer):
    """timm ``VisionTransformer`` with no classification head, pooling or class token."""

    def __init__(
            self,
            image_size=224,
            patch_size=16,
            in_chans=3,
            embed_dim=768,
            depth=12,
            num_heads=12,
            mlp_ratio=4.0,
            qkv_bias=True,
            drop_rate=0.0,
            attn_drop_rate=0.0,
            drop_path_rate=0.0,
            embed_layer=PatchEmbed):
        vit_options = dict(
            embed_dim=embed_dim,
            depth=depth,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            drop_rate=drop_rate,
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=drop_path_rate,
            embed_layer=embed_layer,
            # Fixed settings: no classes, no global pooling, no class token.
            num_classes=0,
            global_pool='',
            class_token=False,
        )
        super().__init__(image_size, patch_size, in_chans, **vit_options)

    def forward(self, x):
        """Move ``x`` to the module-level ``device`` and return ``forward_features(x)``."""
        x = x.to(device)
        return self.forward_features(x)
46
+
47
class DecoderLayer(nn.Module):
    """Decoder layer with separate query and content streams.

    Each stream runs self-attention, cross-attention over ``memory`` and a
    GELU feed-forward block, each wrapped in a residual connection.
    """

    def __init__(self, config):
        super().__init__()
        model_cfg = config['model']
        self.d_model = model_cfg['d_model']
        self.dec_num_heads = model_cfg['dec_num_heads']
        self.d_ff = model_cfg['dec_mlp_ratio'] * self.d_model
        self.eps = 1e-5

        def make_attention():
            return nn.MultiheadAttention(self.d_model, self.dec_num_heads, dropout=0.1, batch_first=True)

        def make_norm():
            return nn.LayerNorm(self.d_model, eps=self.eps)

        self.self_attn = make_attention()
        self.cross_attn = make_attention()

        # Feed-forward block: d_model -> d_ff -> d_model.
        self.linear1 = nn.Linear(self.d_model, self.d_ff)
        self.dropout = nn.Dropout(p=0.1)
        self.linear2 = nn.Linear(self.d_ff, self.d_model)

        self.norm1 = make_norm()
        self.norm2 = make_norm()
        self.norm_q = make_norm()
        self.norm_c = make_norm()
        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.1)
        self.dropout3 = nn.Dropout(p=0.1)

    def forward_stream(
            self,
            tgt,
            tgt_norm,
            tgt_kv,
            memory,
            tgt_mask,
            tgt_key_padding_mask):
        """Run one stream and return ``(tgt, self_attn_weights, cross_attn_weights)``.

        ``tgt_norm`` is the self-attention query; ``tgt_kv`` supplies its keys
        and values.
        """
        attended, sa_weights = self.self_attn(
            tgt_norm, tgt_kv, tgt_kv, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
        )
        tgt = tgt + self.dropout1(attended)

        attended, ca_weights = self.cross_attn(self.norm1(tgt), memory, memory)
        tgt = tgt + self.dropout2(attended)

        hidden = F.gelu(self.linear1(self.norm2(tgt)))
        projected = self.linear2(self.dropout(hidden))
        tgt = tgt + self.dropout3(projected)
        return tgt, sa_weights, ca_weights

    def forward(
            self,
            query,
            content,
            memory,
            query_mask=None,
            content_mask=None,
            content_key_padding_mask=None,
            update_content: bool = True):
        """Update the query stream and, when ``update_content`` is set, the content stream.

        Both streams take their keys and values from the normalized content.
        Returns ``(query, content)``; ``content`` is returned untouched when
        ``update_content`` is false.
        """
        query_norm = self.norm_q(query)
        content_norm = self.norm_c(content)
        query, _, _ = self.forward_stream(
            query, query_norm, content_norm, memory, query_mask, content_key_padding_mask
        )
        if not update_content:
            return query, content
        content, _, _ = self.forward_stream(
            content, content_norm, content_norm, memory, content_mask, content_key_padding_mask
        )
        return query, content
110
+
111
+
112
+
113
class Decoder(nn.Module):
    """Stack of ``DecoderLayer`` modules followed by a final ``LayerNorm`` on the query."""

    __constants__ = ['norm']

    def __init__(self, config):
        super().__init__()
        model_cfg = config['model']
        self.d_model = model_cfg['d_model']
        self.num_layers = model_cfg['dec_depth']
        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(self.num_layers)])
        self.norm = nn.LayerNorm(self.d_model)

    def forward(self, query, content, memory, query_mask = None, content_mask = None, content_key_padding_mask = None):
        """Run every layer in order and return the normalized query stream."""
        final_index = len(self.layers) - 1
        for index, layer in enumerate(self.layers):
            # The content stream is not updated in the last layer; only the
            # query is returned afterwards.
            query, content = layer(
                query,
                content,
                memory,
                query_mask,
                content_mask,
                content_key_padding_mask,
                update_content=index != final_index,
            )
        return self.norm(query)
130
+
131
+
132
class TokenEmbedding(nn.Module):
    """Embedding table whose output is scaled by ``sqrt(d_model)``."""

    def __init__(self, config):
        super().__init__()
        model_cfg = config['model']
        self.num_tokens = model_cfg['num_tokens']
        self.d_model = model_cfg['d_model']
        self.embedding = nn.Embedding(self.num_tokens, self.d_model)

    def forward(self, tokens):
        """Look up ``tokens`` and multiply the embeddings by ``sqrt(d_model)``."""
        scale = math.sqrt(self.d_model)
        return scale * self.embedding(tokens)
parseq/system.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from timm.models.helpers import named_apply
4
+ from functools import partial
5
+ from .module import Encoder, Decoder, TokenEmbedding
6
+ from .utils import init_weights
7
+ import pytorch_lightning as pl
8
+ from .utils import Tokenizer, CharsetAdapter
9
+ import numpy as np
10
+ import math
11
+ from torch.optim import Optimizer
12
+ from timm.optim import create_optimizer_v2
13
+ from torch.optim.lr_scheduler import OneCycleLR
14
+ from itertools import permutations
15
+ import torch.nn.functional as F
16
+
17
# Default device for PARSeq instances: GPU when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class PARSeq(nn.Module):
    """Image encoder plus decoder that produces per-position token logits.

    All sizes are read from ``config['model']``. ``device`` is stored as
    ``_device`` and is where inputs and helper tensors are placed.
    """

    def __init__(self, config, device = device):
        super().__init__()
        model_cfg = config['model']

        self.max_len = model_cfg['max_len']
        self.decode_ar = model_cfg['decode_ar']
        self.refine_iters = model_cfg['refine_iter']
        self.embed_dim = model_cfg['d_model']
        self.num_tokens = model_cfg['num_tokens']
        self.encoder = Encoder(
            model_cfg['image_size'],
            model_cfg['patch_size'],
            embed_dim=model_cfg['d_model'],
            depth=model_cfg['enc_depth'],
            num_heads=model_cfg['enc_num_heads'],
            mlp_ratio=model_cfg['enc_mlp_ratio'],
        )
        self.decoder = Decoder(config)
        self.text_embed = TokenEmbedding(config)
        # The head emits two classes fewer than the token count.
        self.head = nn.Linear(self.embed_dim, self.num_tokens - 2)

        # One positional query per output step (max_len + 1 steps); the values
        # are filled in by trunc_normal_ below.
        self.pos_queries = nn.Parameter(torch.empty(1, self.max_len + 1, self.embed_dim))
        self.dropout = nn.Dropout(0.1)
        named_apply(partial(init_weights, exclude=['encoder']), self)
        nn.init.trunc_normal_(self.pos_queries, std = 0.02)
        self._device = device

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the parameter names to exclude from weight decay."""
        param_names = {'text_embed.embedding.weight', 'pos_queries'}
        enc_param_names = {'encoder.' + n for n in self.encoder.no_weight_decay()}
        return param_names.union(enc_param_names)

    def encode(self, img: torch.Tensor):
        """Move ``img`` to this model's device and run the encoder on it."""
        return self.encoder(img.to(self._device))

    def decode(
            self,
            tgt: torch.Tensor,
            memory: torch.Tensor,
            tgt_mask = None,
            tgt_padding_mask = None,
            tgt_query = None,
            tgt_query_mask = None):
        """Run the decoder for token ids ``tgt`` against the encoder output ``memory``.

        The first token is embedded without a positional query; the remaining
        tokens get ``pos_queries`` added. When ``tgt_query`` is omitted, the
        first ``L`` positional queries are used.
        """
        N, L = tgt.shape
        null_ctx = self.text_embed(tgt[:, :1])
        tgt_emb = self.pos_queries[:, : L - 1] + self.text_embed(tgt[:, 1:])
        tgt_emb = self.dropout(torch.cat([null_ctx, tgt_emb], dim=1))
        if tgt_query is None:
            tgt_query = self.pos_queries[:, :L].expand(N, -1, -1)
        tgt_query = self.dropout(tgt_query)
        return self.decoder(tgt_query, tgt_emb, memory, tgt_query_mask, tgt_mask, tgt_padding_mask)

    def forward(self, tokenizer: Tokenizer, images, max_length):
        """Return token logits for ``images``.

        Args:
            tokenizer: Supplies ``sos_id``, ``eos_id`` and ``pad_id``.
            images: Batched image tensor; the first dimension is the batch.
            max_length: Number of characters to decode, capped at
                ``self.max_len``. ``None`` means "use ``self.max_len``" and
                also lets step-by-step decoding stop early once every
                sequence contains an EOS token.
        """
        testing = max_length is None
        max_length = self.max_len if max_length is None else min(max_length, self.max_len)
        bs = images.shape[0]
        num_steps = max_length + 1
        # Use the instance device here so a model built with an explicit
        # `device` argument does not fall back to the module-level default.
        memory = self.encode(images).to(self._device)
        pos_queries = self.pos_queries[:, :num_steps].expand(bs, -1, -1)

        # Upper-triangular mask (True = blocked). tgt_mask and query_mask are
        # the SAME tensor object.
        tgt_mask = query_mask = torch.triu(
            torch.ones((num_steps, num_steps), dtype=torch.bool, device=self._device), 1)

        if self.decode_ar:
            # Step-by-step decoding: feed each predicted token back as input.
            tgt_in = torch.full((bs, num_steps), tokenizer.pad_id, dtype=torch.long, device=self._device)
            tgt_in[:, 0] = tokenizer.sos_id

            logits = []
            for i in range(num_steps):
                j = i + 1
                tgt_out = self.decode(
                    tgt_in[:, :j],
                    memory,
                    tgt_mask[:j, :j],
                    tgt_query = pos_queries[:, i:j],
                    tgt_query_mask = query_mask[i:j, :j],)

                p_i = self.head(tgt_out)
                logits.append(p_i)
                if j < num_steps:
                    # p_i holds a single step, so drop only that axis.
                    tgt_in[:, j] = p_i.squeeze(1).argmax(-1)
                    if testing and (tgt_in == tokenizer.eos_id).any(dim=-1).all():
                        break

            logits = torch.cat(logits, dim=1)
        else:
            # Single pass: all positions are decoded from the SOS token alone.
            tgt_in = torch.full((bs, 1), tokenizer.sos_id, dtype=torch.long, device=self._device)
            tgt_out = self.decode(tgt_in, memory, tgt_query=pos_queries)
            logits = self.head(tgt_out)

        if self.refine_iters:
            # Unblock everything two or more positions to the right. Because
            # query_mask aliases tgt_mask, this in-place write changes both.
            query_mask[torch.triu(torch.ones(num_steps, num_steps, dtype=torch.bool, device=self._device), 2)] = 0
            bos = torch.full((bs, 1), tokenizer.sos_id, dtype=torch.long, device=self._device)
            for _ in range(self.refine_iters):
                # The previous prediction, shifted right by SOS, is the new input.
                tgt_in = torch.cat([bos, logits[:, :-1].argmax(-1)], dim=1)
                # Mask the first EOS token and everything after it.
                tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(-1) > 0
                tgt_out = self.decode(
                    tgt_in, memory, tgt_mask, tgt_padding_mask, pos_queries, query_mask[:, : tgt_in.shape[1]])
                logits = self.head(tgt_out)

        return logits
115
+
116
+
117
+
118
class System(pl.LightningModule):
    """Lightning module that wraps ``PARSeq`` with a tokenizer, losses and metrics.

    Model settings are read from ``config['model']`` and optimizer settings
    from ``config['trainer']``.
    """

    def __init__( self, config):
        super().__init__()
        self.save_hyperparameters()
        model_cfg = config['model']
        trainer_cfg = config['trainer']

        self.max_len = int(model_cfg['max_len'])
        self.charset_adapter = CharsetAdapter()
        self.charset = model_cfg['train_charset']
        self.lr = float(trainer_cfg['lr'])
        self.batch_size = trainer_cfg['batch_size']
        self.warm_pct = float(trainer_cfg['warm_pct'])
        self.weight_decay = float(trainer_cfg['weight_decay'])
        self.tokenizer = Tokenizer(self.charset, self.max_len)
        self.sos_id = self.tokenizer.sos_id
        self.eos_id = self.tokenizer.eos_id
        self.pad_id = self.tokenizer.pad_id

        self.model = PARSeq(config)
        self.rng = np.random.default_rng()
        # With mirrored permutations each generated one also yields its
        # reverse, so only half as many are generated.
        self.max_gen_perms = model_cfg['perm_num'] // 2 if model_cfg['perm_mirrored'] else model_cfg['perm_num']
        self.perm_forward = model_cfg['perm_forward']
        self.perm_mirrored = model_cfg['perm_mirrored']
        if model_cfg['pretrained']:
            self.weight_ulr = model_cfg['weight_url']
            self.load_weight(self.weight_ulr)
        self.set_seed()

    def set_seed(self, seed = 42):
        """Seed torch and NumPy and switch cuDNN to deterministic mode when CUDA is present."""
        torch.manual_seed(seed)
        np.random.seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

    def configure_optimizers(self):
        """Create an AdamW optimizer and a per-step ``OneCycleLR`` schedule.

        The base learning rate is scaled by gradient accumulation, the square
        root of the device count and ``batch_size / 256``.
        """
        agb = self.trainer.accumulate_grad_batches
        lr_scale = agb * math.sqrt(self.trainer.num_devices) * self.batch_size / 256.0
        lr = float(lr_scale) * float(self.lr)
        optim = create_optimizer_v2(self, 'adamw', lr, self.weight_decay)
        sched = OneCycleLR(
            optim, lr, self.trainer.estimated_stepping_batches, pct_start=self.warm_pct, cycle_momentum=False
        )
        return {'optimizer': optim, 'lr_scheduler': {'scheduler': sched, 'interval': 'step'}}

    def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer) -> None:
        """Clear gradients by setting them to ``None``."""
        optimizer.zero_grad(set_to_none=True)

    def forward(self, images, max_length = None):
        """Return logits from the wrapped model for ``images``."""
        return self.model.forward(self.tokenizer, images, max_length)

    def gen_tgt_perms(self, tgt):
        """Generate position permutations for the target batch ``tgt``.

        Each returned row starts with index 0 and ends with
        ``max_num_chars + 1``. The number of rows is bounded by
        ``self.max_gen_perms`` (doubled when mirroring is enabled).
        """
        # The first and last columns of tgt are not permuted.
        max_num_chars = tgt.shape[1] - 2
        if max_num_chars == 1:
            return torch.arange(3, device=self.device).unsqueeze(0)
        perms = [torch.arange(max_num_chars, device=self.device)] if self.perm_forward else []
        max_perms = math.factorial(max_num_chars)
        if self.perm_mirrored:
            max_perms //= 2
        num_gen_perms = min(self.max_gen_perms, max_perms)

        if max_num_chars < 5:
            # Few enough permutations to enumerate them all and sample from the pool.
            if max_num_chars == 4 and self.perm_mirrored:
                selector = [0, 3, 4, 6, 9, 10, 12, 16, 17, 18, 19, 21]
            else:
                selector = list(range(max_perms))
            perm_pool = torch.as_tensor(
                list(permutations(range(max_num_chars), max_num_chars)),
                device=self.device,
            )[selector]
            if self.perm_forward:
                # The first pool entry is the forward order, already in `perms`.
                perm_pool = perm_pool[1:]
            perms = torch.stack(perms)
            if len(perm_pool):
                i = self.rng.choice(len(perm_pool), size=num_gen_perms - len(perms), replace=False)
                perms = torch.cat([perms, perm_pool[i]])
        else:
            perms.extend(
                [torch.randperm(max_num_chars, device = self.device) for _ in range(num_gen_perms - len(perms))]
            )
            perms = torch.stack(perms)
        if self.perm_mirrored:
            # Interleave each permutation with its reverse.
            comp = perms.flip(-1)
            perms = torch.stack([perms, comp]).transpose(0, 1).reshape(-1, max_num_chars)
        sos_idx = perms.new_zeros((len(perms), 1))
        eos_idx = perms.new_full((len(perms), 1), max_num_chars + 1)
        perms = torch.cat([sos_idx, perms + 1, eos_idx], dim=1)
        if len(perms) > 1:
            # The second row is overwritten with a strictly descending order.
            perms[1, 1:] = max_num_chars + 1 - torch.arange(max_num_chars + 1, device=self.device)
        return perms

    def generate_attn_masks(self, perm):
        """Build ``(content_mask, query_mask)`` for one permutation (True = blocked)."""
        sz = perm.shape[0]
        mask = torch.zeros((sz, sz), dtype=torch.bool, device=self.device)
        for i in range(sz):
            query_idx = perm[i]
            # Everything that comes later in the permutation is hidden.
            masked_keys = perm[i + 1 :]
            mask[query_idx, masked_keys] = True
        content_mask = mask[:-1, :-1].clone()
        mask[torch.eye(sz, dtype=torch.bool, device=self.device)] = True  # mask "self"
        query_mask = mask[1:, :-1]
        return content_mask, query_mask

    def training_step(self, batch, batch_idx):
        """Compute the permutation-averaged loss and log loss and accuracy."""
        images, labels = batch
        images = images.to(self.device)
        tgt = labels.to(self.device)

        memory = self.model.encode(images)

        tgt_perms = self.gen_tgt_perms(tgt)
        tgt_in = tgt[:, :-1]
        tgt_out = tgt[:, 1:]
        tgt_padding_mask = (tgt_in == self.pad_id) | (tgt_in == self.eos_id)

        loss = 0
        loss_numel = 0
        n = (tgt_out != self.pad_id).sum().item()
        for i, perm in enumerate(tgt_perms):
            tgt_mask, query_mask = self.generate_attn_masks(perm)
            out = self.model.decode(tgt_in, memory, tgt_mask, tgt_padding_mask, tgt_query_mask=query_mask)
            logits = self.model.head(out).flatten(end_dim=1)
            # Weight each mean loss by its token count for a token-level average.
            loss += n * F.cross_entropy(logits, tgt_out.flatten(), ignore_index=self.pad_id)
            loss_numel += n
            if i == 1:
                # From the third permutation on, EOS targets are ignored.
                tgt_out = torch.where(tgt_out == self.eos_id, self.pad_id, tgt_out)
                n = (tgt_out != self.pad_id).sum().item()

        loss /= loss_numel

        # Measure accuracy in eval mode, then restore the previous mode so the
        # remaining batches still train with dropout enabled.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                logits, _, _ = self.forward_logits_loss(images, tgt)
        finally:
            self.train(was_training)
        predicted_labels, _ = self.tokenizer.decode(logits.softmax(-1))
        predicted_labels = [self.charset_adapter(label) for label in predicted_labels]
        true_labels = self.decode(labels)
        count = sum(truth == guess for truth, guess in zip(true_labels, predicted_labels))
        train_acc = float(count / len(true_labels))
        self.log("train_loss", loss, on_epoch = True, prog_bar = True, logger = True)
        self.log("train_acc", train_acc, on_epoch = True, prog_bar = True, logger = True)
        return loss

    def forward_logits_loss(self, images, targets: torch.Tensor):
        """Return ``(logits, loss, loss_numel)`` for ``images`` against ``targets``.

        ``targets`` is a 2-D tensor of token ids whose first column is dropped.
        ``loss`` is the mean cross-entropy over non-pad targets and
        ``loss_numel`` is the number of those targets.
        """
        targets = targets[:, 1:]
        max_len = targets.shape[1] - 1
        logits = self.forward(images, max_len)
        loss = F.cross_entropy(logits.flatten(end_dim = 1), targets.flatten(), ignore_index = self.pad_id)
        loss_numel = (targets != self.pad_id).sum()
        return logits, loss, loss_numel

    def validation_step(self, batch, batch_idx):
        """Log validation loss and exact-match accuracy for one batch."""
        self.eval()
        images, labels = batch
        with torch.no_grad():
            logits, loss, _ = self.forward_logits_loss(images, labels)
        predicted_labels, _ = self.tokenizer.decode(logits.softmax(-1))
        predicted_labels = [self.charset_adapter(label) for label in predicted_labels]
        true_labels = self.decode(labels)
        count = sum(truth == guess for truth, guess in zip(true_labels, predicted_labels))
        val_acc = float(count / len(true_labels))
        # `loss` is already averaged over the non-pad tokens; dividing it by
        # the token count again would shrink it with sequence length.
        self.log("val_loss", loss, on_epoch = True, prog_bar = True, logger = True)
        self.log("val_acc", val_acc, on_epoch = True, prog_bar = True, logger = True)

    def on_train_epoch_end(self):
        """Log ``combined_acc`` and print an epoch summary when all metrics exist."""
        metrics = self.trainer.callback_metrics
        required = ("train_loss", "train_acc", "val_loss", "val_acc")
        if any(name not in metrics for name in required):
            # Validation may not have run for this epoch; skip the summary
            # rather than fail on a missing key.
            return
        train_loss = metrics["train_loss"].item()
        train_acc = metrics["train_acc"].item()
        val_loss = metrics["val_loss"].item()
        val_acc = metrics["val_acc"].item()
        combined_acc = val_acc + 1e-1 * train_acc
        self.log("combined_acc", combined_acc, prog_bar = False, logger = True)
        print(f"Epoch {self.current_epoch}: train_loss = {train_loss:.3f}, train_acc = {train_acc:.3f}, val_loss = {val_loss:.3f}, val_acc = {val_acc:.3f}")

    def load_weight(self, url):
        """Download a state dict from ``url`` and load it into the wrapped model."""
        # Map onto the module-level device so CPU-only hosts can load it too.
        state_dict = torch.hub.load_state_dict_from_url(url = url, map_location = device, check_hash = True)
        self.model.load_state_dict(state_dict)
        print("Load weights sucessfully !!!")

    def decode(self, ids):
        """Turn a batch of token-id rows into charset-adapted label strings."""
        if isinstance(ids, torch.Tensor):
            ids = ids.tolist()
        return [self.charset_adapter(self.tokenizer._ids2tok(label)) for label in ids]
parseq/utils.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import torch
3
+ from torch import Tensor
4
+ import torch.nn as nn
5
+ from typing import Sequence
6
class CharsetAdapter:
    """Normalize labels to upper-case characters from ``0-9A-Z``."""

    def __init__(self):
        super().__init__()
        self.charset = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        # Matches any single character outside the charset.
        self.unsupported = re.compile(f'[^{re.escape(self.charset)}]')

    def __call__(self, label):
        """Upper-case ``label`` and strip every character not in the charset."""
        return self.unsupported.sub('', label.upper())
17
+
18
+
19
class Vocab:
    """Two-way mapping between characters (plus special tokens) and integer ids.

    ``<EOS>`` is id 0, the distinct characters of ``charset`` follow in order
    of first appearance, then ``<SOS>`` and ``<PAD>`` take the last two ids.
    """

    def __init__(self, charset):
        self.c2i = {'<EOS>': 0}
        for char in charset:
            # Repeated characters keep the id they got first.
            self.c2i.setdefault(char, len(self.c2i))
        for special in ('<SOS>', '<PAD>'):
            self.c2i[special] = len(self.c2i)

        self.i2c = {index: char for char, index in self.c2i.items()}

    def __len__(self):
        return len(self.c2i)
35
+
36
+
37
class Tokenizer:
    """Convert between label strings and fixed-length token-id tensors."""

    def __init__(self, charset, max_len):
        self.max_len = max_len
        self.vocab = Vocab(charset)
        lookup = self.vocab.c2i
        self.sos_id = lookup['<SOS>']
        self.eos_id = lookup['<EOS>']
        self.pad_id = lookup['<PAD>']
        self.special = [self.sos_id, self.eos_id, self.pad_id]

    def __len__(self):
        return len(self.vocab)

    def _tok2ids(self, token : str):
        """Map each character of ``token`` to its id (KeyError for unknown characters)."""
        char_to_id = self.vocab.c2i
        return [char_to_id[char] for char in token]

    def _ids2tok(self, token_ids, join = True):
        """Map ids back to characters, dropping special tokens.

        Returns a string when ``join`` is true, otherwise a list of characters.
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()
        chars = [self.vocab.i2c[idx] for idx in token_ids if idx not in self.special]
        if join:
            return ''.join(chars)
        return chars

    def encode_batch(self, labels : list[str], device):
        """Encode ``labels`` as a ``(len(labels), max_len)`` long tensor on ``device``.

        Each row is SOS + character ids + EOS, cut to ``max_len`` entries when
        longer and right-padded with the pad id when shorter.
        """
        rows = []
        for text in labels:
            ids = [self.sos_id, *self._tok2ids(text), self.eos_id]
            overflow = len(ids) - self.max_len
            if overflow > 0:
                ids = ids[: self.max_len]
            else:
                ids = ids + [self.pad_id] * (-overflow)
            rows.append(torch.tensor(ids, dtype = torch.long, device = device))
        return torch.stack(rows, dim = 0)

    def _filter(self, probs : Tensor, ids : Tensor):
        """Cut ``ids`` before the first EOS; keep ``probs`` up to and including it."""
        ids = ids.tolist()
        eos_idx = ids.index(self.eos_id) if self.eos_id in ids else len(ids)
        return probs[: eos_idx + 1], ids[: eos_idx]

    def decode(self, token_dists : Tensor, raw : bool = False):
        """Decode a batch of per-position token distributions.

        Returns ``(batch_tokens, batch_probs)``. With ``raw`` false, each
        sequence is cut at its first EOS and joined into a string; with
        ``raw`` true, nothing is cut and tokens come back as character lists.
        """
        batch_tokens, batch_probs = [], []
        for dist in token_dists:
            probs, ids = dist.max(-1)
            if not raw:
                probs, ids = self._filter(probs, ids)
            batch_tokens.append(self._ids2tok(ids, not raw))
            batch_probs.append(probs)
        return batch_tokens, batch_probs
94
+
95
+
96
def init_weights(module: nn.Module, name: str = '', exclude: Sequence[str] = ()):
    """Initialize ``module`` in place according to its layer type.

    Modules whose ``name`` starts with any prefix in ``exclude`` are skipped.
    Linear and embedding weights get a truncated normal (std 0.02), conv
    weights get Kaiming-normal init, and norm layers get unit weight and
    zero bias.
    """
    for prefix in exclude:
        if name.startswith(prefix):
            return

    if isinstance(module, nn.Linear):
        nn.init.trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
        return

    if isinstance(module, nn.Embedding):
        nn.init.trunc_normal_(module.weight, std=0.02)
        if module.padding_idx is not None:
            # Keep the padding row at zero.
            module.weight.data[module.padding_idx].zero_()
        return

    if isinstance(module, nn.Conv2d):
        nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
        if module.bias is not None:
            nn.init.zeros_(module.bias)
        return

    if isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)):
        nn.init.ones_(module.weight)
        nn.init.zeros_(module.bias)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ tensorflow==2.13.1
2
+ torch==2.4.1
3
+ gradio==4.44.1
4
+ timm==1.0.9
5
+ Pillow==10.2.0
6
+ imgaug==0.4.0
7
+ opencv-python==4.10.0.*
8
+ torchvision==0.19.1
wpodnet/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
"""Public entry points of the ``wpodnet`` package."""

__version__ = '1.0.3'

from .backend import Prediction, Predictor

# Names exported by ``from wpodnet import *``.
__all__ = ['Prediction', 'Predictor']
wpodnet/backend.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Tuple
2
+
3
+ import numpy as np
4
+ import torch
5
+ from PIL import Image, ImageDraw
6
+ from torchvision.transforms.functional import (to_tensor)
7
+ import cv2
8
+ from .model import WPODNet
9
+
10
+
11
class Prediction:
    """Result of one detection: the source image, the plate polygon and a score.

    Attributes:
        image: The image the prediction was made on.
        bounds: Four (x, y) corner points of the detected quadrilateral.
        confidence: Score attached to the detection.
    """

    # Rectified output sizes as (width, height).
    _TWO_LINE_SIZE = (64, 46)
    _ONE_LINE_SIZE = (100, 23)
    # A quadrilateral whose height exceeds this fraction of its width is
    # rectified to the two-line size.
    _TWO_LINE_RATIO = 0.49

    def __init__(self, image: Image.Image, bounds: np.ndarray, confidence: float):
        self.image = image
        self.bounds = bounds
        self.confidence = confidence

    def _get_width_height(self):
        """Pick the rectified output size from the quadrilateral's aspect ratio.

        Width is the mean length of edges 0-1 and 2-3, height the mean length
        of edges 1-2 and 3-0.

        Returns:
            ``(64, 46)`` when height / width exceeds 0.49, else ``(100, 23)``.
        """
        # Work in float64: the bounds may be int32, whose squares can overflow.
        corners = np.asarray(self.bounds, dtype=np.float64)[:, :2]
        # Edge i runs from corner i to corner (i + 1) % 4.
        edges = np.linalg.norm(np.roll(corners, -1, axis=0) - corners, axis=1)
        width = (edges[0] + edges[2]) / 2
        height = (edges[1] + edges[3]) / 2
        # Multiplying instead of dividing keeps a zero-width polygon from
        # triggering a division by zero; the decision is unchanged.
        if height > self._TWO_LINE_RATIO * width:
            return self._TWO_LINE_SIZE
        return self._ONE_LINE_SIZE

    def get_perspective_M(self, width: int, height: int) -> np.ndarray:
        """Return the perspective matrix mapping ``bounds`` onto a rectangle.

        The destination rectangle has corners (0, 0), (width, 0),
        (width, height) and (0, height), in that order.
        """
        src_points = np.array(self.bounds, dtype=np.float32)
        dst_points = np.array(
            [[0, 0], [width, 0], [width, height], [0, height]],
            np.float32,
        )
        return cv2.getPerspectiveTransform(src_points, dst_points)

    def annotate(self, outline: str = 'red', width: int = 3) -> Image.Image:
        """Return a copy of the image with the bounding polygon drawn on it."""
        canvas = self.image.copy()
        drawer = ImageDraw.Draw(canvas)
        drawer.polygon(
            [(x, y) for x, y in self.bounds],
            outline=outline,
            width=width
        )
        return canvas

    def warp(self):
        """Rectify the detected quadrilateral into an upright crop.

        The output size is chosen by ``_get_width_height``; the result is the
        array returned by ``cv2.warpPerspective``.
        """
        width, height = self._get_width_height()
        M = self.get_perspective_M(width, height)
        n_image = np.array(self.image)
        return cv2.warpPerspective(n_image, M, (int(width), int(height)))
59
+
60
+
61
class Predictor:
    """Runs a WPODNet model on a PIL image and returns the best detection."""

    # Corners of a unit square centred on the origin, one homogeneous
    # (x, y, 1) column per corner.
    _q = np.array([
        [-.5, .5, .5, -.5],
        [-.5, -.5, .5, .5],
        [1., 1., 1., 1.]
    ])
    # Scale applied to the transformed unit square, in anchor-grid units.
    _scaling_const = 7.75
    # Input sides are padded up to a multiple of this value.
    _stride = 16

    def __init__(self, wpodnet: WPODNet):
        self.wpodnet = wpodnet
        self.wpodnet.eval()

    def _round_up_to_stride(self, value: int) -> int:
        """Round ``value`` up to a positive multiple of ``self._stride``."""
        # Ceil-divide, but never go below one stride: a side that truncated to
        # 0 would otherwise produce an empty image.
        return max(self._stride, -(-value // self._stride) * self._stride)

    def _resize_to_fixed_ratio(self, image: Image.Image, dim_min: int, dim_max: int) -> Image.Image:
        """Resize ``image`` so both sides are multiples of ``self._stride``.

        The longer side is scaled to ``min(side + side % stride, dim_max)``
        with ``side = int(aspect_ratio * dim_min)``; both sides are then
        rounded up to the next multiple of the stride.
        """
        h, w = image.height, image.width

        wh_ratio = max(h, w) / min(h, w)
        side = int(wh_ratio * dim_min)
        bound_dim = min(side + side % self._stride, dim_max)

        factor = bound_dim / max(h, w)
        reg_w = self._round_up_to_stride(int(w * factor))
        reg_h = self._round_up_to_stride(int(h * factor))

        return image.resize((reg_w, reg_h))

    def _to_torch_image(self, image: Image.Image) -> torch.Tensor:
        """Convert a PIL image to a tensor with a leading batch axis."""
        tensor = to_tensor(image)
        return tensor.unsqueeze_(0)

    def _inference(self, image: torch.Tensor) -> Tuple[np.ndarray, np.ndarray]:
        """Run the network on a single-image batch.

        Returns:
            ``probs`` of shape (grid_h, grid_w), taken from channel 0 of the
            network's first output, and ``affines`` of shape
            (6, grid_h, grid_w).
        """
        with torch.no_grad():
            probs, affines = self.wpodnet.forward(image)

        # Index the batch (and channel) axes explicitly. np.squeeze would also
        # drop a grid axis of length 1 and break the 2-D anchor lookup.
        probs = probs.cpu().numpy()[0, 0]      # (grid_h, grid_w)
        affines = affines.cpu().numpy()[0]     # (6, grid_h, grid_w)

        return probs, affines

    def _get_max_anchor(self, probs: np.ndarray) -> Tuple[int, int]:
        """Return the (row, column) index of the largest value in ``probs``."""
        return np.unravel_index(probs.argmax(), probs.shape)

    def _get_bounds(self, affines: np.ndarray, anchor_y: int, anchor_x: int, scaling_ratio: float = 1.0) -> np.ndarray:
        """Build the normalised corner polygon predicted at one anchor.

        Returns:
            Array of shape (4, 2) holding (x, y) corners as fractions of the
            anchor grid width and height.
        """
        # Copy so the clamping below does not write back into ``affines``.
        theta = affines[:, anchor_y, anchor_x].reshape((2, 3)).copy()
        # The diagonal terms are not allowed to be negative.
        theta[0, 0] = max(theta[0, 0], 0.0)
        theta[1, 1] = max(theta[1, 1], 0.0)

        # Transform the unit square and scale it.
        bounds = np.matmul(theta, self._q) * self._scaling_const * scaling_ratio

        # Shift to the anchor centre and normalise by the grid size.
        _, grid_h, grid_w = affines.shape
        bounds[0] = (bounds[0] + anchor_x + .5) / grid_w
        bounds[1] = (bounds[1] + anchor_y + .5) / grid_h

        return np.transpose(bounds)

    def predict(self, image: Image.Image, scaling_ratio: float = 1.0, dim_min: int = 288, dim_max: int = 608) -> Prediction:
        """Detect the most probable plate in ``image``.

        Args:
            image: Input image.
            scaling_ratio: Extra scale applied to the predicted polygon.
            dim_min: Base size used to derive the resized longer side.
            dim_max: Upper limit for the resized longer side.

        Returns:
            A ``Prediction`` whose bounds are int32 pixel coordinates in the
            original image and whose confidence is the maximum probability.
        """
        orig_h, orig_w = image.height, image.width

        # Resize, convert to a batch tensor and move it to the model's device.
        resized = self._resize_to_fixed_ratio(image, dim_min=dim_min, dim_max=dim_max)
        resized = self._to_torch_image(resized)
        resized = resized.to(self.wpodnet.device)

        probs, affines = self._inference(resized)

        # Use the anchor with the highest probability.
        max_prob = np.amax(probs)
        anchor_y, anchor_x = self._get_max_anchor(probs)
        bounds = self._get_bounds(affines, anchor_y, anchor_x, scaling_ratio)

        # Back to pixel coordinates of the original image.
        bounds[:, 0] *= orig_w
        bounds[:, 1] *= orig_h

        return Prediction(
            image=image,
            bounds=bounds.astype(np.int32),
            confidence=max_prob.item()
        )
wpodnet/lib_detection.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pylint: disable=invalid-name, redefined-outer-name, missing-docstring, non-parent-init-called, trailing-whitespace, line-too-long
2
+ from os.path import splitext
3
+ import cv2
4
+ import numpy as np
5
+ from keras.models import load_model
6
+ import os
7
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
8
+
9
+ class Label:
10
+ def __init__(self, cl=-1, tl=np.array([0., 0.]), br=np.array([0., 0.]), prob=None):
11
+ self.__tl = tl
12
+ self.__br = br
13
+ self.__cl = cl
14
+ self.__prob = prob
15
+
16
+ def __str__(self):
17
+ return 'Class: %d, top left(x: %f, y: %f), bottom right(x: %f, y: %f)' % (
18
+ self.__cl, self.__tl[0], self.__tl[1], self.__br[0], self.__br[1])
19
+
20
+ def copy(self):
21
+ return Label(self.__cl, self.__tl, self.__br)
22
+
23
+ def wh(self): return self.__br - self.__tl
24
+
25
+ def cc(self): return self.__tl + self.wh() / 2
26
+
27
+ def tl(self): return self.__tl
28
+
29
+ def br(self): return self.__br
30
+
31
+ def tr(self): return np.array([self.__br[0], self.__tl[1]])
32
+
33
+ def bl(self): return np.array([self.__tl[0], self.__br[1]])
34
+
35
+ def cl(self): return self.__cl
36
+
37
+ def area(self): return np.prod(self.wh())
38
+
39
+ def prob(self): return self.__prob
40
+
41
+ def set_class(self, cl):
42
+ self.__cl = cl
43
+
44
+ def set_tl(self, tl):
45
+ self.__tl = tl
46
+
47
+ def set_br(self, br):
48
+ self.__br = br
49
+
50
+ def set_wh(self, wh):
51
+ cc = self.cc()
52
+ self.__tl = cc - .5 * wh
53
+ self.__br = cc + .5 * wh
54
+
55
+ def set_prob(self, prob):
56
+ self.__prob = prob
57
+
58
class DLabel(Label):
    """Label defined by a set of points; its box is their bounding box.

    ``pts`` is kept on the instance. The top-left and bottom-right corners are
    the minimum and maximum of ``pts`` along axis 1.
    """

    def __init__(self, cl, pts, prob):
        self.pts = pts
        top_left = np.amin(pts, axis=1)
        bottom_right = np.amax(pts, axis=1)
        Label.__init__(self, cl, top_left, bottom_right, prob)
64
+
65
# Image normalisation helper
def im2single(Image):
    """Return ``Image`` converted to float32 and divided by 255."""
    as_float = Image.astype('float32')
    return as_float / 255
68
+
69
def getWH(shape):
    """Return the first two entries of ``shape`` in reversed order, as floats.

    For an image shape ``(height, width, ...)`` this is ``[width, height]``.
    """
    width_height = shape[1::-1]
    return np.array(width_height).astype(float)
71
+
72
def IOU(tl1, br1, tl2, br2):
    """Intersection over union of two axis-aligned boxes.

    Args:
        tl1, br1: Top-left and bottom-right corners of the first box (arrays).
        tl2, br2: Top-left and bottom-right corners of the second box (arrays).

    Returns:
        The ratio of the intersection area to the union area, or ``0.0`` when
        the union has zero area (both boxes are degenerate).
    """
    wh1, wh2 = br1 - tl1, br2 - tl2
    # Sanity check: each bottom-right corner must not precede its top-left.
    assert((wh1 >= 0).all() and (wh2 >= 0).all())

    intersection_wh = np.maximum(np.minimum(br1, br2) - np.maximum(tl1, tl2), 0)
    intersection_area = np.prod(intersection_wh)
    area1, area2 = (np.prod(wh1), np.prod(wh2))
    union_area = area1 + area2 - intersection_area
    # Two zero-area boxes would otherwise give 0 / 0 (nan plus a warning).
    if union_area == 0:
        return 0.0
    return intersection_area / union_area
81
+
82
def IOU_labels(l1, l2):
    """Intersection over union of the boxes of two labels."""
    first_box = (l1.tl(), l1.br())
    second_box = (l2.tl(), l2.br())
    return IOU(*first_box, *second_box)
84
+
85
def nms(Labels, iou_threshold=0.5):
    """Non-maximum suppression over a list of labels.

    ``Labels`` is sorted in place by descending probability. A label is kept
    only if its IoU with every label kept so far does not exceed
    ``iou_threshold``.

    Returns:
        The list of kept labels, highest probability first.
    """
    Labels.sort(key=lambda l: l.prob(), reverse=True)

    kept = []
    for candidate in Labels:
        suppressed = any(
            IOU_labels(candidate, chosen) > iou_threshold for chosen in kept
        )
        if not suppressed:
            kept.append(candidate)
    return kept
99
+
100
def load_model_wpod(path):
    """Load and return the model stored at ``path`` via keras ``load_model``."""
    return load_model(path)
103
+
104
def find_T_matrix(pts, t_pts):
    """Estimate the 3x3 transform taking ``pts`` to ``t_pts``.

    Both arguments hold four points as columns of homogeneous coordinates.
    Two equations per point pair are stacked into an 8x9 system; the result is
    the last row of the third SVD factor, reshaped to 3x3.
    """
    system = np.zeros((8, 9))
    for k in range(4):
        src = pts[:, k].T
        dst = t_pts[:, k]
        even_row, odd_row = 2 * k, 2 * k + 1

        system[even_row, 3:6] = -dst[2] * src
        system[even_row, 6:] = dst[1] * src
        system[odd_row, :3] = dst[2] * src
        system[odd_row, 6:] = -dst[0] * src

    _, _, right_factor = np.linalg.svd(system)
    return right_factor[-1, :].reshape((3, 3))
119
+
120
def getRectPts(a, b):
    """Corners of the rectangle (0, 0)-(a, b) as a float32 array of shape (4, 2).

    Order: (0, 0), (a, 0), (a, b), (0, b).
    """
    corners = [[0, 0], [a, 0], [a, b], [0, b]]
    return np.array(corners, np.float32)
122
+
123
def normal(pts, side, mn, MN):
    """Scale ``pts`` by ``side``, shift by ``mn`` and divide by ``MN``.

    ``mn`` and ``MN`` are two-element arrays applied per row of ``pts``
    (they are reshaped to column vectors of shape (2, 1)).
    """
    offset = mn.reshape((2, 1))
    extent = MN.reshape((2, 1))
    shifted = pts * side + offset
    return shifted / extent
128
def get_bound(x, y):
    """Pair up coordinates: returns ``[[x[0], y[0]], [x[1], y[1]], ...]``.

    One pair is produced per element of ``x``.
    """
    return [[x_value, y[index]] for index, x_value in enumerate(x)]
134
def calculate_ratio(bound):
    """Classify a quadrilateral by its height / width ratio.

    ``bound`` holds four (x, y) corners. Width is the mean length of edges
    0-1 and 2-3, height the mean length of edges 1-2 and 3-0.

    Returns:
        ``2`` when height / width exceeds 0.55, otherwise ``1``.
    """
    def edge_length(start, end):
        delta_x = end[0] - start[0]
        delta_y = end[1] - start[1]
        return np.sqrt(delta_x**2 + delta_y**2)

    top = edge_length(bound[0], bound[1])
    right = edge_length(bound[1], bound[2])
    bottom = edge_length(bound[2], bound[3])
    left = edge_length(bound[3], bound[0])

    width = (top + bottom) / 2
    height = (right + left) / 2
    return 2 if height / width > 0.55 else 1
153
# Rebuild plate detections from the network output: polygons, rectified crops and plate type.
def reconstruct(I, Iresized, Yr, lp_threshold):
    """Turn the raw network output into plate labels and rectified crops.

    Args:
        I: Image the crops are cut from.
        Iresized: The resized image that was fed to the network.
        Yr: Network output; channel 0 is read as the probability map and
            channels 2 onward as six affine parameters per cell.
        lp_threshold: Cells with a probability above this value are kept.

    Returns:
        ``(final_labels, TLp, lp_type, bounds)``: the labels left after
        non-maximum suppression, one warped crop per label, the type (1 or 2)
        computed for the last label processed, and the corner lists in pixel
        coordinates of ``I``. When nothing is detected the result is
        ``(None, [], None, None)``.

    Note:
        The affine parameters are clamped through a view, so ``Yr`` is
        modified in place.
    """
    # 4 max-pooling layers, stride = 2
    net_stride = 2**4
    side = ((208 + 40)/2)/net_stride

    # Output sizes for one-line and two-line plates
    one_line = (100, 23)
    two_lines = (64, 46)

    Probs = Yr[..., 0]
    Affines = Yr[..., 2:]

    rows, cols = np.where(Probs > lp_threshold)
    # Size of the output feature map: network input size over the stride
    MN = getWH(Iresized.shape)/net_stride

    # Corners of the base square (half-size 0.5), one homogeneous column each
    half = 0.5
    unit_corners = np.matrix(
        [[-half, -half, 1], [half, -half, 1], [half, half, 1], [-half, half, 1]]
    ).T

    labels = []
    labels_frontal = []
    for row, col in zip(rows, cols):
        prob = Probs[row, col]
        mn = np.array([float(col) + 0.5, float(row) + 0.5])

        # Affine transformation matrix with non-negative diagonal
        A = np.reshape(Affines[row, col], (2, 3))
        A[0, 0] = max(A[0, 0], 0)
        A[1, 1] = max(A[1, 1], 0)
        # Same diagonal, every other term zero
        B = np.zeros((2, 3))
        B[0, 0] = max(A[0, 0], 0)
        B[1, 1] = max(A[1, 1], 0)

        pts = np.array(A*unit_corners)
        pts_frontal = np.array(B*unit_corners)

        labels.append(DLabel(0, normal(pts, side, mn, MN), prob))
        labels_frontal.append(DLabel(0, normal(pts_frontal, side, mn, MN), prob))

    final_labels = nms(labels, 0.1)
    final_labels_frontal = nms(labels_frontal, 0.1)
    if not final_labels_frontal:
        return None, [], None, None

    bounds = []
    lp_type = 0
    TLp = []
    if final_labels:
        final_labels.sort(key=lambda x: x.prob(), reverse=True)
        for label in final_labels:
            # Scale the normalised polygon to pixel coordinates of I
            ptsh = np.concatenate((label.pts * getWH(I.shape).reshape((2, 1)), np.ones((1, 4))))
            bound = get_bound(ptsh[0], ptsh[1])
            pts = np.array(bound, dtype=np.float32)
            bounds.append(bound)

            # Plate type selects the output size of the rectified crop
            lp_type = calculate_ratio(bound)
            out_size = two_lines if lp_type == 2 else one_line

            t_ptsh = getRectPts(out_size[0], out_size[1])
            H = cv2.getPerspectiveTransform(pts, t_ptsh)
            Ilp = cv2.warpPerspective(I, H, (int(out_size[0]), int(out_size[1])))
            TLp.append(Ilp)

    return final_labels, TLp, lp_type, bounds
231
def detect_lp(model, I, lp_threshold):
    """Detect license plates in image ``I`` with ``model``.

    The image is scaled to [0, 1], resized so that its shorter side becomes
    ``min(int(aspect_ratio * 288), 350)`` pixels, passed through
    ``model.predict`` and decoded by ``reconstruct``.

    Returns:
        The tuple produced by ``reconstruct``: labels, plate crops, plate type
        and corner lists.
    """
    Dmax = 350
    Dmin = 288

    # Aspect ratio (long side over short side) fixes the target short side
    shape_hw = I.shape[:2]
    ratio = float(max(shape_hw)) / min(shape_hw)
    side = int(ratio * Dmin)
    max_dim = min(side, Dmax)

    I = im2single(I)

    # Resize factor relative to the shorter side
    min_dim_img = min(I.shape[:2])
    factor = float(max_dim) / min_dim_img

    # New width and height after resizing
    w, h = (np.array(I.shape[1::-1], dtype=float) * factor).astype(int).tolist()
    Iresized = cv2.resize(I, (w, h))

    # Add a leading batch axis for the network
    T = Iresized.copy()
    T = T.reshape((1, T.shape[0], T.shape[1], T.shape[2]))

    # Run the pretrained WPOD-Net and drop axes of length 1
    Yr = np.squeeze(model.predict(T, verbose=0))

    # Decode into: labels, plate images, plate type and bounds
    L, TLp, lp_type, bounds = reconstruct(I, Iresized, Yr, lp_threshold)
    return L, TLp, lp_type, bounds
wpodnet/model.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class BasicConvBlock(nn.Module):
    """3x3 convolution (padding 1) followed by batch norm and in-place ReLU."""

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.conv_layer = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        # NOTE(review): PyTorch's `momentum` weights the new batch statistic,
        # so 0.99 makes the running stats follow the latest batch closely.
        # Presumably carried over from another framework; confirm before
        # training. It has no effect in eval mode.
        self.bn_layer = nn.BatchNorm2d(out_channels, momentum=0.99, eps=0.001)
        self.act_layer = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply convolution, normalisation and activation in that order."""
        return self.act_layer(self.bn_layer(self.conv_layer(x)))
16
+
17
+
18
class ResBlock(nn.Module):
    """Residual block that keeps the channel count.

    The branch is a ``BasicConvBlock`` followed by a second 3x3 convolution
    and batch norm; its output is added to the input and passed through ReLU.
    """

    def __init__(self, channels: int):
        super().__init__()
        self.conv_block = BasicConvBlock(channels, channels)
        self.sec_layer = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.bn_layer = nn.BatchNorm2d(channels, momentum=0.99, eps=0.001)
        self.act_layer = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(x + branch(x))``."""
        branch = self.bn_layer(self.sec_layer(self.conv_block(x)))
        return self.act_layer(x + branch)
31
+
32
+
33
class WPODNet(nn.Module):
    """Convolutional backbone with a 2-channel and a 6-channel output head.

    The backbone contains four 2x max-pooling layers, so the output grid is
    1/16 of the input resolution.
    """

    def __init__(self):
        super().__init__()
        # Layer order fixes the indices used in the state dict; keep it as is.
        stages = [
            BasicConvBlock(3, 16),
            BasicConvBlock(16, 16),
            nn.MaxPool2d(2),
            BasicConvBlock(16, 32),
            ResBlock(32),
            nn.MaxPool2d(2),
            BasicConvBlock(32, 64),
            ResBlock(64),
            ResBlock(64),
            nn.MaxPool2d(2),
            BasicConvBlock(64, 64),
            ResBlock(64),
            ResBlock(64),
            nn.MaxPool2d(2),
            BasicConvBlock(64, 128),
            ResBlock(128),
            ResBlock(128),
            ResBlock(128),
            ResBlock(128),
        ]
        self.backbone = nn.Sequential(*stages)
        self.prob_layer = nn.Conv2d(128, 2, kernel_size=3, padding=1)
        self.bbox_layer = nn.Conv2d(128, 6, kernel_size=3, padding=1)

        # Empty non-persistent buffer whose only job is to report the device
        # the module currently lives on.
        self.register_buffer('dummy', torch.Tensor(), persistent=False)

    @property
    def device(self) -> torch.device:
        """Device of the module, read from the dummy buffer."""
        return self.dummy.device

    def forward(self, image: torch.Tensor):
        """Return ``(probs, affines)`` for a batch of images.

        ``probs`` is the 2-channel head after a softmax over the channel axis;
        ``affines`` is the raw 6-channel head.
        """
        feature: torch.Tensor = self.backbone(image)
        probs: torch.Tensor = torch.softmax(self.prob_layer(feature), dim=1)
        affines: torch.Tensor = self.bbox_layer(feature)
        return probs, affines
wpodnet/stream.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Generator, Union
3
+
4
+ from PIL import Image
5
+
6
+
7
class ImageStreamer:
    """Iterate over the images found at a file or directory path.

    Iterating the streamer yields opened ``PIL.Image.Image`` objects. The
    underlying generator is created once in ``__init__``, so a streamer can be
    consumed a single time; closing the yielded images is left to the caller.
    """

    def __init__(self, image_or_folder: Union[str, Path]):
        path = Path(image_or_folder)
        self.generator = self._get_image_generator(path)

    def _get_image_generator(self, path: Path) -> Generator[Image.Image, None, None]:
        """Yield every readable image at ``path``.

        A file path yields that file if it is an image; a directory is
        searched recursively.

        Raises:
            TypeError: If ``path`` is neither a file nor a directory. Raised
                on first iteration, since this is a generator.
        """
        if path.is_file():
            image_paths = [path] if self._is_image_file(path) else []
        elif path.is_dir():
            # rglob already recurses; skip directories before probing them.
            image_paths = [
                p
                for p in path.rglob('*')
                if p.is_file() and self._is_image_file(p)
            ]
        else:
            raise TypeError(f'Invalid path to images {path}')

        for p in image_paths:
            yield Image.open(p)

    def _is_image_file(self, path: Path) -> bool:
        """Return True when Pillow can open and verify the file at ``path``."""
        try:
            # The context manager closes the file handle after the probe.
            with Image.open(path) as image:
                image.verify()
        except Exception:
            # Best-effort probe: anything Pillow rejects is "not an image".
            return False
        return True

    def __iter__(self):
        return self.generator