NguyenPhong2612 committed
Commit: ab576ba · Parent(s): 4185961
first commit
Browse files
- .gitignore +8 -0
- Test.ipynb +80 -0
- app.py +45 -0
- parseq/augmentation.py +127 -0
- parseq/config.yaml +25 -0
- parseq/module.py +140 -0
- parseq/system.py +311 -0
- parseq/utils.py +113 -0
- requirements.txt +8 -0
- wpodnet/__init__.py +7 -0
- wpodnet/backend.py +157 -0
- wpodnet/lib_detection.py +265 -0
- wpodnet/model.py +73 -0
- wpodnet/stream.py +36 -0
.gitignore
ADDED
@@ -0,0 +1,8 @@
+/Test.ipynb/
+/Test image/
+/parseq/__pycache__/
+/wpodnet/__pycache__/
+/wpodnet/__init__/
+/__init__/
+/flagged/
+/weights/
Test.ipynb
ADDED
@@ -0,0 +1,80 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch \n",
+    "import torch.nn as nn\n",
+    "from parseq.system import System\n",
+    "import yaml\n",
+    "import cv2\n",
+    "from parseq.augmentation import trans\n",
+    "import PIL\n",
+    "import imgaug\n",
+    "import torchvision\n",
+    "from wpodnet.lib_detection import load_model_wpod, detect_lp\n",
+    "import numpy as np\n",
+    "import gradio as gr \n",
+    "import tensorflow as tf\n",
+    "from tensorflow import keras\n",
+    "import timm\n",
+    "import pytorch_lightning as pl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensorflow==2.13.1\n",
+      "torch==2.4.1+cu118\n",
+      "gradio==4.44.1\n",
+      "timm==1.0.9\n",
+      "PIL==10.2.0\n",
+      "imgaug==0.4.0\n",
+      "opencv-python==4.10.0\n",
+      "torchvision==0.19.1+cu118\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f'tensorflow=={tf.__version__}')\n",
+    "print(f'torch=={torch.__version__}')\n",
+    "print(f'gradio=={gr.__version__}')\n",
+    "print(f'timm=={timm.__version__}')\n",
+    "print(f'PIL=={PIL.__version__}')\n",
+    "print(f'imgaug=={imgaug.__version__}')\n",
+    "print(f'opencv-python=={cv2.__version__}')\n",
+    "print(f'torchvision=={torchvision.__version__}')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "virtual",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
app.py
ADDED
@@ -0,0 +1,45 @@
+import torch
+import torch.nn as nn
+from parseq.system import System
+import yaml
+import cv2
+from parseq.augmentation import trans
+from PIL import Image
+from wpodnet.lib_detection import load_model_wpod, detect_lp
+import numpy as np
+import gradio as gr
+
+
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+checkpoint_path = 'weights/parseq.ckpt'
+config_path = 'parseq/config.yaml'
+wpod_path = 'weights/wpod-net.h5'
+wpod_net = load_model_wpod(wpod_path)
+
+with open(config_path, 'r') as data:
+    config = yaml.safe_load(data)
+system = System(config)
+checkpoint = torch.load(checkpoint_path, map_location=device)
+system.load_state_dict(checkpoint['state_dict'])
+system.to(device)
+system.eval()
+
+def predict(image):
+    if isinstance(image, str):
+        image = cv2.imread(image)
+    _, img_warped, _, _ = detect_lp(wpod_net, image, 0.5)
+    img = (img_warped[0] * 255).astype(np.uint8)
+    img = Image.fromarray(img).convert("RGB")
+    image = trans(img).unsqueeze(0)
+    with torch.no_grad():
+        pred = system(image).softmax(-1)
+    generated_text, _ = system.tokenizer.decode(pred)
+    return generated_text[0]
+
+interface = gr.Interface(
+    fn=predict,
+    inputs=[gr.components.Image()],
+    outputs=[gr.components.Textbox(label="License plate", lines=2)])
+interface.launch(share=True, debug=True)
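With weights/parseq.ckpt and weights/wpod-net.h5 in place, running python app.py from the repository root loads both models and launches the Gradio interface with a public share link (share=True, debug=True). Note that the weights/ folder is listed in .gitignore, so the checkpoints are not part of this commit and have to be provided separately.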
parseq/augmentation.py
ADDED
@@ -0,0 +1,127 @@
+from PIL import Image, ImageFilter
+from timm.data.auto_augment import _LEVEL_DENOM, LEVEL_TO_ARG, NAME_TO_OP, _randomly_negate, rotate
+from functools import partial
+from timm.data import auto_augment
+import imgaug.augmenters as iaa
+from torchvision import transforms as T
+import numpy as np
+
+image_size = [224, 224]
+
+def rotate_expand(img, degrees, **kwargs):
+    kwargs['expand'] = True
+    return rotate(img, degrees, **kwargs)
+
+
+def _level_to_arg(level, hparams, key, default):
+    magnitude = hparams.get(key, default)
+    level = (level / _LEVEL_DENOM) * magnitude
+    level = _randomly_negate(level)
+    return (level,)
+
+
+def apply():
+    NAME_TO_OP.update({
+        'Rotate': rotate_expand,
+    })
+    LEVEL_TO_ARG.update({
+        'Rotate': partial(_level_to_arg, key='rotate_deg', default=30.0),
+        'ShearX': partial(_level_to_arg, key='shear_x_pct', default=0.3),
+        'ShearY': partial(_level_to_arg, key='shear_y_pct', default=0.3),
+        'TranslateXRel': partial(_level_to_arg, key='translate_x_pct', default=0.45),
+        'TranslateYRel': partial(_level_to_arg, key='translate_y_pct', default=0.45),
+    })
+
+apply()
+
+_OP_CACHE = {}
+
+def _get_op(key, factory):
+    try:
+        op = _OP_CACHE[key]
+    except KeyError:
+        op = factory()
+        _OP_CACHE[key] = op
+    return op
+
+
+def _get_param(level, img, max_dim_factor, min_level=1):
+    max_level = max(min_level, max_dim_factor * max(img.size))
+    return round(min(level, max_level))
+
+
+def gaussian_blur(img, radius, **__):
+    radius = _get_param(radius, img, 0.02)
+    key = 'gaussian_blur_' + str(radius)
+    op = _get_op(key, lambda: ImageFilter.GaussianBlur(radius))
+    return img.filter(op)
+
+
+def motion_blur(img, k, **__):
+    k = _get_param(k, img, 0.08, 3) | 1  # bin to odd values
+    key = 'motion_blur_' + str(k)
+    op = _get_op(key, lambda: iaa.MotionBlur(k))
+    return Image.fromarray(op(image=np.asarray(img)))
+
+
+def gaussian_noise(img, scale, **_):
+    scale = _get_param(scale, img, 0.25) | 1  # bin to odd values
+    key = 'gaussian_noise_' + str(scale)
+    op = _get_op(key, lambda: iaa.AdditiveGaussianNoise(scale=scale))
+    return Image.fromarray(op(image=np.asarray(img)))
+
+
+def poisson_noise(img, lam, **_):
+    lam = _get_param(lam, img, 0.2) | 1  # bin to odd values
+    key = 'poisson_noise_' + str(lam)
+    op = _get_op(key, lambda: iaa.AdditivePoissonNoise(lam))
+    return Image.fromarray(op(image=np.asarray(img)))
+
+
+def _level_to_arg(level, _hparams, max):
+    level = max * level / auto_augment._LEVEL_DENOM
+    return (level,)
+
+
+_RAND_TRANSFORMS = auto_augment._RAND_INCREASING_TRANSFORMS.copy()
+_RAND_TRANSFORMS.remove('SharpnessIncreasing')  # remove, interferes with *blur ops
+_RAND_TRANSFORMS.extend([
+    'GaussianBlur',
+    'PoissonNoise',
+])
+auto_augment.LEVEL_TO_ARG.update({
+    'GaussianBlur': partial(_level_to_arg, max=4),
+    'MotionBlur': partial(_level_to_arg, max=20),
+    'GaussianNoise': partial(_level_to_arg, max=0.1 * 255),
+    'PoissonNoise': partial(_level_to_arg, max=40),
+})
+auto_augment.NAME_TO_OP.update({
+    'GaussianBlur': gaussian_blur,
+    'MotionBlur': motion_blur,
+    'GaussianNoise': gaussian_noise,
+    'PoissonNoise': poisson_noise,
+})
+
+
+def rand_augment_transform(magnitude=5, num_layers=3):
+    hparams = {
+        'rotate_deg': 30,
+        'shear_x_pct': 0.9,
+        'shear_y_pct': 0.2,
+        'translate_x_pct': 0.10,
+        'translate_y_pct': 0.30,
+    }
+    ra_ops = auto_augment.rand_augment_ops(magnitude, hparams=hparams, transforms=_RAND_TRANSFORMS)
+    choice_weights = [1.0 / len(ra_ops) for _ in range(len(ra_ops))]
+    return auto_augment.RandAugment(ra_ops, num_layers, choice_weights)
+
+
+
+trans = [rand_augment_transform()]
+trans.append(lambda img: img.rotate(0, expand=True))
+trans.extend([
+    T.Resize(image_size, T.InterpolationMode.BICUBIC),
+    T.ToTensor(),
+    T.Normalize(0.5, 0.5),
+])
+trans = T.Compose(trans)
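A minimal sketch of exercising the composed pipeline on a single image, assuming the packages from requirements.txt are installed and the repository root is on PYTHONPATH; the sample file name is hypothetical:

from PIL import Image
from parseq.augmentation import trans

img = Image.open('sample_plate.jpg').convert('RGB')   # hypothetical plate crop
x = trans(img)             # tensor of shape (3, 224, 224), normalized to [-1, 1]
batch = x.unsqueeze(0)     # the model expects a batch dimension, as in app.py
print(batch.shape)         # torch.Size([1, 3, 224, 224])

Because trans starts with rand_augment_transform(), each call applies a fresh random augmentation before the deterministic resize, tensor conversion, and normalization.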
parseq/config.yaml
ADDED
@@ -0,0 +1,25 @@
+model:
+  image_size : [224, 224]
+  patch_size : [16, 16]
+  max_len : 25
+  d_model : 384
+  enc_num_heads : 6
+  enc_mlp_ratio : 4
+  enc_depth : 12
+  dec_num_heads : 12
+  dec_mlp_ratio : 4
+  dec_depth : 1
+  perm_num : 8
+  perm_forward : true
+  perm_mirrored : true
+  decode_ar : true
+  refine_iter : 2
+  num_tokens : 97
+  pretrained : false
+  train_charset : 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~
+  weight_url : https://github.com/baudm/parseq/releases/download/v1.0.0/parseq_small_patch16_224-fcf06f5a.pt
+trainer:
+  lr : 3e-4
+  batch_size : 4
+  weight_decay : 0.0
+  warm_pct : 0.075
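One detail worth knowing when reading this file programmatically: PyYAML's YAML 1.1 resolver does not treat 3e-4 (no decimal point) as a float, so trainer.lr comes back as a string, which is why parseq/system.py wraps the trainer values in float(). A small sketch:

import yaml

with open('parseq/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

print(config['model']['d_model'])      # 384
print(repr(config['trainer']['lr']))   # '3e-4', a string, hence float(...) in System.__init__
print(config['model']['perm_num'])     # 8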
parseq/module.py
ADDED
@@ -0,0 +1,140 @@
+from timm.models.vision_transformer import PatchEmbed, VisionTransformer
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+
+class Encoder(VisionTransformer):
+
+    def __init__(
+        self,
+        image_size=224,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.0,
+        embed_layer=PatchEmbed):
+        super().__init__(
+            image_size,
+            patch_size,
+            in_chans,
+            embed_dim=embed_dim,
+            depth=depth,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            drop_rate=drop_rate,
+            attn_drop_rate=attn_drop_rate,
+            drop_path_rate=drop_path_rate,
+            embed_layer=embed_layer,
+            num_classes=0,
+            global_pool='',
+            class_token=False)
+
+    def forward(self, x):
+        return self.forward_features(x.to(device))
+
+class DecoderLayer(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.d_model = config['model']['d_model']
+        self.dec_num_heads = config['model']['dec_num_heads']
+        self.d_ff = config['model']['dec_mlp_ratio'] * self.d_model
+        self.eps = 1e-5
+        self.self_attn = nn.MultiheadAttention(self.d_model, self.dec_num_heads, dropout=0.1, batch_first=True)
+        self.cross_attn = nn.MultiheadAttention(self.d_model, self.dec_num_heads, dropout=0.1, batch_first=True)
+
+        self.linear1 = nn.Linear(self.d_model, self.d_ff)
+        self.dropout = nn.Dropout(p=0.1)
+        self.linear2 = nn.Linear(self.d_ff, self.d_model)
+
+        self.norm1 = nn.LayerNorm(self.d_model, eps=self.eps)
+        self.norm2 = nn.LayerNorm(self.d_model, eps=self.eps)
+        self.norm_q = nn.LayerNorm(self.d_model, eps=self.eps)
+        self.norm_c = nn.LayerNorm(self.d_model, eps=self.eps)
+        self.dropout1 = nn.Dropout(p=0.1)
+        self.dropout2 = nn.Dropout(p=0.1)
+        self.dropout3 = nn.Dropout(p=0.1)
+
+
+    def forward_stream(
+        self,
+        tgt,
+        tgt_norm,
+        tgt_kv,
+        memory,
+        tgt_mask,
+        tgt_key_padding_mask):
+
+        tgt2, sa_weights = self.self_attn(
+            tgt_norm, tgt_kv, tgt_kv, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
+        )
+        tgt = tgt + self.dropout1(tgt2)
+
+        tgt2, ca_weights = self.cross_attn(self.norm1(tgt), memory, memory)
+        tgt = tgt + self.dropout2(tgt2)
+
+        tgt2 = self.linear2(self.dropout(F.gelu(self.linear1(self.norm2(tgt)))))
+        tgt = tgt + self.dropout3(tgt2)
+        return tgt, sa_weights, ca_weights
+
+    def forward(
+        self,
+        query,
+        content,
+        memory,
+        query_mask=None,
+        content_mask=None,
+        content_key_padding_mask=None,
+        update_content: bool = True):
+
+        query_norm = self.norm_q(query)
+        content_norm = self.norm_c(content)
+        query = self.forward_stream(query, query_norm, content_norm, memory, query_mask, content_key_padding_mask)[0]
+        if update_content:
+            content = self.forward_stream(
+                content, content_norm, content_norm, memory, content_mask, content_key_padding_mask
+            )[0]
+        return query, content
+
+
+
+class Decoder(nn.Module):
+    __constants__ = ['norm']
+
+    def __init__(self, config):
+        super().__init__()
+        self.d_model = config['model']['d_model']
+        self.num_layers = config['model']['dec_depth']
+        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(self.num_layers)])
+        self.norm = nn.LayerNorm(self.d_model)
+
+    def forward(self, query, content, memory, query_mask=None, content_mask=None, content_key_padding_mask=None):
+        for i, mod in enumerate(self.layers):
+            last = i == len(self.layers) - 1
+            query, content = mod(
+                query, content, memory, query_mask, content_mask, content_key_padding_mask, update_content=not last)
+        query = self.norm(query)
+        return query
+
+
+class TokenEmbedding(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.num_tokens = config['model']['num_tokens']
+        self.d_model = config['model']['d_model']
+        self.embedding = nn.Embedding(self.num_tokens, self.d_model)
+
+    def forward(self, tokens):
+        return math.sqrt(self.d_model) * self.embedding(tokens)
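A shape-check sketch for the decoder pieces, using the values from parseq/config.yaml; the zero tensors are stand-ins for the positional queries and the ViT memory (224/16 = 14 patches per side, so 196 memory tokens):

import torch
import yaml
from parseq.module import Decoder, TokenEmbedding

with open('parseq/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

embed = TokenEmbedding(config)    # 97 tokens -> 384-d embeddings scaled by sqrt(d_model)
decoder = Decoder(config)         # dec_depth = 1 layer of self- plus cross-attention

tokens = torch.randint(0, config['model']['num_tokens'], (2, 10))
content = embed(tokens)                        # (2, 10, 384)
query = torch.zeros(2, 10, 384)                # stand-in for the positional queries
memory = torch.zeros(2, 196, 384)              # stand-in for the encoder output
print(decoder(query, content, memory).shape)   # torch.Size([2, 10, 384])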
parseq/system.py
ADDED
@@ -0,0 +1,311 @@
+import torch
+import torch.nn as nn
+from timm.models.helpers import named_apply
+from functools import partial
+from .module import Encoder, Decoder, TokenEmbedding
+from .utils import init_weights
+import pytorch_lightning as pl
+from .utils import Tokenizer, CharsetAdapter
+import numpy as np
+import math
+from torch.optim import Optimizer
+from timm.optim import create_optimizer_v2
+from torch.optim.lr_scheduler import OneCycleLR
+from itertools import permutations
+import torch.nn.functional as F
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+class PARSeq(nn.Module):
+
+    def __init__(self, config, device=device):
+        super().__init__()
+
+        self.max_len = config['model']['max_len']
+        self.decode_ar = config['model']['decode_ar']
+        self.refine_iters = config['model']['refine_iter']
+        self.embed_dim = config['model']['d_model']
+        self.num_tokens = config['model']['num_tokens']
+        self.dropout = 0.1
+        self.encoder = Encoder(config['model']['image_size'], config['model']['patch_size'], embed_dim=config['model']['d_model'], depth=config['model']['enc_depth'], num_heads=config['model']['enc_num_heads'], mlp_ratio=config['model']['enc_mlp_ratio'])
+        self.decoder = Decoder(config)
+        self.text_embed = TokenEmbedding(config)
+        self.head = nn.Linear(self.embed_dim, self.num_tokens - 2)
+
+        self.pos_queries = nn.Parameter(torch.Tensor(1, self.max_len + 1, self.embed_dim))
+        self.dropout = nn.Dropout(self.dropout)
+        named_apply(partial(init_weights, exclude=['encoder']), self)
+        nn.init.trunc_normal_(self.pos_queries, std=0.02)
+        self._device = device
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        param_names = {'text_embed.embedding.weight', 'pos_queries'}
+        enc_param_names = {'encoder.' + n for n in self.encoder.no_weight_decay()}
+        return param_names.union(enc_param_names)
+
+    def encode(self, img: torch.Tensor):
+        return self.encoder(img.to(self._device))
+
+    def decode(
+        self,
+        tgt: torch.Tensor,
+        memory: torch.Tensor,
+        tgt_mask=None,
+        tgt_padding_mask=None,
+        tgt_query=None,
+        tgt_query_mask=None):
+        N, L = tgt.shape
+        null_ctx = self.text_embed(tgt[:, :1])
+        tgt_emb = self.pos_queries[:, : L - 1] + self.text_embed(tgt[:, 1:])
+        tgt_emb = self.dropout(torch.cat([null_ctx, tgt_emb], dim=1))
+        if tgt_query is None:
+            tgt_query = self.pos_queries[:, :L].expand(N, -1, -1)
+        tgt_query = self.dropout(tgt_query)
+        return self.decoder(tgt_query, tgt_emb, memory, tgt_query_mask, tgt_mask, tgt_padding_mask)
+
+    def forward(self, tokenizer: Tokenizer, images, max_length):
+        testing = max_length is None
+        max_length = self.max_len if max_length is None else min(max_length, self.max_len)
+        bs = images.shape[0]
+        num_steps = max_length + 1
+        memory = self.encode(images).to(device)
+        pos_queries = self.pos_queries[:, :num_steps].expand(bs, -1, -1)
+
+        tgt_mask = query_mask = torch.triu(torch.ones((num_steps, num_steps), dtype=torch.bool, device=self._device), 1)
+
+        if self.decode_ar:
+            tgt_in = torch.full((bs, num_steps), tokenizer.pad_id, dtype=torch.long, device=self._device)
+            tgt_in[:, 0] = tokenizer.sos_id
+
+            logits = []
+            for i in range(num_steps):
+                j = i + 1
+                tgt_out = self.decode(
+                    tgt_in[:, :j],
+                    memory,
+                    tgt_mask[:j, :j],
+                    tgt_query=pos_queries[:, i:j],
+                    tgt_query_mask=query_mask[i:j, :j],)
+
+                p_i = self.head(tgt_out)
+                logits.append(p_i)
+                if j < num_steps:
+                    tgt_in[:, j] = p_i.squeeze().argmax(-1)
+                    if testing and (tgt_in == tokenizer.eos_id).any(dim=-1).all():
+                        break
+
+            logits = torch.cat(logits, dim=1)
+        else:
+            tgt_in = torch.full((bs, 1), tokenizer.sos_id, dtype=torch.long, device=self._device)
+            tgt_out = self.decode(tgt_in, memory, tgt_query=pos_queries)
+            logits = self.head(tgt_out)
+
+        if self.refine_iters:
+            query_mask[torch.triu(torch.ones(num_steps, num_steps, dtype=torch.bool, device=self._device), 2)] = 0
+            bos = torch.full((bs, 1), tokenizer.sos_id, dtype=torch.long, device=self._device)
+            for i in range(self.refine_iters):
+                tgt_in = torch.cat([bos, logits[:, :-1].argmax(-1)], dim=1)
+                tgt_padding_mask = (tgt_in == tokenizer.eos_id).int().cumsum(-1) > 0
+                tgt_out = self.decode(
+                    tgt_in, memory, tgt_mask, tgt_padding_mask, pos_queries, query_mask[:, : tgt_in.shape[1]])
+                logits = self.head(tgt_out)
+
+        return logits
+
+
+
+class System(pl.LightningModule):
+
+    def __init__(self, config):
+
+        super().__init__()
+        self.save_hyperparameters()
+        self.max_len = int(config['model']['max_len'])
+        self.charset_adapter = CharsetAdapter()
+        self.charset = config['model']['train_charset']
+        self.lr = float(config['trainer']['lr'])
+        self.batch_size = config['trainer']['batch_size']
+        self.warm_pct = float(config['trainer']['warm_pct'])
+        self.weight_decay = float(config['trainer']['weight_decay'])
+        self.tokenizer = Tokenizer(self.charset, self.max_len)
+        self.sos_id = self.tokenizer.sos_id
+        self.eos_id = self.tokenizer.eos_id
+        self.pad_id = self.tokenizer.pad_id
+
+        self.model = PARSeq(config)
+        self.rng = np.random.default_rng()
+        self.max_gen_perms = config['model']['perm_num'] // 2 if config['model']['perm_mirrored'] else config['model']['perm_num']
+        self.perm_forward = config['model']['perm_forward']
+        self.perm_mirrored = config['model']['perm_mirrored']
+        if config['model']['pretrained']:
+            self.weight_url = config['model']['weight_url']
+            self.load_weight(self.weight_url)
+        self.set_seed()
+
+    def set_seed(self, seed=42):
+        torch.manual_seed(seed)
+        np.random.seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed(seed)
+            torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+
+    def configure_optimizers(self):
+        agb = self.trainer.accumulate_grad_batches
+        lr_scale = agb * math.sqrt(self.trainer.num_devices) * self.batch_size / 256.0
+        lr = float(lr_scale) * float(self.lr)
+        optim = create_optimizer_v2(self, 'adamw', lr, self.weight_decay)
+        sched = OneCycleLR(
+            optim, lr, self.trainer.estimated_stepping_batches, pct_start=self.warm_pct, cycle_momentum=False
+        )
+        return {'optimizer': optim, 'lr_scheduler': {'scheduler': sched, 'interval': 'step'}}
+
+    def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer) -> None:
+        optimizer.zero_grad(set_to_none=True)
+
+    def forward(self, images, max_length=None):
+        return self.model.forward(self.tokenizer, images, max_length)
+
+    def gen_tgt_perms(self, tgt):
+        max_num_chars = tgt.shape[1] - 2
+        if max_num_chars == 1:
+            return torch.arange(3, device=self._device).unsqueeze(0)
+        perms = [torch.arange(max_num_chars, device=self._device)] if self.perm_forward else []
+        max_perms = math.factorial(max_num_chars)
+        if self.perm_mirrored:
+            max_perms //= 2
+        num_gen_perms = min(self.max_gen_perms, max_perms)
+
+        if max_num_chars < 5:
+
+            if max_num_chars == 4 and self.perm_mirrored:
+                selector = [0, 3, 4, 6, 9, 10, 12, 16, 17, 18, 19, 21]
+            else:
+                selector = list(range(max_perms))
+            perm_pool = torch.as_tensor(
+                list(permutations(range(max_num_chars), max_num_chars)),
+                device=self._device,
+            )[selector]
+            if self.perm_forward:
+                perm_pool = perm_pool[1:]
+            perms = torch.stack(perms)
+            if len(perm_pool):
+                i = self.rng.choice(len(perm_pool), size=num_gen_perms - len(perms), replace=False)
+                perms = torch.cat([perms, perm_pool[i]])
+        else:
+            perms.extend(
+                [torch.randperm(max_num_chars, device=self._device) for _ in range(num_gen_perms - len(perms))]
+            )
+            perms = torch.stack(perms)
+        if self.perm_mirrored:
+            comp = perms.flip(-1)
+            perms = torch.stack([perms, comp]).transpose(0, 1).reshape(-1, max_num_chars)
+        sos_idx = perms.new_zeros((len(perms), 1))
+        eos_idx = perms.new_full((len(perms), 1), max_num_chars + 1)
+        perms = torch.cat([sos_idx, perms + 1, eos_idx], dim=1)
+        if len(perms) > 1:
+            perms[1, 1:] = max_num_chars + 1 - torch.arange(max_num_chars + 1, device=self._device)
+        return perms
+
+    def generate_attn_masks(self, perm):
+        sz = perm.shape[0]
+        mask = torch.zeros((sz, sz), dtype=torch.bool, device=self._device)
+        for i in range(sz):
+            query_idx = perm[i]
+            masked_keys = perm[i + 1 :]
+            mask[query_idx, masked_keys] = True
+        content_mask = mask[:-1, :-1].clone()
+        mask[torch.eye(sz, dtype=torch.bool, device=self._device)] = True  # mask "self"
+        query_mask = mask[1:, :-1]
+        return content_mask, query_mask
+
+    def training_step(self, batch, batch_idx):
+        images, labels = batch
+        images = images.to(device)
+        tgt = labels.to(device)
+
+        memory = self.model.encode(images.to(device))
+        tgt_perms = self.gen_tgt_perms(tgt)
+        tgt_in = tgt[:, :-1]
+        tgt_out = tgt[:, 1:]
+        tgt_padding_mask = (tgt_in == self.pad_id) | (tgt_in == self.eos_id)
+
+        loss = 0
+        loss_numel = 0
+        n = (tgt_out != self.pad_id).sum().item()
+        for i, perm in enumerate(tgt_perms):
+            tgt_mask, query_mask = self.generate_attn_masks(perm)
+            out = self.model.decode(tgt_in, memory, tgt_mask, tgt_padding_mask, tgt_query_mask=query_mask)
+            logits = self.model.head(out).flatten(end_dim=1)
+            loss += n * F.cross_entropy(logits, tgt_out.flatten(), ignore_index=self.pad_id)
+            loss_numel += n
+            if i == 1:
+                tgt_out = torch.where(tgt_out == self.eos_id, self.pad_id, tgt_out)
+                n = (tgt_out != self.pad_id).sum().item()
+
+        loss /= loss_numel
+        with torch.no_grad():
+            self.eval()
+            logits, _, _ = self.forward_logits_loss(images, labels)
+            predicted_labels, _ = self.tokenizer.decode(logits.softmax(-1))
+            predicted_labels = [self.charset_adapter(label) for label in predicted_labels]
+            true_labels = self.decode(labels)
+            count = 0
+            for i in range(len(true_labels)):
+                if true_labels[i] == predicted_labels[i]:
+                    count += 1
+            train_acc = float(count / len(true_labels))
+        self.train()  # restore training mode after the eval-mode accuracy pass
+        self.log("train_loss", loss, on_epoch=True, prog_bar=True, logger=True)
+        self.log("train_acc", train_acc, on_epoch=True, prog_bar=True, logger=True)
+        return loss
+
+    def forward_logits_loss(self, images, targets: torch.Tensor):
+        targets = targets[:, 1:]
+        max_len = targets.shape[1] - 1
+        logits = self.forward(images, max_len)
+        loss = F.cross_entropy(logits.flatten(end_dim=1), targets.flatten(), ignore_index=self.pad_id)
+        loss_numel = (targets != self.pad_id).sum()
+        return logits, loss, loss_numel
+
+    def validation_step(self, batch, batch_idx):
+        self.eval()
+        images, labels = batch
+        with torch.no_grad():
+            logits, loss, loss_numel = self.forward_logits_loss(images, labels)
+            predicted_labels, _ = self.tokenizer.decode(logits.softmax(-1))
+            predicted_labels = [self.charset_adapter(label) for label in predicted_labels]
+            true_labels = self.decode(labels)
+            count = 0
+            for i in range(len(true_labels)):
+                if true_labels[i] == predicted_labels[i]:
+                    count += 1
+            val_acc = float(count / len(true_labels))
+        self.log("val_loss", loss / loss_numel, on_epoch=True, prog_bar=True, logger=True)
+        self.log("val_acc", val_acc, on_epoch=True, prog_bar=True, logger=True)
+
+    def on_train_epoch_end(self):
+        train_loss = self.trainer.callback_metrics["train_loss"].item()
+        train_acc = self.trainer.callback_metrics["train_acc"].item()
+        val_loss = self.trainer.callback_metrics["val_loss"].item()
+        val_acc = self.trainer.callback_metrics["val_acc"].item()
+        combined_acc = val_acc + 1e-1 * train_acc
+        self.log("combined_acc", combined_acc, prog_bar=False, logger=True)
+        print(f"Epoch {self.current_epoch}: train_loss = {train_loss:.3f}, train_acc = {train_acc:.3f}, val_loss = {val_loss:.3f}, val_acc = {val_acc:.3f}")
+
+    def load_weight(self, url):
+        state_dict = torch.hub.load_state_dict_from_url(url=url, map_location=device, check_hash=True)
+        self.model.load_state_dict(state_dict)
+        print("Loaded weights successfully!")
+
+    def decode(self, ids):
+        true_labels = []
+        if isinstance(ids, torch.Tensor):
+            ids = ids.tolist()
+        for label in ids:
+            true_label = self.tokenizer._ids2tok(label)
+            true_labels.append(self.charset_adapter(true_label))
+        return true_labels
parseq/utils.py
ADDED
@@ -0,0 +1,113 @@
+import re
+import torch
+from torch import Tensor
+import torch.nn as nn
+from typing import Sequence
+class CharsetAdapter:
+
+    def __init__(self):
+        super().__init__()
+        self.charset = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+        self.unsupported = re.compile(f'[^{re.escape(self.charset)}]')
+
+    def __call__(self, label):
+        label = label.upper()
+        label = self.unsupported.sub('', label)
+        return label
+
+
+class Vocab:
+    def __init__(self, charset):
+        self.c2i = dict()
+        self.c2i['<EOS>'] = 0
+        count = 1
+        for c in charset:
+            if c not in self.c2i.keys():
+                self.c2i[c] = count
+                count += 1
+        self.c2i['<SOS>'] = len(self.c2i)
+        self.c2i['<PAD>'] = len(self.c2i)
+
+        self.i2c = {v: k for k, v in self.c2i.items()}
+
+    def __len__(self):
+        return len(self.c2i)
+
+
+class Tokenizer:
+    def __init__(self, charset, max_len):
+
+        self.max_len = max_len
+        self.vocab = Vocab(charset)
+        self.sos_id = self.vocab.c2i['<SOS>']
+        self.eos_id = self.vocab.c2i['<EOS>']
+        self.pad_id = self.vocab.c2i['<PAD>']
+        self.special = [self.sos_id, self.eos_id, self.pad_id]
+
+    def __len__(self):
+        return len(self.vocab)
+
+    def _tok2ids(self, token: str):
+        return [self.vocab.c2i[c] for c in token]
+
+    def _ids2tok(self, token_ids, join=True):
+        if isinstance(token_ids, torch.Tensor):
+            token_ids = token_ids.tolist()
+        token = [self.vocab.i2c[i] for i in token_ids if i not in self.special]
+        return ''.join(token) if join else token
+
+
+
+    def encode_batch(self, labels: list[str], device):
+        encoded_labels = []
+        for label in labels:
+            encoded_label = [self.sos_id] + self._tok2ids(label) + [self.eos_id]
+            if len(encoded_label) > self.max_len:
+                encoded_label = encoded_label[: self.max_len]
+            else:
+                encoded_label = encoded_label + [self.pad_id] * (self.max_len - len(encoded_label))
+            encoded_labels.append(torch.tensor(encoded_label, dtype=torch.long, device=device))
+        return torch.stack(encoded_labels, dim=0)
+
+    def _filter(self, probs: Tensor, ids: Tensor):
+        ids = ids.tolist()
+        try:
+            eos_idx = ids.index(self.eos_id)
+        except ValueError:
+            eos_idx = len(ids)
+        ids = ids[: eos_idx]
+        probs = probs[: eos_idx + 1]
+        return probs, ids
+
+    def decode(self, token_dists: Tensor, raw: bool = False):
+
+        batch_tokens = []
+        batch_probs = []
+        for dist in token_dists:
+            probs, ids = dist.max(-1)
+            if not raw:
+                probs, ids = self._filter(probs, ids)
+            tokens = self._ids2tok(ids, not raw)
+            batch_tokens.append(tokens)
+            batch_probs.append(probs)
+        return batch_tokens, batch_probs
+
+
+def init_weights(module: nn.Module, name: str = '', exclude: Sequence[str] = ()):
+    if any(map(name.startswith, exclude)):
+        return
+    if isinstance(module, nn.Linear):
+        nn.init.trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Embedding):
+        nn.init.trunc_normal_(module.weight, std=0.02)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    elif isinstance(module, nn.Conv2d):
+        nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)):
+        nn.init.ones_(module.weight)
+        nn.init.zeros_(module.bias)
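A round-trip sketch for the tokenizer; the plate string is made up, and the charset here is just the plate alphabet rather than the full train_charset from config.yaml:

from parseq.utils import Tokenizer, CharsetAdapter

tok = Tokenizer('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ', max_len=25)

ids = tok.encode_batch(['51F12345'], device='cpu')   # (1, 25): <SOS>, char ids, <EOS>, then <PAD>
print(ids.shape)                                      # torch.Size([1, 25])
print(tok._ids2tok(ids[0]))                           # '51F12345', special tokens filtered out
print(CharsetAdapter()('51f-123.45'))                 # '51F12345', uppercased, unsupported chars stripped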
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+tensorflow==2.13.1
+torch==2.4.1
+gradio==4.44.1
+timm==1.0.9
+Pillow==10.2.0
+imgaug==0.4.0
+opencv-python==4.10.0
+torchvision==0.19.1
wpodnet/__init__.py
ADDED
@@ -0,0 +1,7 @@
+__version__ = '1.0.3'
+
+from .backend import Prediction, Predictor
+
+__all__ = [
+    'Prediction', 'Predictor'
+]
wpodnet/backend.py
ADDED
@@ -0,0 +1,157 @@
+from typing import List, Tuple
+
+import numpy as np
+import torch
+from PIL import Image, ImageDraw
+from torchvision.transforms.functional import to_tensor
+import cv2
+from .model import WPODNet
+
+
+class Prediction:
+    def __init__(self, image: Image.Image, bounds: np.ndarray, confidence: float):
+        self.image = image
+        self.bounds = bounds
+        self.confidence = confidence
+
+    def _get_width_height(self):
+        def distance(point1, point2):
+            x1 = point1[0]
+            y1 = point1[1]
+            x2 = point2[0]
+            y2 = point2[1]
+            distance = np.sqrt((x2 - x1)**2 + (y2 - y1)**2)
+            return distance
+        box = self.bounds
+        dis1 = distance(box[0], box[1])
+        dis2 = distance(box[1], box[2])
+        dis3 = distance(box[2], box[3])
+        dis4 = distance(box[3], box[0])
+        width = (dis1 + dis3) / 2
+        height = (dis2 + dis4) / 2
+        if height / width > 0.49:
+            return 64, 46
+        return 100, 23
+    def get_perspective_M(self, width: int, height: int) -> List[float]:
+        # Get the perspective matrix
+        src_points = np.array(self.bounds, dtype=np.float32)
+        dst_points = np.array([[0, 0], [width, 0], [width, height], [0, height]], np.float32)
+        return cv2.getPerspectiveTransform(src_points, dst_points)
+    def annotate(self, outline: str = 'red', width: int = 3) -> Image.Image:
+        canvas = self.image.copy()
+        drawer = ImageDraw.Draw(canvas)
+        drawer.polygon(
+            [(x, y) for x, y in self.bounds],
+            outline=outline,
+            width=width
+        )
+        return canvas
+
+    def warp(self):
+        # Get the perspective matrix
+        width, height = self._get_width_height()
+
+        M = self.get_perspective_M(width, height)
+
+        n_image = np.array(self.image)
+        warped = cv2.warpPerspective(n_image, M, (int(width), int(height)))
+        return warped
+
+
+class Predictor:
+    _q = np.array([
+        [-.5, .5, .5, -.5],
+        [-.5, -.5, .5, .5],
+        [1., 1., 1., 1.]
+    ])
+    _scaling_const = 7.75
+    _stride = 16
+
+    def __init__(self, wpodnet: WPODNet):
+        self.wpodnet = wpodnet
+        self.wpodnet.eval()
+
+    def _resize_to_fixed_ratio(self, image: Image.Image, dim_min: int, dim_max: int) -> Image.Image:
+        h, w = image.height, image.width
+
+        wh_ratio = max(h, w) / min(h, w)
+        side = int(wh_ratio * dim_min)
+        bound_dim = min(side + side % self._stride, dim_max)
+
+        factor = bound_dim / max(h, w)
+        reg_w, reg_h = int(w * factor), int(h * factor)
+
+        # Ensure both width and height are multiples of `self._stride`
+        reg_w_mod = reg_w % self._stride
+        if reg_w_mod > 0:
+            reg_w += self._stride - reg_w_mod
+
+        reg_h_mod = reg_h % self._stride
+        if reg_h_mod > 0:
+            reg_h += self._stride - reg_h % self._stride
+
+        return image.resize((reg_w, reg_h))
+
+    def _to_torch_image(self, image: Image.Image) -> torch.Tensor:
+        tensor = to_tensor(image)
+        return tensor.unsqueeze_(0)
+
+    def _inference(self, image: torch.Tensor) -> Tuple[np.ndarray, np.ndarray]:
+        with torch.no_grad():
+            probs, affines = self.wpodnet.forward(image)
+
+        # Convert to squeezed numpy arrays
+        # grid_w: the number of anchors per row
+        # grid_h: the number of anchors per column
+        probs = np.squeeze(probs.cpu().numpy())[0]  # (grid_h, grid_w)
+        affines = np.squeeze(affines.cpu().numpy())  # (6, grid_h, grid_w)
+
+        return probs, affines
+
+    def _get_max_anchor(self, probs: np.ndarray) -> Tuple[int, int]:
+        return np.unravel_index(probs.argmax(), probs.shape)
+
+    def _get_bounds(self, affines: np.ndarray, anchor_y: int, anchor_x: int, scaling_ratio: float = 1.0) -> np.ndarray:
+        # Compute theta
+        theta = affines[:, anchor_y, anchor_x]
+        theta = theta.reshape((2, 3))
+        theta[0, 0] = max(theta[0, 0], 0.0)
+        theta[1, 1] = max(theta[1, 1], 0.0)
+
+        # Convert theta into the bounding polygon
+        bounds = np.matmul(theta, self._q) * self._scaling_const * scaling_ratio
+
+        # Normalize the bounds
+        _, grid_h, grid_w = affines.shape
+        bounds[0] = (bounds[0] + anchor_x + .5) / grid_w
+        bounds[1] = (bounds[1] + anchor_y + .5) / grid_h
+
+        return np.transpose(bounds)
+
+    def predict(self, image: Image.Image, scaling_ratio: float = 1.0, dim_min: int = 288, dim_max: int = 608) -> Prediction:
+        orig_h, orig_w = image.height, image.width
+
+        # Resize the image to a fixed ratio
+        # This makes setting up the anchors convenient
+        resized = self._resize_to_fixed_ratio(image, dim_min=dim_min, dim_max=dim_max)
+        resized = self._to_torch_image(resized)
+        resized = resized.to(self.wpodnet.device)
+
+        # Inference with WPODNet
+        # probs: the probability distribution of the license plate location
+        # affines: the predicted affine matrices
+        probs, affines = self._inference(resized)
+
+        # Get the theta with maximum probability
+        max_prob = np.amax(probs)
+        anchor_y, anchor_x = self._get_max_anchor(probs)
+        bounds = self._get_bounds(affines, anchor_y, anchor_x, scaling_ratio)
+
+        bounds[:, 0] *= orig_w
+        bounds[:, 1] *= orig_h
+
+        return Prediction(
+            image=image,
+            bounds=bounds.astype(np.int32),
+            confidence=max_prob.item()
+        )
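backend.Predictor is the pure-PyTorch detection path; app.py instead goes through the Keras model in wpodnet/lib_detection.py. A usage sketch, assuming a WPODNet state dict converted to PyTorch is available at a hypothetical weights/wpodnet.pth and the input image exists locally:

import torch
from PIL import Image
from wpodnet.model import WPODNet
from wpodnet.backend import Predictor

net = WPODNet()
net.load_state_dict(torch.load('weights/wpodnet.pth', map_location='cpu'))  # hypothetical checkpoint
predictor = Predictor(net)

image = Image.open('car.jpg').convert('RGB')    # hypothetical test image
prediction = predictor.predict(image)
print(prediction.confidence)                    # best plate probability over the anchor grid
print(prediction.bounds)                        # 4x2 polygon in pixel coordinates
plate = prediction.warp()                       # rectified plate crop as a numpy array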
wpodnet/lib_detection.py
ADDED
@@ -0,0 +1,265 @@
+# pylint: disable=invalid-name, redefined-outer-name, missing-docstring, non-parent-init-called, trailing-whitespace, line-too-long
+from os.path import splitext
+import cv2
+import numpy as np
+from keras.models import load_model
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+class Label:
+    def __init__(self, cl=-1, tl=np.array([0., 0.]), br=np.array([0., 0.]), prob=None):
+        self.__tl = tl
+        self.__br = br
+        self.__cl = cl
+        self.__prob = prob
+
+    def __str__(self):
+        return 'Class: %d, top left(x: %f, y: %f), bottom right(x: %f, y: %f)' % (
+            self.__cl, self.__tl[0], self.__tl[1], self.__br[0], self.__br[1])
+
+    def copy(self):
+        return Label(self.__cl, self.__tl, self.__br)
+
+    def wh(self): return self.__br - self.__tl
+
+    def cc(self): return self.__tl + self.wh() / 2
+
+    def tl(self): return self.__tl
+
+    def br(self): return self.__br
+
+    def tr(self): return np.array([self.__br[0], self.__tl[1]])
+
+    def bl(self): return np.array([self.__tl[0], self.__br[1]])
+
+    def cl(self): return self.__cl
+
+    def area(self): return np.prod(self.wh())
+
+    def prob(self): return self.__prob
+
+    def set_class(self, cl):
+        self.__cl = cl
+
+    def set_tl(self, tl):
+        self.__tl = tl
+
+    def set_br(self, br):
+        self.__br = br
+
+    def set_wh(self, wh):
+        cc = self.cc()
+        self.__tl = cc - .5 * wh
+        self.__br = cc + .5 * wh
+
+    def set_prob(self, prob):
+        self.__prob = prob
+
+class DLabel(Label):
+    def __init__(self, cl, pts, prob):
+        self.pts = pts
+        tl = np.amin(pts, axis=1)
+        br = np.amax(pts, axis=1)
+        Label.__init__(self, cl, tl, br, prob)
+
+# Normalize the image to [0, 1]
+def im2single(Image):
+    return Image.astype('float32') / 255
+
+def getWH(shape):
+    return np.array(shape[1::-1]).astype(float)
+
+def IOU(tl1, br1, tl2, br2):
+    wh1, wh2 = br1-tl1, br2-tl2
+    assert((wh1 >= 0).all() and (wh2 >= 0).all())
+
+    intersection_wh = np.maximum(np.minimum(br1, br2) - np.maximum(tl1, tl2), 0)
+    intersection_area = np.prod(intersection_wh)
+    area1, area2 = (np.prod(wh1), np.prod(wh2))
+    union_area = area1 + area2 - intersection_area
+    return intersection_area/union_area
+
+def IOU_labels(l1, l2):
+    return IOU(l1.tl(), l1.br(), l2.tl(), l2.br())
+
+def nms(Labels, iou_threshold=0.5):
+    SelectedLabels = []
+    Labels.sort(key=lambda l: l.prob(), reverse=True)
+
+    for label in Labels:
+        non_overlap = True
+        for sel_label in SelectedLabels:
+            if IOU_labels(label, sel_label) > iou_threshold:
+                non_overlap = False
+                break
+
+        if non_overlap:
+            SelectedLabels.append(label)
+    return SelectedLabels
+
+def load_model_wpod(path):
+    model = load_model(path)
+    return model
+
+def find_T_matrix(pts, t_pts):
+    A = np.zeros((8, 9))
+    for i in range(0, 4):
+        xi = pts[:, i]
+        xil = t_pts[:, i]
+        xi = xi.T
+
+        A[i*2, 3:6] = -xil[2]*xi
+        A[i*2, 6:] = xil[1]*xi
+        A[i*2+1, :3] = xil[2]*xi
+        A[i*2+1, 6:] = -xil[0]*xi
+
+    [U, S, V] = np.linalg.svd(A)
+    H = V[-1, :].reshape((3, 3))
+    return H
+
+def getRectPts(a, b):
+    return np.array([[0, 0], [a, 0], [a, b], [0, b]], np.float32)
+
+def normal(pts, side, mn, MN):
+    pts_MN_center_mn = pts * side
+    pts_MN = pts_MN_center_mn + mn.reshape((2, 1))
+    pts_prop = pts_MN / MN.reshape((2, 1))
+    return pts_prop
+def get_bound(x, y):
+    bound = []
+    for i in range(0, len(x)):
+        point = [x[i], y[i]]
+        bound.append(point)
+    return bound
+def calculate_ratio(bound):
+    def distance(point1, point2):
+        x1 = point1[0]
+        y1 = point1[1]
+        x2 = point2[0]
+        y2 = point2[1]
+        distance = np.sqrt((x2 - x1)**2 + (y2 - y1)**2)
+        return distance
+    box = bound
+    dis1 = distance(box[0], box[1])
+    dis2 = distance(box[1], box[2])
+    dis3 = distance(box[2], box[3])
+    dis4 = distance(box[3], box[0])
+    width = (dis1 + dis3) / 2
+    height = (dis2 + dis4) / 2
+    ratio = height / width
+    if ratio > 0.55:
+        return 2
+    return 1
+# Reconstruct plates from the predicted values: crop the plates out of the source image and build the labels
+def reconstruct(I, Iresized, Yr, lp_threshold):
+    bounds = []
+    # 4 max-pooling layers, stride = 2
+    net_stride = 2**4
+    side = ((208 + 40)/2)/net_stride
+
+    # one line and two lines license plate size
+    one_line = (100, 23)
+    two_lines = (64, 46)
+
+    Probs = Yr[..., 0]
+    Affines = Yr[..., 2:]
+
+    xx, yy = np.where(Probs > lp_threshold)
+    # CNN input image size
+    WH = getWH(Iresized.shape)
+    # output feature map size
+    MN = WH/net_stride
+
+    vxx = vyy = 0.5  # alpha
+    base = lambda vx, vy: np.matrix([[-vx, -vy, 1], [vx, -vy, 1], [vx, vy, 1], [-vx, vy, 1]]).T
+    labels = []
+    labels_frontal = []
+
+    for i in range(len(xx)):
+        x, y = xx[i], yy[i]
+        affine = Affines[x, y]
+        prob = Probs[x, y]
+
+        mn = np.array([float(y) + 0.5, float(x) + 0.5])
+
+        # affine transformation matrix
+        A = np.reshape(affine, (2, 3))
+        A[0, 0] = max(A[0, 0], 0)
+        A[1, 1] = max(A[1, 1], 0)
+        # identity transformation
+        B = np.zeros((2, 3))
+        B[0, 0] = max(A[0, 0], 0)
+        B[1, 1] = max(A[1, 1], 0)
+
+        pts = np.array(A*base(vxx, vyy))
+        pts_frontal = np.array(B*base(vxx, vyy))
+
+        pts_prop = normal(pts, side, mn, MN)
+        frontal = normal(pts_frontal, side, mn, MN)
+
+        labels.append(DLabel(0, pts_prop, prob))
+        labels_frontal.append(DLabel(0, frontal, prob))
+
+    final_labels = nms(labels, 0.1)
+    final_labels_frontal = nms(labels_frontal, 0.1)
+    if (len(final_labels_frontal) > 0):
+
+
+        # LP size and type
+        #out_size, lp_type = (two_lines, 2) if ((final_labels_frontal[0].wh()[1] / final_labels_frontal[0].wh()[1]) > 0.49) else (one_line, 1)
+        lp_type = 0
+        TLp = []
+        if len(final_labels):
+            final_labels.sort(key=lambda x: x.prob(), reverse=True)
+            for _, label in enumerate(final_labels):
+                ptsh = np.concatenate((label.pts * getWH(I.shape).reshape((2, 1)), np.ones((1, 4))))
+                bound = get_bound(ptsh[0], ptsh[1])
+                pts = np.array(bound, dtype=np.float32)
+                bounds.append(bound)
+                lp_type = calculate_ratio(bound)
+                if lp_type == 2:
+                    out_size = two_lines
+                else: out_size = one_line
+                t_ptsh = getRectPts(out_size[0], out_size[1])
+                H = cv2.getPerspectiveTransform(pts, t_ptsh)
+                Ilp = cv2.warpPerspective(I, H, (int(out_size[0]), int(out_size[1])))
+                TLp.append(Ilp)
+
+        return final_labels, TLp, lp_type, bounds
+    else:
+        return None, [], None, None
+def detect_lp(model, I, lp_threshold):
+    Dmax = 350
+    Dmin = 288
+
+    # Get the ratio between W and H of the image and derive the bound on its smaller side
+    ratio = float(max(I.shape[:2])) / min(I.shape[:2])
+    side = int(ratio * Dmin)
+    max_dim = min(side, Dmax)
+    I = im2single(I)
+    # Compute the resize factor
+    min_dim_img = min(I.shape[:2])
+    factor = float(max_dim) / min_dim_img
+
+    # Compute the new W and H after resizing
+    w, h = (np.array(I.shape[1::-1], dtype=float) * factor).astype(int).tolist()
+
+    # Resize the image
+    Iresized = cv2.resize(I, (w, h))
+
+    T = Iresized.copy()
+
+    # Reshape into a batch of one
+    T = T.reshape((1, T.shape[0], T.shape[1], T.shape[2]))
+
+    # Detect the plate with the pretrained WPOD-Net
+    Yr = model.predict(T, verbose=0)
+
+    # Remove the size-1 dimensions of Yr
+    Yr = np.squeeze(Yr)
+
+
+
+    # Reconstruct and return: labels, plate images, plate type (1: one-line, 2: square), bounds
+    L, TLp, lp_type, bounds = reconstruct(I, Iresized, Yr, lp_threshold)
+    return L, TLp, lp_type, bounds
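The Keras path used by app.py, reduced to the essentials; the image path is hypothetical and weights/wpod-net.h5 has to be downloaded separately since weights/ is git-ignored:

import cv2
from wpodnet.lib_detection import load_model_wpod, detect_lp

wpod_net = load_model_wpod('weights/wpod-net.h5')
frame = cv2.imread('car.jpg')                          # hypothetical test image
labels, plates, lp_type, bounds = detect_lp(wpod_net, frame, 0.5)
if plates:                                             # empty list when nothing clears the threshold
    crop = (plates[0] * 255).astype('uint8')           # detect_lp works on [0, 1] float images
    cv2.imwrite('plate_crop.jpg', crop)
    print('plate type:', lp_type)                      # 1: one-line plate, 2: two-line (square) plate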
wpodnet/model.py
ADDED
@@ -0,0 +1,73 @@
+import torch
+import torch.nn as nn
+
+
+class BasicConvBlock(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int):
+        super(BasicConvBlock, self).__init__()
+        self.conv_layer = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
+        self.bn_layer = nn.BatchNorm2d(out_channels, momentum=0.99, eps=0.001)
+        self.act_layer = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        x = self.conv_layer(x)
+        x = self.bn_layer(x)
+        return self.act_layer(x)
+
+
+class ResBlock(nn.Module):
+    def __init__(self, channels: int):
+        super(ResBlock, self).__init__()
+        self.conv_block = BasicConvBlock(channels, channels)
+        self.sec_layer = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
+        self.bn_layer = nn.BatchNorm2d(channels, momentum=0.99, eps=0.001)
+        self.act_layer = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        h = self.conv_block(x)
+        h = self.sec_layer(h)
+        h = self.bn_layer(h)
+        return self.act_layer(x + h)
+
+
+class WPODNet(nn.Module):
+    def __init__(self):
+        super(WPODNet, self).__init__()
+        self.backbone = nn.Sequential(
+            BasicConvBlock(3, 16),
+            BasicConvBlock(16, 16),
+            nn.MaxPool2d(2),
+            BasicConvBlock(16, 32),
+            ResBlock(32),
+            nn.MaxPool2d(2),
+            BasicConvBlock(32, 64),
+            ResBlock(64),
+            ResBlock(64),
+            nn.MaxPool2d(2),
+            BasicConvBlock(64, 64),
+            ResBlock(64),
+            ResBlock(64),
+            nn.MaxPool2d(2),
+            BasicConvBlock(64, 128),
+            ResBlock(128),
+            ResBlock(128),
+            ResBlock(128),
+            ResBlock(128)
+        )
+        self.prob_layer = nn.Conv2d(128, 2, kernel_size=3, padding=1)
+        self.bbox_layer = nn.Conv2d(128, 6, kernel_size=3, padding=1)
+
+        # Register a dummy buffer to retrieve the device the module is attached to
+        self.register_buffer('dummy', torch.Tensor(), persistent=False)
+
+    @property
+    def device(self) -> torch.device:
+        return self.dummy.device
+
+    def forward(self, image: torch.Tensor):
+        feature: torch.Tensor = self.backbone(image)
+        probs: torch.Tensor = self.prob_layer(feature)
+        probs = torch.softmax(probs, dim=1)
+        affines: torch.Tensor = self.bbox_layer(feature)
+
+        return probs, affines
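A quick shape check: the backbone has four stride-2 max-pools, so the total stride is 16 and a 224x224 input yields a 14x14 anchor grid (matching Predictor._stride = 16 in wpodnet/backend.py):

import torch
from wpodnet.model import WPODNet

net = WPODNet().eval()
with torch.no_grad():
    probs, affines = net(torch.rand(1, 3, 224, 224))
print(probs.shape)     # torch.Size([1, 2, 14, 14]), softmaxed plate / non-plate scores per anchor
print(affines.shape)   # torch.Size([1, 6, 14, 14]), the 2x3 affine parameters per anchor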
wpodnet/stream.py
ADDED
@@ -0,0 +1,36 @@
+from pathlib import Path
+from typing import Generator, Union
+
+from PIL import Image
+
+
+class ImageStreamer:
+    def __init__(self, image_or_folder: Union[str, Path]):
+        path = Path(image_or_folder)
+        self.generator = self._get_image_generator(path)
+
+    def _get_image_generator(self, path: Path) -> Generator[Image.Image, None, None]:
+        if path.is_file():
+            image_paths = [path] if self._is_image_file(path) else []
+        elif path.is_dir():
+            image_paths = [
+                p
+                for p in path.rglob('**/*')
+                if self._is_image_file(p)
+            ]
+        else:
+            raise TypeError(f'Invalid path to images {path}')
+
+        for p in image_paths:
+            yield Image.open(p)
+
+    def _is_image_file(self, path: Path) -> bool:
+        try:
+            image = Image.open(path)
+            image.verify()
+            return True
+        except Exception:
+            return False
+
+    def __iter__(self):
+        return self.generator
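ImageStreamer is not used by app.py, but it gives a lazy iterator over every readable image under a path. A short sketch; the Test image folder is git-ignored, so it has to exist locally:

from wpodnet.stream import ImageStreamer

for image in ImageStreamer('Test image'):      # also accepts a single image path
    print(image.filename, image.size)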