Add application file
- .gitignore +3 -0
- app.py +58 -0
- data/address_matrix_all_1.pt +3 -0
- data/standard_address_all_1.json +0 -0
- norm_typing.py +203 -0
- parameters.py +7 -0
- requirements.txt +7 -0
- siameser.py +91 -0
- utils.py +98 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
+__pycache__
+flagged
+embedding-model
app.py
ADDED
@@ -0,0 +1,58 @@
+import gradio as gr
+import siameser
+import norm_typing
+import logging
+
+# Load the model once at startup; 'all' selects the nationwide standard-address set.
+std = siameser.Siameser(standard_scope='all')
+count = 0
+
+
+def standardize(raw_address):
+    global count
+    # Normalize tone-mark placement (old style) before matching.
+    raw_address = norm_typing.norm_vietnamese_sentence_accent(raw_address)
+    top_1, top_5 = std.get_top_k(raw_address, 5)
+    count += 1
+    # logging.info(f'Request {count}')
+    if count % 10 == 9:
+        print(f'Request: {count}')
+    return top_1, top_5
+
+
+demo = gr.Interface(
+    fn=standardize,
+    inputs=gr.Textbox(label='raw address', lines=1, placeholder="Nhập địa chỉ thô"),
+    outputs=[gr.JSON(label='standard address'), gr.JSON(label='top 5 standard addresses')],
+    allow_flagging='auto',
+    title='Chuẩn hóa địa chỉ tiếng Việt',
+    description='Công cụ sử dụng để chuẩn hóa địa chỉ tiếng Việt. <br> \
+        Nhập vào 1 câu địa chỉ thô (ví dụ ở dưới), mô hình sẽ chuẩn hóa thành địa chỉ chuẩn, dưới dạng json, gồm 2 phần: <br> \
+        * Địa chỉ chi tiết (detail address): thông tin về số nhà, ngõ ngách, hẻm,... được cung cấp trong địa chỉ thô. <br> \
+        * Địa chỉ chính (main address): hiển thị dưới dạng dict, gồm tối đa 3 trên 4 trường thông tin: đường/phố, phường/xã, quận/huyện, tỉnh/thành phố. <br>\
+        * Trong trường hợp địa chỉ thô xuất hiện cả tên đường và phường, thì địa chỉ chính chỉ chứa tên đường mà không cần phường (vì như thế đã đủ để xác định vị trí rồi). <br>',
+    examples=['1 dong khoi str., dist. 1 ,hcmc',
+              '112/21 bạch đằng, p.2, tân bình, tp. hồ chí minh',
+              'văn phòng và căn hộ cao cấp licogi 13 tower , thanh xuân , hn',
+              'dablend hostel, 417/2 hoà hảo, phường 5, quận 10, hồ chí minh, vietnam',
+              '17-05, tower 4,the sun avenue, 28 mai chi tho, district 2, ho chi minh city'
+              ],
+    article='Contact<br>Email: [email protected]'
+)
+
+demo.launch()
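
For reference, a minimal sketch of how the interface function can be exercised without the Gradio UI. This snippet is not part of the commit; it assumes the data/ files below are checked out and the embedding model can be downloaded:

    import siameser
    import norm_typing

    std = siameser.Siameser(standard_scope='all')
    raw = norm_typing.norm_vietnamese_sentence_accent('1 dong khoi str., dist. 1 ,hcmc')
    top_1, top_5 = std.get_top_k(raw, 5)
    print(top_1)       # best match: detail_address, main_address, similarity_score
    print(len(top_5))  # 5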
data/address_matrix_all_1.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:977558e6e8a75b5513f41f1e63f3bf6f3749ce9a74b5bee5ee160334c1185a68
+size 215544807
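
This .pt file is a Git LFS pointer; the real payload (~215 MB) is fetched on checkout. As consumed by siameser.py below, it deserializes to a dict holding two embedding matrices (a sketch, assuming an LFS-enabled clone):

    import torch

    emb = torch.load('data/address_matrix_all_1.pt', map_location='cpu')
    accent = emb['accent_matrix']      # embeddings of accented standard addresses
    noaccent = emb['noaccent_matrix']  # embeddings of their accent-stripped forms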
data/standard_address_all_1.json
ADDED
The diff for this file is too large to render.
norm_typing.py
ADDED
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+"""
+source: https://github.com/langmaninternet/VietnameseTextNormalizer
+"""
+
+
+import regex as re
+
+uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
+unsignChars = "aaaaaaaaaaaaaaaaaeeeeeeeeeeediiiiiooooooooooooooooouuuuuuuuuuuyyyyyAAAAAAAAAAAAAAAAAEEEEEEEEEEEDIIIOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUYYYYYAADOOU"
+
+
+def loaddicchar():
+    # Map legacy Windows-1252-composed Vietnamese characters to their UTF-8 forms.
+    dic = {}
+    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split('|')
+    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split('|')
+    for i in range(len(char1252)):
+        dic[char1252[i]] = charutf8[i]
+    return dic
+
+
+dicchar = loaddicchar()
+
+
+def convert_unicode(txt):
+    return re.sub(
+        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
+        lambda x: dicchar[x.group()], txt)
+
+
+"""
+Start section: convert a sentence to telex typing style (as typed without Unikey enabled).
+Example: thủy = thuyr, tượng = tuwowngj
+"""
+bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
+                  ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
+                  ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
+                  ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
+                  ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
+                  ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
+                  ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
+                  ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
+                  ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
+                  ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
+                  ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
+                  ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]
+bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']
+
+nguyen_am_to_ids = {}
+
+for i in range(len(bang_nguyen_am)):
+    for j in range(len(bang_nguyen_am[i]) - 1):
+        nguyen_am_to_ids[bang_nguyen_am[i][j]] = (i, j)
+
+
+def vn_word_to_telex_type(word):
+    dau_cau = 0
+    new_word = ''
+    for char in word:
+        x, y = nguyen_am_to_ids.get(char, (-1, -1))
+        if x == -1:
+            new_word += char
+            continue
+        if y != 0:
+            dau_cau = y
+        new_word += bang_nguyen_am[x][-1]
+    new_word += bang_ky_tu_dau[dau_cau]
+    return new_word
+
+
+def vn_sentence_to_telex_type(sentence):
+    """
+    Convert an accented Vietnamese sentence to telex typing style.
+    :param sentence:
+    :return:
+    """
+    words = sentence.split()
+    for index, word in enumerate(words):
+        words[index] = vn_word_to_telex_type(word)
+    return ' '.join(words)
+
+
+"""
+End section: convert a sentence to telex typing style (as typed without Unikey enabled).
+"""
+
+"""
+Start section: normalize to the old-style tone placement: òa/úy instead of oà/uý.
+See: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF
+"""
+
+
+def norm_vietnamese_word_accent(word):
+    if not is_valid_vietnam_word(word):
+        return word
+
+    chars = list(word)
+    dau_cau = 0
+    nguyen_am_index = []
+    qu_or_gi = False
+    for index, char in enumerate(chars):
+        x, y = nguyen_am_to_ids.get(char, (-1, -1))
+        if x == -1:
+            continue
+        elif x == 9:  # check 'qu'
+            if index != 0 and chars[index - 1] == 'q':
+                chars[index] = 'u'
+                qu_or_gi = True
+        elif x == 5:  # check 'gi'
+            if index != 0 and chars[index - 1] == 'g':
+                chars[index] = 'i'
+                qu_or_gi = True
+        if y != 0:
+            dau_cau = y
+            chars[index] = bang_nguyen_am[x][0]
+        if not qu_or_gi or index != 1:
+            nguyen_am_index.append(index)
+    if len(nguyen_am_index) < 2:
+        if qu_or_gi:
+            if len(chars) == 2:
+                x, y = nguyen_am_to_ids.get(chars[1])
+                chars[1] = bang_nguyen_am[x][dau_cau]
+            else:
+                x, y = nguyen_am_to_ids.get(chars[2], (-1, -1))
+                if x != -1:
+                    chars[2] = bang_nguyen_am[x][dau_cau]
+                else:
+                    chars[1] = bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else bang_nguyen_am[9][dau_cau]
+            return ''.join(chars)
+        return word
+
+    for index in nguyen_am_index:
+        x, y = nguyen_am_to_ids[chars[index]]
+        if x == 4 or x == 8:  # ê, ơ carry the tone mark
+            chars[index] = bang_nguyen_am[x][dau_cau]
+            return ''.join(chars)
+
+    if len(nguyen_am_index) == 2:
+        if nguyen_am_index[-1] == len(chars) - 1:
+            x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
+            chars[nguyen_am_index[0]] = bang_nguyen_am[x][dau_cau]
+        else:
+            x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
+            chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
+    else:
+        x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
+        chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
+    return ''.join(chars)
+
+
+def is_valid_vietnam_word(word):
+    chars = list(word)
+    nguyen_am_index = -1
+    for index, char in enumerate(chars):
+        x, y = nguyen_am_to_ids.get(char, (-1, -1))
+        if x != -1:
+            if nguyen_am_index == -1:
+                nguyen_am_index = index
+            else:
+                if index - nguyen_am_index != 1:
+                    return False
+                nguyen_am_index = index
+    return True
+
+
+def norm_vietnamese_sentence_accent(sentence):
+    """
+    Normalize a Vietnamese sentence to the old-style tone-mark placement.
+    :param sentence:
+    :return:
+    """
+    sentence = sentence.lower()
+    words = sentence.split()
+    for index, word in enumerate(words):
+        cw = re.sub(r'(^\p{P}*)([\p{L}.]*\p{L}+)(\p{P}*$)', r'\1?\2?\3', word).split('?')
+        if len(cw) == 3:
+            cw[1] = norm_vietnamese_word_accent(cw[1])
+        words[index] = ''.join(cw)
+    return ' '.join(words)
+
+
+"""
+End section: normalize to the old-style tone placement: òa/úy instead of oà/uý.
+See: https://vi.wikipedia.org/wiki/Quy_tắc_đặt_dấu_thanh_trong_chữ_quốc_ngữ
+"""
+if __name__ == '__main__':
+    print(norm_vietnamese_sentence_accent('anh Hoà, đang làm.. gì'))
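
A quick sanity check of the normalizer, extending the module's own __main__ example (expected outputs shown as comments; note the function also lowercases its input):

    import norm_typing

    print(norm_typing.norm_vietnamese_sentence_accent('anh Hoà, đang làm.. gì'))
    # expected: 'anh hòa, đang làm.. gì'   (oà -> òa)
    print(norm_typing.norm_vietnamese_sentence_accent('Thuý Quỳnh'))
    # expected: 'thúy quỳnh'               (uý -> úy)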
parameters.py
ADDED
@@ -0,0 +1,7 @@
+# transformer model
+embedding_model = 'CaoHaiNam/vietnamese-address-embedding'
+local_embedding_model = 'embedding-model'
+
+
+NORM_ADDS_FILE_ALL_1 = 'data/standard_address_all_1.json'
+STD_EMBEDDING_FILE_ALL_1 = 'data/address_matrix_all_1.pt'
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+numpy
+sentence-transformers
+transformers
+tqdm
+torch
+flake8
+gradio
siameser.py
ADDED
@@ -0,0 +1,91 @@
+import unicodedata
+import torch
+import utils
+import parameters
+import json
+from sentence_transformers import SentenceTransformer
+import torch.nn.functional as F
+
+device = torch.device('cpu')
+
+
+class Siameser:
+    def __init__(self, model_name=None, standard_scope=None):
+        print('Load sentence embedding model (the first run may take a while to download it)')
+        self.threshold = 0.61
+        self.embedding_model = SentenceTransformer(parameters.embedding_model).to(device)
+
+        if standard_scope == 'all':
+            print('Load standard address')
+            with open(file=parameters.NORM_ADDS_FILE_ALL_1, mode='r', encoding='utf-8') as f:
+                self.NORM_ADDS = json.load(fp=f)
+
+            print('Load standard address matrix')
+            embedding = torch.load(parameters.STD_EMBEDDING_FILE_ALL_1, map_location=device)
+            self.std_embeddings = embedding['accent_matrix'].to(device)
+            self.NT_std_embeddings = embedding['noaccent_matrix'].to(device)
+        else:
+            # This branch expects NORM_ADDS_FILE_HN_HCM and STD_EMBEDDING_FILE_HN_HCM
+            # to be defined in parameters.py; only the 'all' scope ships with this commit.
+            print('Load standard address')
+            with open(file=parameters.NORM_ADDS_FILE_HN_HCM, mode='r', encoding='utf-8') as f:
+                self.NORM_ADDS = json.load(fp=f)
+
+            print('Load standard address matrix')
+            embedding = torch.load(parameters.STD_EMBEDDING_FILE_HN_HCM, map_location=device)
+            self.std_embeddings = embedding['accent_matrix'].to(device)
+            self.NT_std_embeddings = embedding['noaccent_matrix'].to(device)
+
+        self.num_std_add = self.std_embeddings.shape[0]
+        print('Done')
+
+    def standardize(self, raw_add_):
+        raw_add = unicodedata.normalize('NFC', raw_add_).lower()
+        raw_add = utils.remove_punctuation(raw_add)
+        raw_add_vector = self.embedding_model.encode(raw_add, convert_to_tensor=True).to(device)
+        raw_add_vectors = raw_add_vector.repeat(self.num_std_add, 1)
+        # Match against the no-accent matrix when the input carries no diacritics.
+        if raw_add == utils.remove_accent(raw_add):
+            score = F.cosine_similarity(raw_add_vectors, self.NT_std_embeddings)
+        else:
+            score = F.cosine_similarity(raw_add_vectors, self.std_embeddings)
+        s, top_k = score.topk(1)
+        s, idx = s.tolist()[0], top_k.tolist()[0]
+        if s < self.threshold:
+            return {'Format Error': 'Xâu truyền vào không phải địa chỉ, mời nhập lại.'}
+        std_add = self.NORM_ADDS[str(idx)]
+        return utils.get_full_result(raw_add_, std_add, round(s, 4))
+
+    def get_top_k(self, raw_add_, k):
+        raw_add = unicodedata.normalize('NFC', raw_add_).lower()
+        raw_add = utils.remove_punctuation(raw_add)
+        raw_add_vector = self.embedding_model.encode(raw_add, convert_to_tensor=True).to(device)
+        raw_add_vectors = raw_add_vector.repeat(self.num_std_add, 1)
+        if raw_add == utils.remove_accent(raw_add):
+            score = F.cosine_similarity(raw_add_vectors, self.NT_std_embeddings)
+        else:
+            score = F.cosine_similarity(raw_add_vectors, self.std_embeddings)
+        s, top_k = score.topk(k)
+        s, top_k = s.tolist(), top_k.tolist()
+
+        if s[0] < self.threshold:
+            return {'Format Error': 'Dường như xâu truyền vào không phải địa chỉ, mời nhập lại.'}, {}
+
+        top_std_adds = []
+        for score, idx in zip(s, top_k):
+            std_add = self.NORM_ADDS[str(idx)]
+            top_std_adds.append(utils.get_full_result(raw_add_, std_add, round(score, 4)))
+
+        return top_std_adds[0], top_std_adds
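
The scoring step above is a standard dense-retrieval pattern: one query vector against a precomputed candidate matrix, cosine similarity per row, then topk. A self-contained toy version, with random tensors standing in for the real embeddings:

    import torch
    import torch.nn.functional as F

    num_candidates, dim = 1000, 768                    # illustrative shapes only
    std_embeddings = torch.randn(num_candidates, dim)  # stands in for the saved matrix
    query = torch.randn(dim)                           # stands in for one encoded address

    # Same pattern as standardize()/get_top_k(): tile the query, score every row, take topk.
    scores = F.cosine_similarity(query.repeat(num_candidates, 1), std_embeddings)
    values, indices = scores.topk(5)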
utils.py
ADDED
@@ -0,0 +1,98 @@
+import re
+import string
+
+# Vietnamese base alphabet (kept for reference; accents are stripped via RT below).
+alphabet = ['a', 'ă', 'â', 'b', 'c', 'd',
+            'đ', 'e', 'ê', 'g', 'h', 'i',
+            'k', 'l', 'm', 'n', 'o', 'ô',
+            'ơ', 'p', 'q', 'r', 's', 't',
+            'u', 'ư', 'v', 'x', 'y',
+            ]
+
+tone = {
+    'á, à, ã, ạ, ả, ấ, ầ, ẫ, ậ, ẩ, ắ, ằ, ẵ, ặ, ẳ, â, ă': 'a',
+    'ó, ò, õ, ọ, ỏ, ố, ồ, ỗ, ộ, ổ, ớ, ờ, ỡ, ợ, ở, ơ, ô': 'o',
+    'é, è, ẽ, ẹ, ẻ, ế, ề, ễ, ệ, ể, ê': 'e',
+    'í, ì, ĩ, ị, ỉ': 'i',
+    'ú, ù, ũ, ụ, ủ, ứ, ừ, ự, ử, ữ, ư': 'u',
+    'đ': 'd',
+    'ý, ỳ, ỹ, ỵ, ỷ': 'y'
+}
+
+# Reverse table: every accented character -> its base letter.
+RT = {}
+for i in tone.items():
+    for j in i[0]:
+        if j == ',' or j == ' ':
+            continue
+        RT[j] = i[1]
+
+
+def remove_accent(text):
+    res = ''
+    for char in text:
+        res += RT[char] if char in RT else char
+    return res
+
+
+def remove_punctuation(text):
+    punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
+    whitespace = ' '
+    for i in text:
+        if i in punctuation:
+            text = text.replace(i, whitespace)
+    return ' '.join(text.split())
+
+
+def clean_text(text):
+    text = text.encode("ascii", errors="ignore").decode(
+        "ascii"
+    )  # remove non-ascii characters
+    text = re.sub(r"http\S+", "", text)
+    text = re.sub(r"\n", " ", text)
+    text = re.sub(r"\t", " ", text)
+    text = re.sub(
+        " +", " ", text
+    ).strip()  # collapse multiple spaces into one
+    return text
+
+
+def remove_prefix(address):
+    if address != remove_accent(address):
+        return re.sub('(tỉnh |thành phố |huyện |thị trấn |thị xã |phường |xã |quận |đường |phố |tp )', '', address, flags=re.IGNORECASE).strip()
+    return re.sub('(tinh |thanh pho |huyen |thi tran |thi xa |phuong |xa |quan |duong |pho |tp )', '', address, flags=re.IGNORECASE).strip()
+
+
+def clean_detail_address(detail_address):
+    detail_address = remove_prefix(detail_address)
+    try:
+        if detail_address[-1] in string.punctuation:
+            detail_address = detail_address[:-1]
+    except IndexError:
+        pass
+    return detail_address
+
+
+def get_detail_address(address, std_address):
+    address = address.lower()
+    # The detail part is whatever precedes the first token of the standard address.
+    split_token = list(std_address.values())[0].split()[0]
+    if address == remove_accent(address):
+        split_token = remove_accent(split_token)
+    detail_address = address.split(split_token)[0]
+    if detail_address == address:
+        return ''
+    detail_address = clean_detail_address(detail_address)
+    return detail_address
+
+
+def get_full_result(raw_address, std_address, score):
+    full_result = dict()
+    full_result['detail_address'] = get_detail_address(raw_address, std_address)
+    full_result['main_address'] = std_address
+    full_result['similarity_score'] = score
+    return full_result
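
For illustration, how these helpers compose; the standard-address dict below is hypothetical (real entries come from data/standard_address_all_1.json):

    import utils

    std = {'street': 'bạch đằng', 'district': 'tân bình', 'city': 'hồ chí minh'}  # hypothetical entry
    raw = '112/21 bạch đằng, p.2, tân bình, tp. hồ chí minh'

    print(utils.remove_accent(raw))
    # '112/21 bach dang, p.2, tan binh, tp. ho chi minh'
    print(utils.get_full_result(raw, std, 0.93))
    # {'detail_address': '112/21', 'main_address': {...}, 'similarity_score': 0.93}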