CaoHaiNam committed on
Commit
3a379e2
·
1 Parent(s): a8ca718

Add application file

Browse files
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __pycache__
2
+ flagged
3
+ embedding-model
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import siameser
3
+ import norm_typing
4
+ import json
5
+ import logging
6
+ from datetime import datetime
7
+
8
+ # print(1)
9
# Build the address standardizer once at import time; `standardize` below
# reuses this single instance for every request.
# NOTE: `stadard_scope` (sic) is the keyword spelling declared by
# siameser.Siameser.__init__, so it must be spelled this way here.
std = siameser.Siameser(stadard_scope='all')
# Number of requests handled so far; incremented inside `standardize`.
count = 0
11
+
12
def standardize(raw_address):
    """Standardize one raw address string for the Gradio interface.

    The input is first passed through
    ``norm_typing.norm_vietnamese_sentence_accent`` and then matched against
    the standard addresses with ``std.get_top_k``.

    :param raw_address: free-text address typed by the user.
    :return: a ``(top_1, top_5)`` tuple exactly as returned by
        ``std.get_top_k(raw_address, 5)``.
    """
    global count
    raw_address = norm_typing.norm_vietnamese_sentence_accent(raw_address)
    # A single get_top_k call yields both the best match and the top-5 list.
    # The previous extra std.standardize() call encoded and scored the same
    # input a second time and its result was discarded.
    top_1, top_5 = std.get_top_k(raw_address, 5)
    count += 1
    # Lightweight progress trace: fires on requests 9, 19, 29, ...
    if count % 10 == 9:
        print(f'Request: {count}')
    return top_1, top_5
27
+
28
+
29
# Gradio UI definition: one text box in, two JSON panels out (the best match
# and the list of top-5 candidates returned by `standardize`).
# NOTE(review): the description below mentions "detail address" / "main
# address", while utils.get_full_result emits the keys `detail_address`,
# `main_address` and `similarity_score`.
demo = gr.Interface(
    # fn=test,
    fn=standardize,
    inputs=gr.Textbox(label='raw address', lines=1, placeholder="Nhập địa chỉ thô"),
    outputs=[gr.JSON(label='stadard address'), gr.JSON(label='top 5 standard addresses')],
    allow_flagging='auto',
    title='Chuẩn hóa địa chỉ tiếng Việt',
    description='Công cụ sử dụng để chuẩn hóa địa chỉ tiếng Việt. <br> \
Nhập vào 1 câu địa chỉ thô (ví dụ ở dưới), mô hình sẽ chuẩn hóa thành địa chỉ chuẩn, dưới dạng json, gồm 2 phần: <br> \
* Địa chỉ chi tiết (detail address): thông tin về số nhà, ngõ ngách, hẻm,... được cung cấp trong địa chỉ thô. <br> \
* Địa chỉ chính (main address): hiển thị dưới dạng dict, gồm tối đa 3 trên 4 trường thông tin: đường/phố, phường/xã, quận/huyện, tỉnh/thành phố. <br>\
* Trong trường hợp địa chỉ thô xuất hiện cả tên đường và phường, thì địa chỉ chính chỉ chứa tên đường mà không cần phường (vì như thế đã đủ để xác định ví trí rồi). <br>',
    examples=['1 dong khoi str., dist. 1 ,hcmc',
              '112/21 bạch đằng, p.2, tân bình, tp. hồ chí minh',
              'văn phòng và căn hộ cao cấp licogi 13 tower , thanh xuân , hn',
              'dablend hostel, 417/2 hoà hảo, phường 5, quận 10, hồ chí minh, vietnam',
              '17-05, tower 4,the sun avenue, 28 mai chi tho, district 2, ho chi minh city'
              ],
    # article='Contact<br>Email: [email protected] <br> Facebook: https://www.facebook.com/CaoHaiNamHust/'
    article='Contact<br>Email: [email protected]'
)

# Alternative layout built with gr.Blocks, kept for reference (disabled):
# with gr.Blocks() as demo:
#     name = gr.Textbox(label='raw address', placeholder="150 kim hoa ha noi")
#     output1 = gr.Textbox(label='standard address')
#     output2 = gr.Textbox(label='top 5 addresses')
#     greet_btn = gr.Button("standardize")
#     greet_btn.click(fn=standardize, inputs=name, outputs=[output1, output2])

# Start the web server as soon as this module is executed.
demo.launch()
data/address_matrix_all_1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:977558e6e8a75b5513f41f1e63f3bf6f3749ce9a74b5bee5ee160334c1185a68
3
+ size 215544807
data/standard_address_all_1.json ADDED
The diff for this file is too large to render. See raw diff
 
norm_typing.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ source: https://github.com/langmaninternet/VietnameseTextNormalizer
4
+ """
5
+
6
+
7
+ import regex as re
8
+
9
# Accented Vietnamese letters (lower case, upper case, then the bare
# circumflex/breve/horn letters) ...
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ"
# ... and, position by position, the unaccented ASCII letter for each of them.
# Spelled as run lengths so the alignment with uniChars is easy to audit:
# 17 a-forms, 11 e-forms, đ, 5 i-forms, 17 o-forms, 11 u-forms, 5 y-forms.
# The previous literal had only 3 "I" and 19 "O" in the upper-case half, which
# mapped "Ĩ" and "Ị" to "O".
unsignChars = (
    "a" * 17 + "e" * 11 + "d" + "i" * 5 + "o" * 17 + "u" * 11 + "y" * 5
    + "A" * 17 + "E" * 11 + "D" + "I" * 5 + "O" * 17 + "U" * 11 + "Y" * 5
    + "AADOOU"
)
11
+
12
+
13
+ def loaddicchar():
14
+ dic = {}
15
+ char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
16
+ '|')
17
+ charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
18
+ '|')
19
+ for i in range(len(char1252)):
20
+ dic[char1252[i]] = charutf8[i]
21
+ return dic
22
+
23
+
24
+ dicchar = loaddicchar()
25
+
26
+
27
def convert_unicode(txt):
    """Replace every ``dicchar`` key found in *txt* with its mapped value.

    :param txt: input text.
    :return: text with each matched sequence substituted via ``dicchar``.
    """
    def _mapped(match):
        # Look the matched sequence up in the module-level table.
        return dicchar[match.group()]

    pattern = r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'
    return re.sub(pattern, _mapped, txt)
31
+
32
+
33
+ """
34
+ Start section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey
35
+ Ví dụ: thủy = thuyr, tượng = tuwowngj
36
+ """
37
# Vowel table: one row per base vowel. Columns 0-5 are the vowel with each
# tone (column 0 = no tone mark); the last column is the telex spelling of the
# base vowel.
bang_nguyen_am = [
    ['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
    ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
    ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
    ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
    ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
    ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
    ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
    ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
    ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
    ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
    ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
    ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y'],
]
# Telex key for each tone column of bang_nguyen_am (index 0 = no tone).
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

# Reverse lookup: vowel character -> (row, tone column) in bang_nguyen_am.
# The trailing telex column of every row is deliberately left out.
nguyen_am_to_ids = {
    vowel: (row, column)
    for row, variants in enumerate(bang_nguyen_am)
    for column, vowel in enumerate(variants[:-1])
}
56
+
57
+
58
def vn_word_to_telex_type(word):
    """Spell one accented word the way it is typed with telex keys.

    Every vowel is replaced by its telex base spelling; the telex key of the
    last tone mark seen in the word is appended once at the end.

    :param word: a single word.
    :return: the telex spelling of *word*.
    """
    tone_column = 0
    pieces = []
    for letter in word:
        position = nguyen_am_to_ids.get(letter)
        if position is None:
            # Not a vowel from the table: copy through unchanged.
            pieces.append(letter)
            continue
        row, column = position
        if column != 0:
            tone_column = column
        pieces.append(bang_nguyen_am[row][-1])
    pieces.append(bang_ky_tu_dau[tone_column])
    return ''.join(pieces)
71
+
72
+
73
def vn_sentence_to_telex_type(sentence):
    """
    Convert an accented Vietnamese sentence to its telex typing form.

    The sentence is split on whitespace, each word is converted with
    ``vn_word_to_telex_type`` and the words are re-joined with single spaces.
    :param sentence: input sentence.
    :return: the telex form of the sentence.
    """
    return ' '.join(vn_word_to_telex_type(token) for token in sentence.split())
83
+
84
+
85
+ """
86
+ End section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey
87
+ """
88
+
89
+ """
90
+ Start section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý
91
+ Xem tại đây: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF
92
+ """
93
+
94
+
95
def norm_vietnamese_word_accent(word):
    """Move the tone mark of one word to its old-style position.

    The word is returned unchanged when ``is_valid_vietnam_word`` rejects it.
    Otherwise every vowel is stripped of its tone (the last tone seen is
    remembered) and the tone is re-applied to a single vowel chosen by the
    rules below.

    :param word: a single word (no surrounding punctuation).
    :return: the word with its tone mark repositioned.
    """
    if not is_valid_vietnam_word(word):
        return word

    chars = list(word)
    dau_cau = 0  # tone column (0 = no tone) of the last toned vowel seen
    nguyen_am_index = []  # positions of the vowels that may carry the tone
    qu_or_gi = False  # True once a 'u' after 'q' or an 'i' after 'g' is seen
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1:
            continue
        elif x == 9:  # check qu
            if index != 0 and chars[index - 1] == 'q':
                chars[index] = 'u'
                qu_or_gi = True
        elif x == 5:  # check gi
            if index != 0 and chars[index - 1] == 'g':
                chars[index] = 'i'
                qu_or_gi = True
        if y != 0:
            # Remember the tone and replace the vowel by its toneless form.
            dau_cau = y
            chars[index] = bang_nguyen_am[x][0]
        # With a qu/gi onset, the vowel at position 1 is not a tone candidate.
        if not qu_or_gi or index != 1:
            nguyen_am_index.append(index)
    if len(nguyen_am_index) < 2:
        if qu_or_gi:
            if len(chars) == 2:
                # Two-letter word: the tone goes on the second letter.
                x, y = nguyen_am_to_ids.get(chars[1])
                chars[1] = bang_nguyen_am[x][dau_cau]
            else:
                # Tone goes on the third letter if it is a vowel, otherwise
                # back on the 'i'/'u' at position 1.
                x, y = nguyen_am_to_ids.get(chars[2], (-1, -1))
                if x != -1:
                    chars[2] = bang_nguyen_am[x][dau_cau]
                else:
                    chars[1] = bang_nguyen_am[5][dau_cau] if chars[1] == 'i' else bang_nguyen_am[9][dau_cau]
            return ''.join(chars)
        # Fewer than two candidate vowels and no qu/gi: nothing to move.
        return word

    # The first ê or ơ among the candidates always takes the tone.
    for index in nguyen_am_index:
        x, y = nguyen_am_to_ids[chars[index]]
        if x == 4 or x == 8:  # ê, ơ
            chars[index] = bang_nguyen_am[x][dau_cau]
            # for index2 in nguyen_am_index:
            #     if index2 != index:
            #         x, y = nguyen_am_to_ids[chars[index]]
            #         chars[index2] = bang_nguyen_am[x][0]
            return ''.join(chars)

    if len(nguyen_am_index) == 2:
        if nguyen_am_index[-1] == len(chars) - 1:
            # Two vowels ending the word: tone on the first vowel.
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            chars[nguyen_am_index[0]] = bang_nguyen_am[x][dau_cau]
            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            # chars[nguyen_am_index[1]] = bang_nguyen_am[x][0]
        else:
            # Two vowels followed by more letters: tone on the second vowel.
            # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
            # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
            x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
            chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
    else:
        # Three or more vowels: tone on the second vowel.
        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[0]]]
        # chars[nguyen_am_index[0]] = bang_nguyen_am[x][0]
        x, y = nguyen_am_to_ids[chars[nguyen_am_index[1]]]
        chars[nguyen_am_index[1]] = bang_nguyen_am[x][dau_cau]
        # x, y = nguyen_am_to_ids[chars[nguyen_am_index[2]]]
        # chars[nguyen_am_index[2]] = bang_nguyen_am[x][0]
    return ''.join(chars)
163
+
164
+
165
def is_valid_vietnam_word(word):
    """Tell whether all vowels of *word* form one contiguous run.

    :param word: a single word.
    :return: ``False`` as soon as a vowel is found that does not directly
        follow the previous vowel; ``True`` otherwise (including words with
        no vowel at all).
    """
    previous_vowel = None
    for position, letter in enumerate(word):
        if letter not in nguyen_am_to_ids:
            continue
        if previous_vowel is not None and position - previous_vowel != 1:
            return False
        previous_vowel = position
    return True
178
+
179
+
180
def norm_vietnamese_sentence_accent(sentence):
    """
    Normalize a Vietnamese sentence to the old-style tone-mark placement.

    The sentence is lower-cased and split on whitespace. Each word of the
    form <punctuation><letters, possibly with inner dots><punctuation> has its
    letter core passed through ``norm_vietnamese_word_accent``; any other
    word is kept as is. Words are re-joined with single spaces.
    :param sentence: input sentence.
    :return: the normalized, lower-cased sentence.
    """
    sentence = sentence.lower()
    words = sentence.split()
    for index, word in enumerate(words):
        # `re` is the third-party `regex` module here, which supports \p{..}.
        # The core class is [\p{L}.]; the earlier pattern spelled it
        # [p{L}.], i.e. the literal characters p, {, L, } and '.'.
        # Capturing groups are read directly instead of round-tripping through
        # a '?'-delimited string, which broke on words containing '?'.
        match = re.fullmatch(r'(\p{P}*)([\p{L}.]*\p{L}+)(\p{P}*)', word)
        if match:
            leading, core, trailing = match.groups()
            words[index] = leading + norm_vietnamese_word_accent(core) + trailing
    return ' '.join(words)
196
+
197
+
198
+ """
199
+ End section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý
200
+ Xem tại đây: https://vi.wikipedia.org/wiki/Quy_tắc_đặt_dấu_thanh_trong_chữ_quốc_ngữ
201
+ """
202
if __name__ == '__main__':
    # Quick manual check. The sentence-level normalizer in this module is
    # named norm_vietnamese_sentence_accent; the old name
    # chuan_hoa_dau_cau_tieng_viet is not defined here and raised NameError.
    print(norm_vietnamese_sentence_accent('anh Hoà, đang làm.. gì'))
parameters.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # transformer model
2
# Model name passed to SentenceTransformer(...) by siameser.Siameser.
embedding_model = 'CaoHaiNam/vietnamese-address-embedding'
# Directory name for a locally saved copy of the model; only referenced from
# commented-out code in siameser.py.
local_embedding_model = 'embedding-model'


# JSON file with the standard addresses, loaded when stadard_scope == 'all'.
NORM_ADDS_FILE_ALL_1 = 'data/standard_address_all_1.json'
# torch file holding the 'accent_matrix' and 'noaccent_matrix' embeddings.
STD_EMBEDDING_FILE_ALL_1 = 'data/address_matrix_all_1.pt'
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ numpy
2
+ sentence-transformers
3
+ transformers
4
+ tqdm
5
+ torch
6
+ flake8
7
+ gradio
siameser.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unicodedata
2
+ import torch
3
+ import utils
4
+ import parameters
5
+ import json
6
+ from sentence_transformers import SentenceTransformer
7
+ import os
8
+ import torch.nn.functional as F
9
+
10
+ # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
11
+
12
+ device = torch.device('cpu')
13
+
14
class Siameser:
    """Match raw addresses against a table of standard addresses.

    A raw address is embedded with a sentence-embedding model and compared by
    cosine similarity with pre-computed embeddings of every standard address.
    Two embedding matrices are kept: one for accented text and one for text
    without accents.
    """

    def __init__(self, model_name=None, stadard_scope=None):
        """Load the embedding model, the standard addresses and their matrices.

        :param model_name: accepted for compatibility; currently not used.
        :param stadard_scope: ``'all'`` selects the ``*_ALL_1`` files from
            ``parameters``; any other value selects the ``*_HN_HCM`` files.
        """
        print("Load sentence embedding model (If this is the first time you run this repo, It could be take time to download sentence embedding model)")
        # Best-match scores below this value are reported as "not an address".
        self.threshold = 0.61
        self.embedding_model = SentenceTransformer(parameters.embedding_model).to(device)

        if stadard_scope == 'all':
            adds_file = parameters.NORM_ADDS_FILE_ALL_1
            matrix_file = parameters.STD_EMBEDDING_FILE_ALL_1
        else:
            # NOTE(review): the parameters module shipped with this file only
            # defines the *_ALL_1 constants, so this branch raises
            # AttributeError until the *_HN_HCM constants are added there.
            adds_file = parameters.NORM_ADDS_FILE_HN_HCM
            matrix_file = parameters.STD_EMBEDDING_FILE_HN_HCM

        print('Load standard address')
        with open(file=adds_file, mode='r', encoding='utf-8') as f:
            self.NORM_ADDS = json.load(fp=f)

        print('Load standard address matrix')
        # map_location keeps loading working on machines without the device
        # the tensors were saved from.
        embedding = torch.load(matrix_file, map_location=device)
        self.std_embeddings = embedding['accent_matrix'].to(device)
        self.NT_std_embeddings = embedding['noaccent_matrix'].to(device)

        self.num_std_add = self.std_embeddings.shape[0]
        print('Done')

    def _similarity_scores(self, raw_add_):
        """Return the cosine similarity of *raw_add_* to every standard address."""
        raw_add = unicodedata.normalize('NFC', raw_add_).lower()
        raw_add = utils.remove_punctuation(raw_add)
        raw_add_vector = self.embedding_model.encode(raw_add, convert_to_tensor=True).to(device)
        # expand() yields the same (num_std_add, dim) shape as repeat() did,
        # but as a view, so no per-request copy of the vector is materialized.
        raw_add_vectors = raw_add_vector.unsqueeze(0).expand(self.num_std_add, -1)
        # Input without any accent is compared with the accent-free matrix.
        if raw_add == utils.remove_accent(raw_add):
            return F.cosine_similarity(raw_add_vectors, self.NT_std_embeddings)
        return F.cosine_similarity(raw_add_vectors, self.std_embeddings)

    def standardize(self, raw_add_):
        """Return the best standard address for *raw_add_*.

        :param raw_add_: raw address string.
        :return: the dict built by ``utils.get_full_result`` for the best
            match, or a ``{'Format Error': ...}`` dict when the best score is
            below ``self.threshold``.
        """
        score = self._similarity_scores(raw_add_)
        s, top_k = score.topk(1)
        s, idx = s.tolist()[0], top_k.tolist()[0]
        if s < self.threshold:
            return {'Format Error': 'Xâu truyền vào không phải địa chỉ, mời nhập lại.'}
        std_add = self.NORM_ADDS[str(idx)]
        return utils.get_full_result(raw_add_, std_add, round(s, 4))

    def get_top_k(self, raw_add_, k):
        """Return the best match and the *k* best matches for *raw_add_*.

        :param raw_add_: raw address string.
        :param k: number of candidates to return.
        :return: ``(best, candidates)`` where ``candidates`` is a list of
            ``utils.get_full_result`` dicts ordered by decreasing score and
            ``best`` is its first element; ``({'Format Error': ...}, {})``
            when the best score is below ``self.threshold``.
        """
        score = self._similarity_scores(raw_add_)
        s, top_k = score.topk(k)
        s, top_k = s.tolist(), top_k.tolist()

        if s[0] < self.threshold:
            return {'Format Error': 'Dường như xâu truyền vào không phải địa chỉ, mời nhập lại.'}, {}

        top_std_adds = []
        for candidate_score, idx in zip(s, top_k):
            std_add = self.NORM_ADDS[str(idx)]
            top_std_adds.append(utils.get_full_result(raw_add_, std_add, round(candidate_score, 4)))

        # The unused `x1, x2 = top_std_adds[0], top_std_adds[1]` unpacking was
        # dropped: it raised IndexError whenever k == 1.
        return top_std_adds[0], top_std_adds
utils.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import numpy as np
2
+ import re
3
+ import string
4
+
5
+ # delete tone and lower
6
# Lower-case Vietnamese alphabet.
# NOTE(review): 't' is listed twice; kept as is.
anphabet = [
    'a', 'ă', 'â', 'b', 'c', 'd',
    'đ', 'e', 'ê', 'g', 'h', 'i',
    'k', 'l', 'm', 'n', 'o', 'ô',
    'ơ', 'p', 'q', 'r', 's', 't',
    't', 'u', 'ư', 'v', 'x', 'y',
]

# Groups of accented letters (comma separated) and the plain letter each
# group collapses to.
tone = {
    'á, à, ã, ạ, ả, ấ, ầ, ẫ, ậ, ẩ, ắ, ằ, ẵ, ặ, ẳ, â, ă': 'a',
    'ó, ò, õ, ọ, ỏ, ố, ồ, ỗ, ộ, ổ, ớ, ờ, ỡ, ợ, ở, ơ, ô': 'o',
    'é, è, ẽ, ẹ, ẻ, ế, ề, ễ, ệ, ể, ê': 'e',
    'í, ì, ĩ, ị, ỉ': 'i',
    'ú, ù, ũ, ụ, ủ, ứ, ừ, ự, ử, ữ, ư': 'u',
    'đ': 'd',
    'ý, ỳ, ỹ, ỵ, ỷ': 'y'
}

# Per-character lookup derived from `tone`: accented letter -> plain letter.
# The ',' and ' ' separators inside the group strings are skipped.
RT = {
    letter: plain
    for group, plain in tone.items()
    for letter in group
    if letter != ',' and letter != ' '
}
29
+
30
+
31
def remove_accent(text):
    """Replace every accented letter listed in ``RT`` by its plain letter.

    Characters that are not keys of ``RT`` are copied through unchanged.

    :param text: input text.
    :return: the text with accents removed.
    """
    return ''.join(RT.get(letter, letter) for letter in text)
37
+
38
+
39
# remove punctuation
def remove_punctuation(text):
    """Turn ASCII punctuation into spaces and collapse whitespace runs.

    :param text: input text.
    :return: the text with every punctuation character replaced by a space,
        then stripped and with consecutive whitespace reduced to one space.
    """
    punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
    # One translation pass maps each punctuation character to a space.
    to_space = str.maketrans(punctuation, ' ' * len(punctuation))
    spaced = text.translate(to_space)
    return ' '.join(spaced.split())
48
+
49
+
50
def clean_text(text):
    """Reduce *text* to single-spaced ASCII without URLs.

    Steps, in order: drop every non-ASCII character, delete ``http...``
    tokens, turn newlines and tabs into spaces, trim, and squeeze runs of
    spaces into one.

    :param text: input text.
    :return: the cleaned text.
    """
    # Non-ASCII characters (e.g. Chinese) are dropped, not transliterated.
    ascii_only = text.encode("ascii", errors="ignore").decode("ascii")
    cleaned = re.sub(r"http\S+", "", ascii_only)
    for pattern in (r"\n", r"\n\n", r"\t"):
        cleaned = re.sub(pattern, " ", cleaned)
    cleaned = cleaned.strip(" ")
    # Collapse multiple spaces into a single one.
    return re.sub(" +", " ", cleaned).strip()
63
+
64
+
65
def remove_prefix(address):
    """Delete administrative-unit words (province, district, ward, ...).

    The accented word list is used when *address* contains accents, the
    accent-free list otherwise. Matching is case-insensitive.

    :param address: address text.
    :return: the address without those words, stripped of outer whitespace.
    """
    if address != remove_accent(address):
        pattern = '(tỉnh |thành phố |huyện |thị trấn |thị xã |phường |xã |quận |đường |phố |tp )'
    else:
        pattern = '(tinh |thanh pho |huyen |thi tran |thi xa |phuong |xa |quan |duong |pho |tp )'
    return re.sub(pattern, '', address, flags=re.IGNORECASE).strip()
69
+
70
+
71
def clean_detail_address(detail_address):
    """Strip unit prefixes and one trailing punctuation mark.

    :param detail_address: the detail part of an address.
    :return: the text after ``remove_prefix``, minus a single trailing
        character if that character is in ``string.punctuation``.
    """
    detail_address = remove_prefix(detail_address)
    # Explicit emptiness check instead of a bare `except: pass` around the
    # [-1] lookup, so unrelated errors are no longer silently swallowed.
    if detail_address and detail_address[-1] in string.punctuation:
        detail_address = detail_address[:-1]
    return detail_address
79
+
80
+
81
def get_detail_address(address, std_address):
    """Extract the part of *address* that precedes the standard address.

    The first word of the first value in *std_address* is used as the split
    point (with accents removed when the lower-cased address has none).

    :param address: raw address text.
    :param std_address: dict describing the matched standard address.
    :return: the cleaned text before the split point, or ``''`` when the
        split word does not occur in the address.
    """
    lowered = address.lower()
    first_value = list(std_address.values())[0]
    split_token = first_value.split()[0]
    if lowered == remove_accent(lowered):
        split_token = remove_accent(split_token)
    leading_part = lowered.split(split_token)[0]
    if leading_part == lowered:
        # Split word not found: there is no detail part to report.
        return ''
    return clean_detail_address(leading_part)
91
+
92
+
93
def get_full_result(raw_address, std_address, score):
    """Bundle one match into the result dict returned to callers.

    :param raw_address: raw address text.
    :param std_address: dict describing the matched standard address.
    :param score: similarity score of the match.
    :return: dict with keys ``detail_address``, ``main_address`` and
        ``similarity_score`` (in that order).
    """
    return {
        'detail_address': get_detail_address(raw_address, std_address),
        'main_address': std_address,
        'similarity_score': score,
    }