Update model.py
Browse files
model.py
CHANGED
@@ -1,42 +1,464 @@
|
|
1 |
import os
|
2 |
import kenlm
|
3 |
import sentencepiece as spm
|
4 |
-
from tokenizers import normalizers
|
5 |
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
class KenlmModel:
|
8 |
def __init__(
|
9 |
self,
|
10 |
vocabulary_size: str,
|
11 |
ngram: str,
|
12 |
pruning: str,
|
|
|
13 |
normalize_nfd: bool = True,
|
14 |
normalize_numbers: bool = True,
|
15 |
-
|
|
|
16 |
):
|
17 |
self.model = kenlm.Model(os.path.join("files", f"jomleh-sp-{vocabulary_size}-o{ngram}-prune{pruning}.probing"))
|
18 |
self.tokenizer = spm.SentencePieceProcessor(os.path.join("files", f"jomleh-sp-{vocabulary_size}.model"))
|
19 |
|
20 |
norm_list = []
|
21 |
-
if
|
22 |
-
norm_list += [normalizers.Replace(
|
23 |
-
normalizers.Replace("۲", "۰"),
|
24 |
-
normalizers.Replace("۳", "۰"),
|
25 |
-
normalizers.Replace("۴", "۰"),
|
26 |
-
normalizers.Replace("۵", "۰"),
|
27 |
-
normalizers.Replace("۶", "۰"),
|
28 |
-
normalizers.Replace("۷", "۰"),
|
29 |
-
normalizers.Replace("۸", "۰"),
|
30 |
-
normalizers.Replace("۹", "۰"),
|
31 |
-
normalizers.Replace(".", "")]
|
32 |
-
if normalize_puctuation:
|
33 |
-
norm_list += [normalizers.Replace(".", ""),
|
34 |
-
normalizers.Replace("!", ""),
|
35 |
-
normalizers.Replace("؛", ""),
|
36 |
-
normalizers.Replace("،", ""),
|
37 |
-
normalizers.Replace("؟", "")]
|
38 |
if normalize_nfd:
|
39 |
norm_list += [normalizers.NFD()]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
norm_list += [normalizers.Strip()]
|
41 |
|
42 |
self.normalizer = normalizers.Sequence(norm_list)
|
@@ -47,8 +469,20 @@ class KenlmModel:
|
|
47 |
vocabulary_size: str,
|
48 |
ngram: str,
|
49 |
pruning: str,
|
|
|
|
|
|
|
|
|
|
|
50 |
):
|
51 |
-
return cls(vocabulary_size,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
def score(self, doc: str):
|
54 |
doc = self.normalizer.normalize_str(doc)
|
|
|
1 |
import os
|
2 |
import kenlm
|
3 |
import sentencepiece as spm
|
4 |
+
from tokenizers import normalizers, Regex
|
5 |
|
6 |
|
7 |
+
# Borrowed from Jomleh dataset code
|
8 |
+
# Character-normalization table: maps presentation forms, look-alike letters,
# Latin/Arabic-Indic digits and assorted punctuation/control characters onto
# the canonical Farsi character set (or onto "" / " " to drop them).
#
# Every key is exactly one Unicode code point. Code points above U+FFFF
# (the Arabic Mathematical Alphabetic Symbols, U+1EE00-U+1EEBB) MUST be spelled
# with the 8-digit ``\UXXXXXXXX`` escape: ``"\u1EE00"`` is parsed by Python as
# U+1EE0 followed by the literal character "0", which silently produces a
# two-character key that never matches the intended symbol.
#
# Insertion order is preserved on purpose; consumers that turn each entry into
# a sequential replacement step rely on it.
char_map = {
    # Arabic Letter Hamza (U+0621): no variant forms are mapped.

    # Arabic Letter Alef with Hamza Above
    "\uFE83": "\u0623",
    "\uFE84": "\u0623",

    # Arabic Letter Yeh with Hamza Above
    "\uFE89": "\u0626",
    "\uFE8A": "\u0626",
    "\uFE8B": "\u0626",
    "\uFE8C": "\u0626",

    # Arabic Letter Waw with Hamza Above
    "\uFE85": "\u0624",
    "\uFE86": "\u0624",
    "\u0676": "\u0624",

    # Arabic Letter Alef with Madda Above
    "\uFE81": "\u0622",  # Alef with Madda Above, isolated form
    "\uFE82": "\u0622",  # Alef with Madda Above, final form

    # Alef
    "\uFB50": "\u0627",  # Arabic letter Alef wasla
    "\uFE87": "\u0627",
    "\u0675": "\u0627",
    "\u0625": "\u0627",
    "\uFE8D": "\u0627",
    "\uFE8E": "\u0627",
    "\U0001EE00": "\u0627",
    "\U0001EE80": "\u0627",

    # Beh
    "\uFE8F": "\u0628",
    "\uFE90": "\u0628",
    "\uFE91": "\u0628",
    "\uFE92": "\u0628",
    "\U0001EE01": "\u0628",
    "\U0001EE21": "\u0628",
    "\U0001EE61": "\u0628",
    "\U0001EE81": "\u0628",
    "\U0001EEA1": "\u0628",

    # Pe
    "\uFB56": "\u067E",
    "\uFB57": "\u067E",
    "\uFB58": "\u067E",
    "\uFB59": "\u067E",

    # Teh
    "\uFE95": "\u062A",
    "\uFE96": "\u062A",
    "\uFE97": "\u062A",
    "\uFE98": "\u062A",
    "\U0001EE15": "\u062A",
    "\U0001EE35": "\u062A",
    "\U0001EE75": "\u062A",
    "\U0001EE95": "\u062A",
    "\U0001EEB5": "\u062A",

    # Theh
    "\uFE99": "\u062B",
    "\uFE9A": "\u062B",
    "\uFE9B": "\u062B",
    "\uFE9C": "\u062B",
    "\U0001EE16": "\u062B",
    "\U0001EE36": "\u062B",
    "\U0001EE76": "\u062B",
    "\U0001EE96": "\u062B",
    "\U0001EEB6": "\u062B",

    # Jim
    "\uFE9D": "\u062C",
    "\uFE9E": "\u062C",
    "\uFE9F": "\u062C",
    "\uFEA0": "\u062C",
    "\U0001EE02": "\u062C",
    "\U0001EE22": "\u062C",
    "\U0001EE42": "\u062C",
    "\U0001EE62": "\u062C",
    "\U0001EE82": "\u062C",
    "\U0001EEA2": "\u062C",

    # Cheh
    "\uFB7A": "\u0686",
    "\uFB7B": "\u0686",
    "\uFB7C": "\u0686",
    "\uFB7D": "\u0686",

    # Hah
    "\uFEA1": "\u062D",
    "\uFEA2": "\u062D",
    "\uFEA3": "\u062D",
    "\uFEA4": "\u062D",
    "\U0001EE07": "\u062D",
    "\U0001EE27": "\u062D",
    "\U0001EE47": "\u062D",
    "\U0001EE67": "\u062D",
    "\U0001EE87": "\u062D",
    "\U0001EEA7": "\u062D",

    # Khah
    "\uFEA5": "\u062E",
    "\uFEA6": "\u062E",
    "\uFEA7": "\u062E",
    "\uFEA8": "\u062E",
    "\U0001EE17": "\u062E",
    "\U0001EE37": "\u062E",
    "\U0001EE57": "\u062E",
    "\U0001EE77": "\u062E",
    "\U0001EE97": "\u062E",
    "\U0001EEB7": "\u062E",

    # Dal
    "\uFEA9": "\u062F",
    "\uFEAA": "\u062F",
    "\U0001EE03": "\u062F",
    "\U0001EE83": "\u062F",
    "\U0001EEA3": "\u062F",

    # Zal
    "\uFEAB": "\u0630",
    "\uFEAC": "\u0630",
    "\U0001EE18": "\u0630",
    "\U0001EE98": "\u0630",
    "\U0001EEB8": "\u0630",

    # Reh
    "\uFEAE": "\u0631",  # Arabic letter Reh final form
    "\uFEAD": "\u0631",  # Arabic letter Reh isolated form
    "\u0692": "\u0631",
    "\U0001EE13": "\u0631",
    "\U0001EE93": "\u0631",
    "\U0001EEB3": "\u0631",

    # Ze
    "\uFEAF": "\u0632",
    "\uFEB0": "\u0632",
    "\U0001EE06": "\u0632",
    "\U0001EE86": "\u0632",
    "\U0001EEA6": "\u0632",

    # Jhe
    "\uFB8A": "\u0698",
    "\uFB8B": "\u0698",

    # Seen
    "\uFEB1": "\u0633",
    "\uFEB2": "\u0633",
    "\uFEB3": "\u0633",
    "\uFEB4": "\u0633",
    "\U0001EE0E": "\u0633",
    "\U0001EE2E": "\u0633",
    "\U0001EE4E": "\u0633",
    "\U0001EE6E": "\u0633",
    "\U0001EE8E": "\u0633",
    "\U0001EEAE": "\u0633",

    # Sheen
    "\uFEB5": "\u0634",
    "\uFEB6": "\u0634",
    "\uFEB7": "\u0634",
    "\uFEB8": "\u0634",
    "\U0001EE14": "\u0634",
    "\U0001EE34": "\u0634",
    "\U0001EE54": "\u0634",
    "\U0001EE74": "\u0634",
    "\U0001EE94": "\u0634",
    "\U0001EEB4": "\u0634",

    # Sad
    "\uFEB9": "\u0635",
    "\uFEBA": "\u0635",
    "\uFEBB": "\u0635",
    "\uFEBC": "\u0635",
    "\U0001EE11": "\u0635",
    "\U0001EE31": "\u0635",
    "\U0001EE51": "\u0635",
    "\U0001EE71": "\u0635",
    "\U0001EE91": "\u0635",
    "\U0001EEB1": "\u0635",

    # Zad
    "\uFEBD": "\u0636",
    "\uFEBE": "\u0636",
    "\uFEBF": "\u0636",
    "\uFEC0": "\u0636",
    "\U0001EE19": "\u0636",
    "\U0001EE39": "\u0636",
    "\U0001EE59": "\u0636",
    "\U0001EE79": "\u0636",
    "\U0001EE99": "\u0636",
    "\U0001EEB9": "\u0636",

    # Ta
    "\uFEC1": "\u0637",
    "\uFEC2": "\u0637",
    "\uFEC3": "\u0637",
    "\uFEC4": "\u0637",
    "\U0001EE08": "\u0637",
    "\U0001EE68": "\u0637",
    "\U0001EE88": "\u0637",
    "\U0001EEA8": "\u0637",

    # Za
    "\uFEC5": "\u0638",
    "\uFEC6": "\u0638",
    "\uFEC7": "\u0638",
    "\uFEC8": "\u0638",
    "\U0001EE1A": "\u0638",
    "\U0001EE7A": "\u0638",
    "\U0001EE9A": "\u0638",
    "\U0001EEBA": "\u0638",

    # Ain
    "\uFEC9": "\u0639",
    "\uFECA": "\u0639",
    "\uFECB": "\u0639",
    "\uFECC": "\u0639",
    "\U0001EE0F": "\u0639",
    "\U0001EE2F": "\u0639",
    "\U0001EE4F": "\u0639",
    "\U0001EE6F": "\u0639",
    "\U0001EE8F": "\u0639",
    "\U0001EEAF": "\u0639",

    # Ghain
    "\uFECD": "\u063A",
    "\uFECE": "\u063A",
    "\uFECF": "\u063A",
    "\uFED0": "\u063A",
    "\U0001EE1B": "\u063A",
    "\U0001EE3B": "\u063A",
    "\U0001EE5B": "\u063A",
    "\U0001EE7B": "\u063A",
    "\U0001EE9B": "\u063A",
    "\U0001EEBB": "\u063A",

    # Fa
    "\uFED1": "\u0641",
    "\uFED2": "\u0641",
    "\uFED3": "\u0641",
    "\uFED4": "\u0641",
    "\U0001EE10": "\u0641",
    "\U0001EE30": "\u0641",
    "\U0001EE70": "\u0641",
    "\U0001EE90": "\u0641",
    "\U0001EEB0": "\u0641",

    # Qaf
    "\uFED5": "\u0642",
    "\uFED6": "\u0642",
    "\uFED7": "\u0642",
    "\uFED8": "\u0642",
    "\U0001EE12": "\u0642",
    "\U0001EE32": "\u0642",
    "\U0001EE52": "\u0642",
    "\U0001EE72": "\u0642",
    "\U0001EE92": "\u0642",
    "\U0001EEB2": "\u0642",

    # Kaf
    "\uFB8E": "\u06A9",  # Arabic letter Keheh isolated form
    "\uFB8F": "\u06A9",  # Arabic letter Keheh final form
    "\uFB90": "\u06A9",  # Arabic letter Keheh initial form
    "\uFB91": "\u06A9",  # Arabic letter Keheh medial form
    # NOTE(review): U+FCC8 is the Kaf-with-Meem initial-form ligature, so this
    # mapping keeps the Kaf and drops the Meem -- confirm that is intended.
    "\uFCC8": "\u06A9",
    "\u0643": "\u06A9",
    "\uFED9": "\u06A9",
    "\uFEDA": "\u06A9",  # Arabic Letter Kaf Final Form
    "\uFEDB": "\u06A9",
    "\uFEDC": "\u06A9",
    "\U0001EE0A": "\u06A9",
    "\U0001EE2A": "\u06A9",
    "\U0001EE6A": "\u06A9",

    # Gaf
    "\uFB92": "\u06AF",  # Arabic letter Gaf isolated form
    "\uFB93": "\u06AF",  # Arabic letter Gaf final form
    "\uFB94": "\u06AF",  # Arabic letter Gaf initial form
    "\uFB95": "\u06AF",  # Arabic letter Gaf medial form

    # Lam
    "\uFCC9": "\u0644",  # Arabic Ligature Lam with Jeem Initial Form
    "\uFEDD": "\u0644",  # Arabic Letter Lam Isolated Form
    "\uFEDE": "\u0644",  # Arabic Letter Lam Final Form
    "\uFEDF": "\u0644",  # Arabic Letter Lam Initial Form
    "\uFEE0": "\u0644",  # Arabic Letter Lam Medial Form
    "\U0001EE0B": "\u0644",  # Arabic Mathematical Lam
    "\U0001EE2B": "\u0644",  # Arabic Mathematical Initial Lam
    "\U0001EE4B": "\u0644",  # Arabic Mathematical Tailed Lam
    "\U0001EE8B": "\u0644",  # Arabic Mathematical Looped Lam
    "\U0001EEAB": "\u0644",  # Arabic Mathematical Double-Struck Lam

    # Mim
    "\uFEE1": "\u0645",  # Arabic Letter Meem Isolated Form
    "\uFEE2": "\u0645",  # Arabic Letter Meem Final Form
    "\uFEE3": "\u0645",  # Arabic Letter Meem Initial Form
    "\uFEE4": "\u0645",  # Arabic Letter Meem Medial Form
    "\U0001EE0C": "\u0645",  # Arabic Mathematical Meem
    "\U0001EE2C": "\u0645",  # Arabic Mathematical Initial Meem
    "\U0001EE6C": "\u0645",  # Arabic Mathematical Stretched Meem
    "\U0001EE8C": "\u0645",  # Arabic Mathematical Looped Meem
    "\U0001EEAC": "\u0645",  # Arabic Mathematical Double-Struck Meem

    # Nun
    "\uFEE5": "\u0646",  # Arabic Letter Noon Isolated Form
    "\uFEE6": "\u0646",  # Arabic Letter Noon Final Form
    "\uFEE7": "\u0646",  # Arabic Letter Noon Initial Form
    "\uFEE8": "\u0646",  # Arabic Letter Noon Medial Form
    "\U0001EE0D": "\u0646",  # Arabic Mathematical Noon
    "\U0001EE2D": "\u0646",  # Arabic Mathematical Initial Noon
    "\U0001EE4D": "\u0646",  # Arabic Mathematical Tailed Noon
    "\U0001EE6D": "\u0646",  # Arabic Mathematical Stretched Noon
    "\U0001EE8D": "\u0646",  # Arabic Mathematical Looped Noon
    "\U0001EEAD": "\u0646",  # Arabic Mathematical Double-Struck Noon

    # Vav
    "\u0677": "\u0648",  # Arabic Letter U with Hamza Above
    "\uFEED": "\u0648",  # Arabic Letter Waw Isolated Form
    "\uFEEE": "\u0648",  # Arabic Letter Waw Final Form
    "\u06C6": "\u0648",  # Arabic Letter Oe
    "\u06C7": "\u0648",  # Arabic Letter U

    # He
    "\u06C0": "\u0647",  # Arabic letter Heh with yeh above
    "\u0629": "\u0647",  # Arabic Letter Teh Marbuta
    "\u06BE": "\u0647",  # Arabic Letter Heh Doachashmee
    "\uFE93": "\u0647",  # Arabic Letter Teh Marbuta Isolated Form
    "\u06D5": "\u0647",  # Arabic Letter Ae
    "\uFEE9": "\u0647",  # Arabic Letter Heh Isolated Form
    "\uFEEA": "\u0647",  # Arabic Letter Heh Final Form
    "\uFEEB": "\u0647",  # Arabic Letter Heh Initial Form
    "\uFEEC": "\u0647",  # Arabic Letter Heh Medial Form
    "\U0001EE24": "\u0647",  # Arabic Mathematical Initial Heh
    "\U0001EE64": "\u0647",  # Arabic Mathematical Stretched Heh
    "\U0001EE84": "\u0647",  # Arabic Mathematical Looped Heh

    # Yeh
    "\u06D0": "\u06CC",  # Arabic Letter E
    "\uFEEF": "\u06CC",  # Arabic Letter Alef Maksura Isolated Form
    "\uFEF3": "\u06CC",  # Arabic Letter Yeh Initial Form
    "\uFEF4": "\u06CC",  # Arabic Letter Yeh Medial Form
    "\u064A": "\u06CC",  # Arabic Letter Yeh
    "\uFEF1": "\u06CC",  # Arabic Letter Yeh Isolated Form
    "\u06CE": "\u06CC",  # Arabic Letter Yeh with Small V
    "\uFBFD": "\u06CC",  # Arabic Letter Farsi Yeh Final Form
    "\uFBFC": "\u06CC",  # Arabic Letter Farsi Yeh Isolated Form
    "\uFBFE": "\u06CC",  # Arabic Letter Farsi Yeh Initial Form
    "\uFBFF": "\u06CC",  # Arabic Letter Farsi Yeh Medial Form
    "\uFEF0": "\u06CC",  # Arabic Letter Alef Maksura Final Form
    "\uFEF2": "\u06CC",  # Arabic Letter Yeh Final Form
    "\u063D": "\u06CC",
    "\u063E": "\u06CC",
    "\u063F": "\u06CC",
    "\u06D2": "\u06CC",  # Arabic Letter Yeh Barree

    # Diacritics (harakat, shadda, hamza above) are dropped.
    "\u064E": "",
    "\u064B": "",
    "\u064F": "",
    "\u064C": "",
    "\u0650": "",
    "\u064D": "",
    "\u0652": "",
    "\u0651": "",
    "\u0654": "",

    # Latin digits -> Farsi digits
    "0": "۰",
    "1": "۱",
    "2": "۲",
    "3": "۳",
    "4": "۴",
    "5": "۵",
    "6": "۶",
    "7": "۷",
    "8": "۸",
    "9": "۹",
    # Arabic-Indic digits -> Farsi digits
    "٠": "۰",
    "١": "۱",
    "٢": "۲",
    "٣": "۳",
    "٤": "۴",
    "٥": "۵",
    "٦": "۶",
    "٧": "۷",
    "٨": "۸",
    "٩": "۹",

    # Punctuation and symbols
    "٬": "،",
    ",": "،",
    ";": "؛",
    "?": "؟",
    "\\": " ",
    "…": " غیره ",
    "%": " درصد ",
    "\u200e": " ",  # LEFT-TO-RIGHT MARK
    "\u200f": " ",  # RIGHT-TO-LEFT MARK
    "\u202a": " ",  # LEFT-TO-RIGHT EMBEDDING
    "\u202b": " ",  # RIGHT-TO-LEFT EMBEDDING
    "\u2066": " ",  # LEFT-TO-RIGHT ISOLATE
    "\u2067": " ",  # RIGHT-TO-LEFT ISOLATE
    "\u2069": " ",  # POP DIRECTIONAL ISOLATE
    "\ufdef": " ",  # Non-standard
    "\u00B7": ".",  # MIDDLE DOT
    "\u2022": " ",  # BULLET POINT

    "'": " ",
    "“": " ",
    "”": " ",
    "\u00ad": " ",  # SOFT HYPHEN
    "\u005f": " ",  # LOW LINE (underscore)
    "\u002b": " ",  # PLUS SIGN
    "\u200b": " ",  # ZERO WIDTH SPACE
    "\u00a9": " ",  # COPYRIGHT SIGN

    "\u2014": " ",  # Em Dash
    "\u2019": " ",  # Right Single Quotation Mark
    "\uFE0F": "",  # Variation Selector-16 (VS16)
    "\u007C": " ",  # Vertical Line
}
|
430 |
+
|
431 |
class KenlmModel:
|
432 |
def __init__(
    self,
    vocabulary_size: str,
    ngram: str,
    pruning: str,
    map_to_farsi_alphabet: bool = True,
    normalize_nfd: bool = True,
    normalize_numbers: bool = True,
    remove_puctuation: bool = True,
    remove_non_farsi: bool = True,
):
    """Load the KenLM model and tokenizer and assemble the text normalizer.

    The three positional arguments select which files under ``files/`` are
    loaded; they are interpolated verbatim into the file names.

    Args:
        vocabulary_size: Vocabulary-size tag used in both file names.
        ngram: N-gram order tag used in the KenLM file name.
        pruning: Pruning tag used in the KenLM file name.
        map_to_farsi_alphabet: Add one literal replacement step per
            ``char_map`` entry, in the dict's iteration order.
        normalize_nfd: Add a Unicode NFD normalization step.
        normalize_numbers: Replace every Farsi digit one to nine with the
            Farsi zero.
        remove_puctuation: Delete the listed sentence punctuation marks.
        remove_non_farsi: Delete every character outside the allow-list
            below (Farsi letters, digits and punctuation, whitespace,
            the zero-width non-joiner, full stop and exclamation mark).

    The enabled steps always run in the order listed above and are
    followed by a final whitespace strip.
    """
    lm_path = os.path.join(
        "files", f"jomleh-sp-{vocabulary_size}-o{ngram}-prune{pruning}.probing"
    )
    self.model = kenlm.Model(lm_path)

    sp_path = os.path.join("files", f"jomleh-sp-{vocabulary_size}.model")
    self.tokenizer = spm.SentencePieceProcessor(sp_path)

    steps = []

    if map_to_farsi_alphabet:
        # String patterns are matched literally, one step per entry.
        for source_char, target in char_map.items():
            steps.append(normalizers.Replace(source_char, target))

    if normalize_nfd:
        steps.append(normalizers.NFD())

    if normalize_numbers:
        steps.append(normalizers.Replace(Regex("[۱۲۳۴۵۶۷۸۹]"), "۰"))

    if remove_puctuation:
        steps.append(normalizers.Replace(Regex("[.!؛،؟]"), ""))

    if remove_non_farsi:
        # Negated character class: anything NOT listed here is removed.
        allowed_chars = ("[^\u060c\u061b\u061f\u0622\u0623\u0624\u0626\u0627"
                         "\u0628\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631"
                         "\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a"
                         "\u0641\u0642\u0644\u0645\u0646\u0647\u0648\u067e\u0686"
                         "\u0698\u06a9\u06af\u06cc\u06f0\u06f1\u06f2\u06f3\u06f4"
                         "\u06f5\u06f6\u06f7\u06f8\u06f9\\s\u200c\\.\\!]")
        steps.append(normalizers.Replace(Regex(allowed_chars), ""))

    steps.append(normalizers.Strip())

    self.normalizer = normalizers.Sequence(steps)
|
|
|
469 |
vocabulary_size: str,
|
470 |
ngram: str,
|
471 |
pruning: str,
|
472 |
+
map_to_farsi_alphabet: bool = True,
|
473 |
+
normalize_nfd: bool = True,
|
474 |
+
normalize_numbers: bool = True,
|
475 |
+
remove_puctuation: bool = True,
|
476 |
+
remove_non_farsi: bool = True,
|
477 |
):
|
478 |
+
return cls(vocabulary_size,
|
479 |
+
ngram,
|
480 |
+
pruning,
|
481 |
+
map_to_farsi_alphabet,
|
482 |
+
normalize_nfd,
|
483 |
+
normalize_numbers,
|
484 |
+
remove_puctuation,
|
485 |
+
remove_non_farsi)
|
486 |
|
487 |
def score(self, doc: str):
|
488 |
doc = self.normalizer.normalize_str(doc)
|