huuminh365 committed on
Commit
28ea347
1 Parent(s): dc20062

first upload

Files changed (7)
  1. .gitignore +1 -0
  2. bpe.codes +0 -0
  3. const.py +1455 -0
  4. latex2operatortree.py +20 -0
  5. test.py +15 -0
  6. tokenizer.py +349 -0
  7. vocab.txt +0 -0
.gitignore ADDED
@@ -0,0 +1 @@
1
+ env/
bpe.codes ADDED
The diff for this file is too large to render. See raw diff
 
const.py ADDED
@@ -0,0 +1,1455 @@
1
+ LATEX_VOC = ['\\overrightarrow{\\rm AB}',
2
+ '\\overleftrightarrow',
3
+ '\\Longleftrightarrow',
4
+ '\\longleftrightarrow',
5
+ '\\supsetneqq\\gtrsim',
6
+ '\\scriptscriptstyle',
7
+ '\\rightleftharpoons',
8
+ '\\circlearrowright',
9
+ '\\overline{\\rm AB}',
10
+ '\\rightrightarrows',
11
+ '\\leftrightarrows',
12
+ '\\rightleftarrows',
13
+ '\\curvearrowright',
14
+ '\\circlearrowleft',
15
+ '\\bigtriangledown',
16
+ '\\leftleftarrows',
17
+ '\\longrightarrow',
18
+ '\\Leftrightarrow',
19
+ '\\Longrightarrow',
20
+ '\\curvearrowleft',
21
+ '\\hookrightarrow',
22
+ '\\rightharpoonup',
23
+ '\\leftrightarrow',
24
+ '\\overrightarrow',
25
+ '\\longleftarrow',
26
+ '\\bigtriangleup',
27
+ '\\overleftarrow',
28
+ '\\triangleright',
29
+ '\\operatorname*',
30
+ '\\hookleftarrow',
31
+ '\\varsubsetneqq',
32
+ '\\Longleftarrow',
33
+ '\\arraystretch',
34
+ '\\right\\rfloor',
35
+ '\\right\\rbrace',
36
+ '\\footnotesize',
37
+ '\\begin{array}',
38
+ '\\right\\rangle',
39
+ '\\displaystyle',
40
+ '\\operatorname',
41
+ '\\renewcommand',
42
+ '\\right\\rbrack',
43
+ '\\triangleleft',
44
+ '\\arraycolsep',
45
+ '\\updownarrow',
46
+ '\\diamondsuit',
47
+ '\\left\\lbrace',
48
+ '\\nolinebreak',
49
+ '\\scriptstyle',
50
+ '\\left\\lfloor',
51
+ '\\left\\langle',
52
+ '\\left\\lbrack',
53
+ '\\multicolumn',
54
+ '\\mathversion',
55
+ '\\not\\supset',
56
+ '\\complement',
57
+ '\\unitlength',
58
+ '\\thicklines',
59
+ '\\rightarrow',
60
+ '\\varnothing',
61
+ '\\right\\vert',
62
+ '\\sqsupseteq',
63
+ '\\supsetneqq',
64
+ '\\not\\subset',
65
+ '\\scriptsize',
66
+ '\\Rightarrow',
67
+ '\\boldsymbol',
68
+ '\\ensuremath',
69
+ '\\curlywedge',
70
+ '\\textnormal',
71
+ '\\normalsize',
72
+ '\\end{array}',
73
+ '\\sqsubseteq',
74
+ '\\varepsilon',
75
+ '\\setcounter',
76
+ '\\unboldmath',
77
+ '\\underbrace',
78
+ '\\circledast',
79
+ '\\longmapsto',
80
+ '\\nsubseteq',
81
+ '\\backslash',
82
+ '\\thinspace',
83
+ '\\smallskip',
84
+ '\\leftarrow',
85
+ '\\llbracket',
86
+ '\\Leftarrow',
87
+ '\\setlength',
88
+ '\\triangleq',
89
+ '\\arrowvert',
90
+ '\\subsetneq',
91
+ '\\supseteqq',
92
+ '\\mathbb{H}',
93
+ '\\varlimsup',
94
+ '\\left\\vert',
95
+ '\\downarrow',
96
+ '\\mathbb{S}',
97
+ '\\nparallel',
98
+ '\\widetilde',
99
+ '\\Downarrow',
100
+ '\\nsupseteq',
101
+ '\\overbrace',
102
+ '\\textstyle',
103
+ '\\varliminf',
104
+ '\\checkmark',
105
+ '\\supsetneq',
106
+ '\\bigotimes',
107
+ '\\pitchfork',
108
+ '\\mathbb{A}',
109
+ '\\underline',
110
+ '\\mathbb{O}',
111
+ '\\righarrow',
112
+ '\\hphantom',
113
+ '\\parallel',
114
+ '\\leqslant',
115
+ '\\underset',
116
+ '\\subseteq',
117
+ '\\bigsqcup',
118
+ '\\bigoplus',
119
+ '\\multiput',
120
+ '\\ulcorner',
121
+ '\\raisebox',
122
+ '\\bigwedge',
123
+ '\\emptyset',
124
+ '\\protectu',
125
+ '\\boxminus',
126
+ '\\buildrel',
127
+ '\\vartheta',
128
+ '\\overline',
129
+ '\\framebox',
130
+ '\\mathfrak',
131
+ '\\smallint',
132
+ '\\stackrel',
133
+ '\\sqsupset',
134
+ '\\llcorner',
135
+ '\\sqsubset',
136
+ '\\lrcorner',
137
+ '\\curlyvee',
138
+ '\\nonumber',
139
+ '\\substack',
140
+ '\\supseteq',
141
+ '\\varsigma',
142
+ '\\biguplus',
143
+ '\\triangle',
144
+ '\\setminus',
145
+ '\\vphantom',
146
+ '\\boldmath',
147
+ '\\geqslant',
148
+ '\\upsilon',
149
+ '\\diamond',
150
+ '\\epsilon',
151
+ '\\noalign',
152
+ '\\medskip',
153
+ '\\lessdot',
154
+ '\\bigcirc',
155
+ '\\protect',
156
+ '\\right\\|',
157
+ '\\boxplus',
158
+ '\\special',
159
+ '\\nearrow',
160
+ '\\right\\}',
161
+ '\\because',
162
+ '\\widehat',
163
+ '\\phantom',
164
+ '\\itshape',
165
+ '\\lessgtr',
166
+ '\\mathbin',
167
+ '\\lefteqn',
168
+ '\\backsim',
169
+ '\\makebox',
170
+ '\\mathscr',
171
+ '\\searrow',
172
+ '\\ooalign',
173
+ '\\Upsilon',
174
+ '\\natural',
175
+ '\\enspace',
176
+ '\\partial',
177
+ '\\uparrow',
178
+ '\\newline',
179
+ '\\ddagger',
180
+ '\\cooking',
181
+ '\\lesssim',
182
+ '\\nexists',
183
+ '\\bigodot',
184
+ '\\nwarrow',
185
+ '\\mathcal',
186
+ '\\mathrel',
187
+ '\\gtrless',
188
+ '\\vcenter',
189
+ '\\swarrow',
190
+ '\\fboxsep',
191
+ '\\omicron',
192
+ '\\leadsto',
193
+ '\\Subset',
194
+ '\\mathit',
195
+ '\\lambda',
196
+ '\\varphi',
197
+ '\\rbrace',
198
+ '\\textit',
199
+ '\\supset',
200
+ '\\textup',
201
+ '\\otimes',
202
+ '\\mathbf',
203
+ '\\right|',
204
+ '\\lbrace',
205
+ '\\forall',
206
+ '\\square',
207
+ '\\varrho',
208
+ '\\arctan',
209
+ '\\rtimes',
210
+ '\\right>',
211
+ '\\textrm',
212
+ '\\texttt',
213
+ '\\textsf',
214
+ '\\models',
215
+ '\\bigvee',
216
+ '\\langle',
217
+ '\\dagger',
218
+ '\\arccot',
219
+ '\\mapsto',
220
+ '\\succeq',
221
+ '\\lbrack',
222
+ '\\bullet',
223
+ '\\textbf',
224
+ '\\gtrsim',
225
+ '\\bigcap',
226
+ '\\nvdash',
227
+ '\\Lambda',
228
+ '\\arccos',
229
+ '\\lceilm',
230
+ '\\rgroup',
231
+ '\\coprod',
232
+ '\\ominus',
233
+ '\\approx',
234
+ '\\parbox',
235
+ '\\lfloor',
236
+ '\\bigcup',
237
+ '\\left\\{',
238
+ '\\oslash',
239
+ '\\enskip',
240
+ '\\ltimes',
241
+ '\\rangle',
242
+ '\\rfloor',
243
+ '\\propto',
244
+ '\\mathsf',
245
+ '\\lgroup',
246
+ '\\pounds',
247
+ '\\rbrack',
248
+ '\\mathrm',
249
+ '\\subset',
250
+ '\\left\\|',
251
+ '\\mathop',
252
+ '\\circle',
253
+ '\\right.',
254
+ '\\mathbb',
255
+ '\\exists',
256
+ '\\arcsin',
257
+ '\\prime',
258
+ '\\vline',
259
+ '\\cdots',
260
+ '\\varpi',
261
+ '\\notin',
262
+ '\\Theta',
263
+ '\\sqcup',
264
+ '\\left<',
265
+ '\\Gamma',
266
+ '\\theta',
267
+ '\\label',
268
+ '\\right',
269
+ '\\simeq',
270
+ '\\infty',
271
+ '\\mskip',
272
+ '\\Large',
273
+ '\\asymp',
274
+ '\\sharp',
275
+ '\\left.',
276
+ '\\dashv',
277
+ '\\lceil',
278
+ '\\omega',
279
+ '\\dddot',
280
+ '\\ldots',
281
+ '\\qquad',
282
+ '\\alpha',
283
+ '\\Biggm',
284
+ '\\check',
285
+ '\\smile',
286
+ '\\vDash',
287
+ '\\grave',
288
+ '\\nless',
289
+ '\\sqcap',
290
+ '\\delta',
291
+ '\\doteq',
292
+ '\\Omega',
293
+ '\\Biggl',
294
+ '\\mkern',
295
+ '\\Biggr',
296
+ '\\hline',
297
+ '\\LARGE',
298
+ '\\smash',
299
+ '\\small',
300
+ '\\aleph',
301
+ '\\equiv',
302
+ '\\tilde',
303
+ '\\space',
304
+ '\\rceil',
305
+ '\\raise',
306
+ '\\unlhd',
307
+ '\\binom',
308
+ '\\gamma',
309
+ '\\vskip',
310
+ '\\strut',
311
+ '\\biggl',
312
+ '\\colon',
313
+ '\\cdotp',
314
+ '\\amalg',
315
+ '\\kappa',
316
+ '\\hrule',
317
+ '\\jmath',
318
+ '\\lower',
319
+ '\\breve',
320
+ '\\sigma',
321
+ '\\vdash',
322
+ '\\wedge',
323
+ '\\large',
324
+ '\\nabla',
325
+ '\\hfill',
326
+ '\\Sigma',
327
+ '\\biggr',
328
+ '\\relax',
329
+ '\\Delta',
330
+ '\\begin',
331
+ '\\unrhd',
332
+ '\\acute',
333
+ '\\vdots',
334
+ '\\left|',
335
+ '\\oplus',
336
+ '\\slash',
337
+ '\\ddots',
338
+ '\\times',
339
+ '\\imath',
340
+ '\\angle',
341
+ '\\vrule',
342
+ '\\Huge',
343
+ '\\bmod',
344
+ '\\geqq',
345
+ '\\null',
346
+ '\\kern',
347
+ '\\nsim',
348
+ '\\nleq',
349
+ '\\Bigl',
350
+ '\\Perp',
351
+ '\\each',
352
+ '\\land',
353
+ '\\bigr',
354
+ '\\Vert',
355
+ '\\tanh',
356
+ '\\hfil',
357
+ '\\circ',
358
+ '\\prec',
359
+ '\\iota',
360
+ '\\odot',
361
+ '\\line',
362
+ '\\sent',
363
+ '\\prod',
364
+ '\\atop',
365
+ '\\iint',
366
+ '\\Reef',
367
+ '\\ngeq',
368
+ '\\cite',
369
+ '\\tiny',
370
+ '\\ddot',
371
+ '\\nmid',
372
+ '\\rlap',
373
+ '\\bigg',
374
+ '\\Bigm',
375
+ '\\flat',
376
+ '\\midf',
377
+ '\\cosh',
378
+ '\\llap',
379
+ '\\surd',
380
+ '\\perp',
381
+ '\\star',
382
+ '\\fbox',
383
+ '\\dots',
384
+ '\\vert',
385
+ '\\bigm',
386
+ '\\ddag',
387
+ '\\taxi',
388
+ '\\oint',
389
+ '\\frac',
390
+ '\\left',
391
+ '\\text',
392
+ '\\crcr',
393
+ '\\cong',
394
+ '\\zeta',
395
+ '\\ngtr',
396
+ '\\Bigg',
397
+ '\\pmod',
398
+ '\\skew',
399
+ '\\quad',
400
+ '\\Bigr',
401
+ '\\beta',
402
+ '\\leqq',
403
+ '\\hbar',
404
+ '\\bigl',
405
+ '\\sinh',
406
+ '\\sqrt',
407
+ '\\cdot',
408
+ '\\not',
409
+ '\\div',
410
+ '\\put',
411
+ '\\lll',
412
+ '\\tau',
413
+ '\\sum',
414
+ '\\lor',
415
+ '\\eta',
416
+ '\\min',
417
+ '\\rho',
418
+ '\\mid',
419
+ '\\neg',
420
+ '\\cao',
421
+ '\\end',
422
+ '\\vee',
423
+ '\\int',
424
+ '\\ref',
425
+ '\\geq',
426
+ '\\cap',
427
+ '\\hat',
428
+ '\\phi',
429
+ '\\mit',
430
+ '\\sin',
431
+ '\\cot',
432
+ '\\ggg',
433
+ '\\cos',
434
+ '\\mho',
435
+ '\\hss',
436
+ '\\Big',
437
+ '\\top',
438
+ '\\for',
439
+ '\\lim',
440
+ '\\eth',
441
+ '\\cal',
442
+ '\\dag',
443
+ '\\log',
444
+ '\\vss',
445
+ '\\arg',
446
+ '\\Phi',
447
+ '\\psi',
448
+ '\\dot',
449
+ '\\bar',
450
+ '\\leq',
451
+ '\\bot',
452
+ '\\ell',
453
+ '\\sec',
454
+ '\\tan',
455
+ '\\sim',
456
+ '\\Psi',
457
+ '\\dog',
458
+ '\\chi',
459
+ '\\big',
460
+ '\\cup',
461
+ '\\his',
462
+ '\\odd',
463
+ '\\ast',
464
+ '\\neq',
465
+ '\\max',
466
+ '\\vec',
467
+ '\\sup',
468
+ '\\le',
469
+ '\\bf',
470
+ '\\Em',
471
+ '\\lq',
472
+ '\\lg',
473
+ '\\sc',
474
+ '\\nu',
475
+ '\\ln',
476
+ '\\mp',
477
+ '\\Im',
478
+ '\\Pi',
479
+ '\\do',
480
+ '\\it',
481
+ '\\gt',
482
+ '\\wp',
483
+ '\\ll',
484
+ '\\ge',
485
+ '\\AA',
486
+ '\\tt',
487
+ '\\sp',
488
+ '\\ae',
489
+ '\\Xi',
490
+ '\\sf',
491
+ '\\mu',
492
+ '\\sb',
493
+ '\\ni',
494
+ '\\Re',
495
+ '\\rm',
496
+ '\\ss',
497
+ '\\pm',
498
+ '\\em',
499
+ '\\to',
500
+ '\\ne',
501
+ '\\in',
502
+ '\\tg',
503
+ '\\gg',
504
+ '\\xi',
505
+ '\\sl',
506
+ '\\pi',
507
+ '\\SS',
508
+ '\\%',
509
+ '\\S',
510
+ '\\A',
511
+ '\\P',
512
+ '\\_',
513
+ '\\:',
514
+ '\\;',
515
+ '\\*',
516
+ '\\$',
517
+ '\\>',
518
+ '\\x',
519
+ '\\/',
520
+ '\\+',
521
+ '\\}',
522
+ '\\B',
523
+ '\\&',
524
+ '\\R',
525
+ '\\l',
526
+ '\\L',
527
+ '\\|',
528
+ '\\,',
529
+ '\\{',
530
+ '\\n',
531
+ '\\m',
532
+ '\\=',
533
+ '\\z',
534
+ "\\'",
535
+ '\\!',
536
+ '\\<',
537
+ '\\Z',
538
+ '\\o',
539
+ '\\O',
540
+ '\\T',
541
+ '\\a',
542
+ '\\C',
543
+ '\\^',
544
+ '\\y',
545
+ '\\\\',
546
+ '\\i',
547
+ '\\c',
548
+ '\\#',
549
+ '\\-',
550
+ '\\N',
551
+ '\\b',
552
+ '\\j',
553
+ '\\d',
554
+ '\\Q',
555
+ ']',
556
+ '^',
557
+ '_',
558
+ '\\']
559
+
560
+ TOKEN_LATEX_VOC = ['\\ o v e r r i g h t a r r o w { \\ r m A B }',
561
+ '\\ o v e r l e f t r i g h t a r r o w',
562
+ '\\ L o n g l e f t r i g h t a r r o w',
563
+ '\\ l o n g l e f t r i g h t a r r o w',
564
+ '\\ s u p s e t n e q q \\ g t r s i m',
565
+ '\\ s c r i p t s c r i p t s t y l e',
566
+ '\\ r i g h t l e f t h a r p o o n s',
567
+ '\\ c i r c l e a r r o w r i g h t',
568
+ '\\ o v e r l i n e { \\ r m A B }',
569
+ '\\ r i g h t r i g h t a r r o w s',
570
+ '\\ l e f t r i g h t a r r o w s',
571
+ '\\ r i g h t l e f t a r r o w s',
572
+ '\\ c u r v e a r r o w r i g h t',
573
+ '\\ c i r c l e a r r o w l e f t',
574
+ '\\ b i g t r i a n g l e d o w n',
575
+ '\\ l e f t l e f t a r r o w s',
576
+ '\\ l o n g r i g h t a r r o w',
577
+ '\\ L e f t r i g h t a r r o w',
578
+ '\\ L o n g r i g h t a r r o w',
579
+ '\\ c u r v e a r r o w l e f t',
580
+ '\\ h o o k r i g h t a r r o w',
581
+ '\\ r i g h t h a r p o o n u p',
582
+ '\\ l e f t r i g h t a r r o w',
583
+ '\\ o v e r r i g h t a r r o w',
584
+ '\\ l o n g l e f t a r r o w',
585
+ '\\ b i g t r i a n g l e u p',
586
+ '\\ o v e r l e f t a r r o w',
587
+ '\\ t r i a n g l e r i g h t',
588
+ '\\ o p e r a t o r n a m e *',
589
+ '\\ h o o k l e f t a r r o w',
590
+ '\\ v a r s u b s e t n e q q',
591
+ '\\ L o n g l e f t a r r o w',
592
+ '\\ a r r a y s t r e t c h',
593
+ '\\ r i g h t \\ r f l o o r',
594
+ '\\ r i g h t \\ r b r a c e',
595
+ '\\ f o o t n o t e s i z e',
596
+ '\\ b e g i n { a r r a y }',
597
+ '\\ r i g h t \\ r a n g l e',
598
+ '\\ d i s p l a y s t y l e',
599
+ '\\ o p e r a t o r n a m e',
600
+ '\\ r e n e w c o m m a n d',
601
+ '\\ r i g h t \\ r b r a c k',
602
+ '\\ t r i a n g l e l e f t',
603
+ '\\ a r r a y c o l s e p',
604
+ '\\ u p d o w n a r r o w',
605
+ '\\ d i a m o n d s u i t',
606
+ '\\ l e f t \\ l b r a c e',
607
+ '\\ n o l i n e b r e a k',
608
+ '\\ s c r i p t s t y l e',
609
+ '\\ l e f t \\ l f l o o r',
610
+ '\\ l e f t \\ l a n g l e',
611
+ '\\ l e f t \\ l b r a c k',
612
+ '\\ m u l t i c o l u m n',
613
+ '\\ m a t h v e r s i o n',
614
+ '\\ n o t \\ s u p s e t',
615
+ '\\ c o m p l e m e n t',
616
+ '\\ u n i t l e n g t h',
617
+ '\\ t h i c k l i n e s',
618
+ '\\ r i g h t a r r o w',
619
+ '\\ v a r n o t h i n g',
620
+ '\\ r i g h t \\ v e r t',
621
+ '\\ s q s u p s e t e q',
622
+ '\\ s u p s e t n e q q',
623
+ '\\ n o t \\ s u b s e t',
624
+ '\\ s c r i p t s i z e',
625
+ '\\ R i g h t a r r o w',
626
+ '\\ b o l d s y m b o l',
627
+ '\\ e n s u r e m a t h',
628
+ '\\ c u r l y w e d g e',
629
+ '\\ t e x t n o r m a l',
630
+ '\\ n o r m a l s i z e',
631
+ '\\ e n d { a r r a y }',
632
+ '\\ s q s u b s e t e q',
633
+ '\\ v a r e p s i l o n',
634
+ '\\ s e t c o u n t e r',
635
+ '\\ u n b o l d m a t h',
636
+ '\\ u n d e r b r a c e',
637
+ '\\ c i r c l e d a s t',
638
+ '\\ l o n g m a p s t o',
639
+ '\\ n s u b s e t e q',
640
+ '\\ b a c k s l a s h',
641
+ '\\ t h i n s p a c e',
642
+ '\\ s m a l l s k i p',
643
+ '\\ l e f t a r r o w',
644
+ '\\ l l b r a c k e t',
645
+ '\\ L e f t a r r o w',
646
+ '\\ s e t l e n g t h',
647
+ '\\ t r i a n g l e q',
648
+ '\\ a r r o w v e r t',
649
+ '\\ s u b s e t n e q',
650
+ '\\ s u p s e t e q q',
651
+ '\\ m a t h b b { H }',
652
+ '\\ v a r l i m s u p',
653
+ '\\ l e f t \\ v e r t',
654
+ '\\ d o w n a r r o w',
655
+ '\\ m a t h b b { S }',
656
+ '\\ n p a r a l l e l',
657
+ '\\ w i d e t i l d e',
658
+ '\\ D o w n a r r o w',
659
+ '\\ n s u p s e t e q',
660
+ '\\ o v e r b r a c e',
661
+ '\\ t e x t s t y l e',
662
+ '\\ v a r l i m i n f',
663
+ '\\ c h e c k m a r k',
664
+ '\\ s u p s e t n e q',
665
+ '\\ b i g o t i m e s',
666
+ '\\ p i t c h f o r k',
667
+ '\\ m a t h b b { A }',
668
+ '\\ u n d e r l i n e',
669
+ '\\ m a t h b b { O }',
670
+ '\\ r i g h a r r o w',
671
+ '\\ h p h a n t o m',
672
+ '\\ p a r a l l e l',
673
+ '\\ l e q s l a n t',
674
+ '\\ u n d e r s e t',
675
+ '\\ s u b s e t e q',
676
+ '\\ b i g s q c u p',
677
+ '\\ b i g o p l u s',
678
+ '\\ m u l t i p u t',
679
+ '\\ u l c o r n e r',
680
+ '\\ r a i s e b o x',
681
+ '\\ b i g w e d g e',
682
+ '\\ e m p t y s e t',
683
+ '\\ p r o t e c t u',
684
+ '\\ b o x m i n u s',
685
+ '\\ b u i l d r e l',
686
+ '\\ v a r t h e t a',
687
+ '\\ o v e r l i n e',
688
+ '\\ f r a m e b o x',
689
+ '\\ m a t h f r a k',
690
+ '\\ s m a l l i n t',
691
+ '\\ s t a c k r e l',
692
+ '\\ s q s u p s e t',
693
+ '\\ l l c o r n e r',
694
+ '\\ s q s u b s e t',
695
+ '\\ l r c o r n e r',
696
+ '\\ c u r l y v e e',
697
+ '\\ n o n u m b e r',
698
+ '\\ s u b s t a c k',
699
+ '\\ s u p s e t e q',
700
+ '\\ v a r s i g m a',
701
+ '\\ b i g u p l u s',
702
+ '\\ t r i a n g l e',
703
+ '\\ s e t m i n u s',
704
+ '\\ v p h a n t o m',
705
+ '\\ b o l d m a t h',
706
+ '\\ g e q s l a n t',
707
+ '\\ u p s i l o n',
708
+ '\\ d i a m o n d',
709
+ '\\ e p s i l o n',
710
+ '\\ n o a l i g n',
711
+ '\\ m e d s k i p',
712
+ '\\ l e s s d o t',
713
+ '\\ b i g c i r c',
714
+ '\\ p r o t e c t',
715
+ '\\ r i g h t \\ |',
716
+ '\\ b o x p l u s',
717
+ '\\ s p e c i a l',
718
+ '\\ n e a r r o w',
719
+ '\\ r i g h t \\ }',
720
+ '\\ b e c a u s e',
721
+ '\\ w i d e h a t',
722
+ '\\ p h a n t o m',
723
+ '\\ i t s h a p e',
724
+ '\\ l e s s g t r',
725
+ '\\ m a t h b i n',
726
+ '\\ l e f t e q n',
727
+ '\\ b a c k s i m',
728
+ '\\ m a k e b o x',
729
+ '\\ m a t h s c r',
730
+ '\\ s e a r r o w',
731
+ '\\ o o a l i g n',
732
+ '\\ U p s i l o n',
733
+ '\\ n a t u r a l',
734
+ '\\ e n s p a c e',
735
+ '\\ p a r t i a l',
736
+ '\\ u p a r r o w',
737
+ '\\ n e w l i n e',
738
+ '\\ d d a g g e r',
739
+ '\\ c o o k i n g',
740
+ '\\ l e s s s i m',
741
+ '\\ n e x i s t s',
742
+ '\\ b i g o d o t',
743
+ '\\ n w a r r o w',
744
+ '\\ m a t h c a l',
745
+ '\\ m a t h r e l',
746
+ '\\ g t r l e s s',
747
+ '\\ v c e n t e r',
748
+ '\\ s w a r r o w',
749
+ '\\ f b o x s e p',
750
+ '\\ o m i c r o n',
751
+ '\\ l e a d s t o',
752
+ '\\ S u b s e t',
753
+ '\\ m a t h i t',
754
+ '\\ l a m b d a',
755
+ '\\ v a r p h i',
756
+ '\\ r b r a c e',
757
+ '\\ t e x t i t',
758
+ '\\ s u p s e t',
759
+ '\\ t e x t u p',
760
+ '\\ o t i m e s',
761
+ '\\ m a t h b f',
762
+ '\\ r i g h t |',
763
+ '\\ l b r a c e',
764
+ '\\ f o r a l l',
765
+ '\\ s q u a r e',
766
+ '\\ v a r r h o',
767
+ '\\ a r c t a n',
768
+ '\\ r t i m e s',
769
+ '\\ r i g h t >',
770
+ '\\ t e x t r m',
771
+ '\\ t e x t t t',
772
+ '\\ t e x t s f',
773
+ '\\ m o d e l s',
774
+ '\\ b i g v e e',
775
+ '\\ l a n g l e',
776
+ '\\ d a g g e r',
777
+ '\\ a r c c o t',
778
+ '\\ m a p s t o',
779
+ '\\ s u c c e q',
780
+ '\\ l b r a c k',
781
+ '\\ b u l l e t',
782
+ '\\ t e x t b f',
783
+ '\\ g t r s i m',
784
+ '\\ b i g c a p',
785
+ '\\ n v d a s h',
786
+ '\\ L a m b d a',
787
+ '\\ a r c c o s',
788
+ '\\ l c e i l m',
789
+ '\\ r g r o u p',
790
+ '\\ c o p r o d',
791
+ '\\ o m i n u s',
792
+ '\\ a p p r o x',
793
+ '\\ p a r b o x',
794
+ '\\ l f l o o r',
795
+ '\\ b i g c u p',
796
+ '\\ l e f t \\ {',
797
+ '\\ o s l a s h',
798
+ '\\ e n s k i p',
799
+ '\\ l t i m e s',
800
+ '\\ r a n g l e',
801
+ '\\ r f l o o r',
802
+ '\\ p r o p t o',
803
+ '\\ m a t h s f',
804
+ '\\ l g r o u p',
805
+ '\\ p o u n d s',
806
+ '\\ r b r a c k',
807
+ '\\ m a t h r m',
808
+ '\\ s u b s e t',
809
+ '\\ l e f t \\ |',
810
+ '\\ m a t h o p',
811
+ '\\ c i r c l e',
812
+ '\\ r i g h t .',
813
+ '\\ m a t h b b',
814
+ '\\ e x i s t s',
815
+ '\\ a r c s i n',
816
+ '\\ p r i m e',
817
+ '\\ v l i n e',
818
+ '\\ c d o t s',
819
+ '\\ v a r p i',
820
+ '\\ n o t i n',
821
+ '\\ T h e t a',
822
+ '\\ s q c u p',
823
+ '\\ l e f t <',
824
+ '\\ G a m m a',
825
+ '\\ t h e t a',
826
+ '\\ l a b e l',
827
+ '\\ r i g h t',
828
+ '\\ s i m e q',
829
+ '\\ i n f t y',
830
+ '\\ m s k i p',
831
+ '\\ L a r g e',
832
+ '\\ a s y m p',
833
+ '\\ s h a r p',
834
+ '\\ l e f t .',
835
+ '\\ d a s h v',
836
+ '\\ l c e i l',
837
+ '\\ o m e g a',
838
+ '\\ d d d o t',
839
+ '\\ l d o t s',
840
+ '\\ q q u a d',
841
+ '\\ a l p h a',
842
+ '\\ B i g g m',
843
+ '\\ c h e c k',
844
+ '\\ s m i l e',
845
+ '\\ v D a s h',
846
+ '\\ g r a v e',
847
+ '\\ n l e s s',
848
+ '\\ s q c a p',
849
+ '\\ d e l t a',
850
+ '\\ d o t e q',
851
+ '\\ O m e g a',
852
+ '\\ B i g g l',
853
+ '\\ m k e r n',
854
+ '\\ B i g g r',
855
+ '\\ h l i n e',
856
+ '\\ L A R G E',
857
+ '\\ s m a s h',
858
+ '\\ s m a l l',
859
+ '\\ a l e p h',
860
+ '\\ e q u i v',
861
+ '\\ t i l d e',
862
+ '\\ s p a c e',
863
+ '\\ r c e i l',
864
+ '\\ r a i s e',
865
+ '\\ u n l h d',
866
+ '\\ b i n o m',
867
+ '\\ g a m m a',
868
+ '\\ v s k i p',
869
+ '\\ s t r u t',
870
+ '\\ b i g g l',
871
+ '\\ c o l o n',
872
+ '\\ c d o t p',
873
+ '\\ a m a l g',
874
+ '\\ k a p p a',
875
+ '\\ h r u l e',
876
+ '\\ j m a t h',
877
+ '\\ l o w e r',
878
+ '\\ b r e v e',
879
+ '\\ s i g m a',
880
+ '\\ v d a s h',
881
+ '\\ w e d g e',
882
+ '\\ l a r g e',
883
+ '\\ n a b l a',
884
+ '\\ h f i l l',
885
+ '\\ S i g m a',
886
+ '\\ b i g g r',
887
+ '\\ r e l a x',
888
+ '\\ D e l t a',
889
+ '\\ b e g i n',
890
+ '\\ u n r h d',
891
+ '\\ a c u t e',
892
+ '\\ v d o t s',
893
+ '\\ l e f t |',
894
+ '\\ o p l u s',
895
+ '\\ s l a s h',
896
+ '\\ d d o t s',
897
+ '\\ t i m e s',
898
+ '\\ i m a t h',
899
+ '\\ a n g l e',
900
+ '\\ v r u l e',
901
+ '\\ H u g e',
902
+ '\\ b m o d',
903
+ '\\ g e q q',
904
+ '\\ n u l l',
905
+ '\\ k e r n',
906
+ '\\ n s i m',
907
+ '\\ n l e q',
908
+ '\\ B i g l',
909
+ '\\ P e r p',
910
+ '\\ e a c h',
911
+ '\\ l a n d',
912
+ '\\ b i g r',
913
+ '\\ V e r t',
914
+ '\\ t a n h',
915
+ '\\ h f i l',
916
+ '\\ c i r c',
917
+ '\\ p r e c',
918
+ '\\ i o t a',
919
+ '\\ o d o t',
920
+ '\\ l i n e',
921
+ '\\ s e n t',
922
+ '\\ p r o d',
923
+ '\\ a t o p',
924
+ '\\ i i n t',
925
+ '\\ R e e f',
926
+ '\\ n g e q',
927
+ '\\ c i t e',
928
+ '\\ t i n y',
929
+ '\\ d d o t',
930
+ '\\ n m i d',
931
+ '\\ r l a p',
932
+ '\\ b i g g',
933
+ '\\ B i g m',
934
+ '\\ f l a t',
935
+ '\\ m i d f',
936
+ '\\ c o s h',
937
+ '\\ l l a p',
938
+ '\\ s u r d',
939
+ '\\ p e r p',
940
+ '\\ s t a r',
941
+ '\\ f b o x',
942
+ '\\ d o t s',
943
+ '\\ v e r t',
944
+ '\\ b i g m',
945
+ '\\ d d a g',
946
+ '\\ t a x i',
947
+ '\\ o i n t',
948
+ '\\ f r a c',
949
+ '\\ l e f t',
950
+ '\\ t e x t',
951
+ '\\ c r c r',
952
+ '\\ c o n g',
953
+ '\\ z e t a',
954
+ '\\ n g t r',
955
+ '\\ B i g g',
956
+ '\\ p m o d',
957
+ '\\ s k e w',
958
+ '\\ q u a d',
959
+ '\\ B i g r',
960
+ '\\ b e t a',
961
+ '\\ l e q q',
962
+ '\\ h b a r',
963
+ '\\ b i g l',
964
+ '\\ s i n h',
965
+ '\\ s q r t',
966
+ '\\ c d o t',
967
+ '\\ n o t',
968
+ '\\ d i v',
969
+ '\\ p u t',
970
+ '\\ l l l',
971
+ '\\ t a u',
972
+ '\\ s u m',
973
+ '\\ l o r',
974
+ '\\ e t a',
975
+ '\\ m i n',
976
+ '\\ r h o',
977
+ '\\ m i d',
978
+ '\\ n e g',
979
+ '\\ c a o',
980
+ '\\ e n d',
981
+ '\\ v e e',
982
+ '\\ i n t',
983
+ '\\ r e f',
984
+ '\\ g e q',
985
+ '\\ c a p',
986
+ '\\ h a t',
987
+ '\\ p h i',
988
+ '\\ m i t',
989
+ '\\ s i n',
990
+ '\\ c o t',
991
+ '\\ g g g',
992
+ '\\ c o s',
993
+ '\\ m h o',
994
+ '\\ h s s',
995
+ '\\ B i g',
996
+ '\\ t o p',
997
+ '\\ f o r',
998
+ '\\ l i m',
999
+ '\\ e t h',
1000
+ '\\ c a l',
1001
+ '\\ d a g',
1002
+ '\\ l o g',
1003
+ '\\ v s s',
1004
+ '\\ a r g',
1005
+ '\\ P h i',
1006
+ '\\ p s i',
1007
+ '\\ d o t',
1008
+ '\\ b a r',
1009
+ '\\ l e q',
1010
+ '\\ b o t',
1011
+ '\\ e l l',
1012
+ '\\ s e c',
1013
+ '\\ t a n',
1014
+ '\\ s i m',
1015
+ '\\ P s i',
1016
+ '\\ d o g',
1017
+ '\\ c h i',
1018
+ '\\ b i g',
1019
+ '\\ c u p',
1020
+ '\\ h i s',
1021
+ '\\ o d d',
1022
+ '\\ a s t',
1023
+ '\\ n e q',
1024
+ '\\ m a x',
1025
+ '\\ v e c',
1026
+ '\\ s u p',
1027
+ '\\ l e',
1028
+ '\\ b f',
1029
+ '\\ E m',
1030
+ '\\ l q',
1031
+ '\\ l g',
1032
+ '\\ s c',
1033
+ '\\ n u',
1034
+ '\\ l n',
1035
+ '\\ m p',
1036
+ '\\ I m',
1037
+ '\\ P i',
1038
+ '\\ d o',
1039
+ '\\ i t',
1040
+ '\\ g t',
1041
+ '\\ w p',
1042
+ '\\ l l',
1043
+ '\\ g e',
1044
+ '\\ A A',
1045
+ '\\ t t',
1046
+ '\\ s p',
1047
+ '\\ a e',
1048
+ '\\ X i',
1049
+ '\\ s f',
1050
+ '\\ m u',
1051
+ '\\ s b',
1052
+ '\\ n i',
1053
+ '\\ R e',
1054
+ '\\ r m',
1055
+ '\\ s s',
1056
+ '\\ p m',
1057
+ '\\ e m',
1058
+ '\\ t o',
1059
+ '\\ n e',
1060
+ '\\ i n',
1061
+ '\\ t g',
1062
+ '\\ g g',
1063
+ '\\ x i',
1064
+ '\\ s l',
1065
+ '\\ p i',
1066
+ '\\ S S',
1067
+ '\\ %',
1068
+ '\\ S',
1069
+ '\\ A',
1070
+ '\\ P',
1071
+ '\\ _',
1072
+ '\\ :',
1073
+ '\\ ;',
1074
+ '\\ *',
1075
+ '\\ $',
1076
+ '\\ >',
1077
+ '\\ x',
1078
+ '\\ /',
1079
+ '\\ +',
1080
+ '\\ }',
1081
+ '\\ B',
1082
+ '\\ &',
1083
+ '\\ R',
1084
+ '\\ l',
1085
+ '\\ L',
1086
+ '\\ |',
1087
+ '\\ ,',
1088
+ '\\ {',
1089
+ '\\ n',
1090
+ '\\ m',
1091
+ '\\ =',
1092
+ '\\ z',
1093
+ "\\ '",
1094
+ '\\ !',
1095
+ '\\ <',
1096
+ '\\ Z',
1097
+ '\\ o',
1098
+ '\\ O',
1099
+ '\\ T',
1100
+ '\\ a',
1101
+ '\\ C',
1102
+ '\\ ^',
1103
+ '\\ y',
1104
+ '\\ \\',
1105
+ '\\ i',
1106
+ '\\ c',
1107
+ '\\ #',
1108
+ '\\ -',
1109
+ '\\ N',
1110
+ '\\ b',
1111
+ '\\ j',
1112
+ '\\ d',
1113
+ '\\ Q',
1114
+ ']',
1115
+ '^',
1116
+ '_',
1117
+ '\\']
1118
+
1119
+
1120
+ NORMALIZE_WORD_DICT = {
1121
+ "”": '\\"',
1122
+ "’": "'",
1123
+ "‘": "'",
1124
+ "–": "-",
1125
+ "“": '\\"',
1126
+ "с": "c",
1127
+ "а": "a",
1128
+ "о": "o",
1129
+ "Н": "H",
1130
+ "у": "y",
1131
+ "О": "0",
1132
+ "М": "M",
1133
+ "Α": "A",
1134
+ "А": "A",
1135
+ "Р": "P",
1136
+ "Т": "T",
1137
+ "р": "p",
1138
+ "З": "3",
1139
+ "х": "x",
1140
+ "е": "e",
1141
+ ":": ":",
1142
+ "Η": "H",
1143
+ "(": "(",
1144
+ "К": "K",
1145
+ "Ү": "Y",
1146
+ "б": "6",
1147
+ "․": ".",
1148
+ "В": "B",
1149
+ "С": "C",
1150
+ "Ρ": "P",
1151
+ "і": "i",
1152
+ "г": "r",
1153
+ "Κ": "K",
1154
+ "Х": "X",
1155
+ "Ι": "I",
1156
+ ",": ",",
1157
+ "Τ": "T",
1158
+ "Ј": "J",
1159
+ "У": "y",
1160
+ "๐": "o",
1161
+ "ı": "i",
1162
+ "в": "B",
1163
+ "Β": "B",
1164
+ "һ": "h",
1165
+ "ν": "v",
1166
+ "Τ": "T",
1167
+ "Е": "E",
1168
+ "Ј": "J",
1169
+ "Ι": "I",
1170
+ "Ρ": "P",
1171
+ "У": "y",
1172
+ "І": "I",
1173
+ "Ј": "J",
1174
+ "Ι": "I",
1175
+ "×": "x",
1176
+ "†": "t",
1177
+ "̃": "",
1178
+ " ̣": "",
1179
+ "`": "'",
1180
+ # Latex
1181
+ r"\\left[": r"[",
1182
+ r"\\right]": r"]",
1183
+ r"\\left\\{": r"\\{",
1184
+ r"\\right\\}": r"\\}",
1185
+ r"\\left|": "|",
1186
+ r"\\right|": "|",
1187
+ r"\\left(": r"(",
1188
+ r"\\right)": r")",
1189
+ r"\\left.": r"",
1190
+ r"\\right.": r" ",
1191
+ r"\\cdot": r".",
1192
+ r"\\prime": r"'",
1193
+ r"\\Delta": r"\\triangle",
1194
+ r"\\circ": "o",
1195
+ r"\\bigtriangleup": r"\\triangle",
1196
+ r"\\left ": r" ",
1197
+ r"\\right ": r" ",
1198
+ r"\\le ": r"\\leq ",
1199
+ r"\\ge ": r"\\geq ",
1200
+ r"\\setminus": r"\\backslash",
1201
+ r"$\\bullet$": r"",
1202
+ r"\\therefore": r"",
1203
+ r"\\otimes": r"",
1204
+ r"\\rightarrowI": r"\\rightarrow",
1205
+ r"\\overparen": r"\\widehat",
1206
+ r"\\triangleABC": r"\\triangle ABC",
1207
+ r"\\rightrightarrows": r"\\rightarrow",
1208
+ r"\\top": r"'",
1209
+ r"\\dagger": r"'",
1210
+ r"\\bigoplus": r"Đ",
1211
+ r"\\oplus": r"Đ",
1212
+ r"\\doteq": r"=",
1213
+ r"\\bigcap": r"\\cap",
1214
+ r"\\I": r"I",
1215
+ r"\\l ": r"I",
1216
+ r"\\kappa": r"K",
1217
+ # r"\n": "",
1218
+ }
1219
+
1220
+
1221
+ CHARACTER_SET = {
1222
+ "$",
1223
+ "\\",
1224
+ " ",
1225
+ "n",
1226
+ "h",
1227
+ "t",
1228
+ "e",
1229
+ "i",
1230
+ "a",
1231
+ "o",
1232
+ "c",
1233
+ ".",
1234
+ "s",
1235
+ "r",
1236
+ "g",
1237
+ "u",
1238
+ "l",
1239
+ "m",
1240
+ "d",
1241
+ "y",
1242
+ "p",
1243
+ "đ",
1244
+ "C",
1245
+ "v",
1246
+ "b",
1247
+ "à",
1248
+ "k",
1249
+ "w",
1250
+ "f",
1251
+ "á",
1252
+ "â",
1253
+ "B",
1254
+ "1",
1255
+ "T",
1256
+ "ư",
1257
+ "A",
1258
+ ")",
1259
+ ",",
1260
+ "D",
1261
+ "Ð",
1262
+ "(",
1263
+ ":",
1264
+ "2",
1265
+ "I",
1266
+ "ệ",
1267
+ "?",
1268
+ "ế",
1269
+ "ạ",
1270
+ "ộ",
1271
+ "3",
1272
+ "ó",
1273
+ "ủ",
1274
+ "ì",
1275
+ "ô",
1276
+ "ả",
1277
+ "ố",
1278
+ "H",
1279
+ "ể",
1280
+ "ớ",
1281
+ "ê",
1282
+ "N",
1283
+ "0",
1284
+ "4",
1285
+ "ề",
1286
+ "í",
1287
+ "x",
1288
+ "5",
1289
+ "ậ",
1290
+ "ị",
1291
+ "M",
1292
+ "ờ",
1293
+ "ấ",
1294
+ "ơ",
1295
+ "S",
1296
+ "'",
1297
+ "ợ",
1298
+ "q",
1299
+ "ọ",
1300
+ "V",
1301
+ "ứ",
1302
+ "-",
1303
+ "Đ",
1304
+ "ú",
1305
+ "ự",
1306
+ "W",
1307
+ "6",
1308
+ "P",
1309
+ "9",
1310
+ "L",
1311
+ "ầ",
1312
+ "7",
1313
+ "8",
1314
+ "ã",
1315
+ "E",
1316
+ "Ẹ",
1317
+ "ă",
1318
+ '"',
1319
+ "ụ",
1320
+ "ữ",
1321
+ "ồ",
1322
+ "G",
1323
+ "ở",
1324
+ "K",
1325
+ "ỉ",
1326
+ "ò",
1327
+ "ẳ",
1328
+ "ắ",
1329
+ "ừ",
1330
+ "ù",
1331
+ "ằ",
1332
+ "R",
1333
+ "O",
1334
+ "ặ",
1335
+ "Q",
1336
+ "ổ",
1337
+ "ĩ",
1338
+ "X",
1339
+ "ỏ",
1340
+ "Y",
1341
+ "ử",
1342
+ "F",
1343
+ "é",
1344
+ "U",
1345
+ "j",
1346
+ "ý",
1347
+ "/",
1348
+ "ẽ",
1349
+ "z",
1350
+ "ỗ",
1351
+ "J",
1352
+ "ễ",
1353
+ "ẩ",
1354
+ "ũ",
1355
+ "Á",
1356
+ "ẹ",
1357
+ "=",
1358
+ "ẫ",
1359
+ "ẻ",
1360
+ "Ẻ",
1361
+ ";",
1362
+ "!",
1363
+ ">",
1364
+ "è",
1365
+ "È",
1366
+ "Ẽ",
1367
+ "[",
1368
+ "]",
1369
+ "*",
1370
+ "À",
1371
+ "Â",
1372
+ "Ẫ",
1373
+ "+",
1374
+ "õ",
1375
+ "Õ",
1376
+ "ỡ",
1377
+ "Ậ",
1378
+ "Ô",
1379
+ "Ỗ",
1380
+ "Ở",
1381
+ "Ỡ",
1382
+ "ỹ",
1383
+ "Ỹ",
1384
+ "^",
1385
+ "ẵ",
1386
+ "ỳ",
1387
+ "Ă",
1388
+ "Ẵ",
1389
+ "|",
1390
+ "Ệ",
1391
+ "Ạ",
1392
+ "Ư",
1393
+ "Ố",
1394
+ "Ý",
1395
+ "Ọ",
1396
+ "Ấ",
1397
+ "ỷ",
1398
+ "Ê",
1399
+ "Ế",
1400
+ "Ầ",
1401
+ "Ề",
1402
+ "Ự",
1403
+ "Í",
1404
+ "Ả",
1405
+ "Ụ",
1406
+ "Ộ",
1407
+ "Z",
1408
+ "Ơ",
1409
+ "}",
1410
+ "Ể",
1411
+ "Ú",
1412
+ "{",
1413
+ "%",
1414
+ "Ị",
1415
+ "Ắ",
1416
+ "_",
1417
+ "Ủ",
1418
+ "Ó",
1419
+ "Ì",
1420
+ "<",
1421
+ "°",
1422
+ "Ớ",
1423
+ "å",
1424
+ "̣",
1425
+ "Ĩ",
1426
+ "~",
1427
+ "Ờ",
1428
+ "Ợ",
1429
+ "̉",
1430
+ "Ỏ",
1431
+ "•",
1432
+ "Ữ",
1433
+ "ā",
1434
+ "&",
1435
+ "Ứ",
1436
+ "Ổ",
1437
+ "Ã",
1438
+ "Ồ",
1439
+ "Ễ",
1440
+ "Ử",
1441
+ "Ẩ",
1442
+ "#",
1443
+ "Ù",
1444
+ "Ò",
1445
+ "É",
1446
+ "ӑ",
1447
+ "\n",
1448
+ "Ặ",
1449
+ "Ũ",
1450
+ "Ỳ",
1451
+ "Ừ",
1452
+ "Ẳ",
1453
+ "Ỷ",
1454
+ "Ằ",
1455
+ }
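
TOKEN_LATEX_VOC appears to be LATEX_VOC with each entry split into space-separated characters. A minimal sketch of how such a list could be regenerated, assuming it runs alongside const.py where LATEX_VOC is defined; the whitespace-collapsing detail is an assumption inferred from entries such as '\\overline{\\rm AB}':

```python
# Hypothetical helper (not part of const.py): derive a character-spaced list
# like TOKEN_LATEX_VOC from LATEX_VOC.
import re

def space_out(token: str) -> str:
    # Put a space between every character, then collapse whitespace runs so
    # "\overline{\rm AB}" becomes "\ o v e r l i n e { \ r m A B }".
    return re.sub(r"\s+", " ", " ".join(token)).strip()

TOKEN_LATEX_VOC_GENERATED = [space_out(tok) for tok in LATEX_VOC]
```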
latex2operatortree.py ADDED
@@ -0,0 +1,20 @@
1
+ from lxml import etree
2
+ import latex2mathml.converter
3
+
4
+
5
+ def mathml_to_operator_tree(node):
6
+ """Convert MathML to an operator tree."""
7
+ # If the node is a leaf node, return the operator as a string
8
+ if len(node) == 0:
9
+ return node.text
10
+ # Otherwise, recursively build the operator tree for each child node
11
+ operator_tree = ""
12
+ for child in node:
13
+ operator_tree += str(child.tag).split('}')[-1].replace('m', '') +"(" + str(mathml_to_operator_tree(child)) + ')'
14
+ return operator_tree
15
+
16
+ def latex2tree(latex_text):
17
+ mathml = latex2mathml.converter.convert(latex_text)
18
+ root = etree.fromstring(mathml.encode())
19
+ operator_tree = mathml_to_operator_tree(root[0])
20
+ return operator_tree
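
A hypothetical usage sketch for latex2operatortree.py (not part of the commit); the exact output string depends on the MathML that latex2mathml emits for the input:

```python
from latex2operatortree import latex2tree

# convert() turns the LaTeX into MathML, which is then flattened into a nested
# "tag(children)" string; every "m" is stripped from the MathML tag names, so
# "x^2" comes out roughly as "sup(i(x)n(2))".
print(latex2tree("x^2"))
```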
test.py ADDED
@@ -0,0 +1,15 @@
1
+ import re
2
+ import py_vncorenlp
3
+ from const import *
4
+ from pprint import pprint
5
+ from latex2operatortree import *
6
+ from transformers import AutoTokenizer, AutoModel
7
+ # text = "Trong các hình vẽ sau $y=\dfrac{x+1}{-x+1}$, hình nào biểu diễn đồ thị của hàm số $y=x^3$, $y=x^5$?"
8
+ # pattern = r'\$.*?\$'
9
+
10
+ # equations = re.findall(pattern, text)
11
+
12
+ # pprint(latex2tree(text))
13
+ dir = 'code/'
14
+ py_vncorenlp.download_model(save_dir=dir)
15
+ model = py_vncorenlp.VnCoreNLP(save_dir='code/')
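
A hedged sketch of the pipeline test.py appears to be building toward, based on its commented-out lines: pull the $...$ spans out of a question, convert them to operator trees, and word-segment the remaining Vietnamese text. The annotators argument and the word_segment call follow the documented py_vncorenlp usage and are assumptions here, not part of the commit:

```python
import re
import py_vncorenlp
from latex2operatortree import latex2tree

# Vietnamese math question (shortened from the commented-out example above).
text = "Trong các hình vẽ sau, hình nào biểu diễn đồ thị của hàm số $y=x^3$?"
pattern = r"\$.*?\$"

# Operator trees for every $...$ span (dollar signs stripped before parsing).
equations = [latex2tree(eq.strip("$")) for eq in re.findall(pattern, text)]

# Word-segment the non-formula text with VnCoreNLP's "wseg" annotator;
# assumes the model was already downloaded to code/ (as in test.py above).
segmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir="code/")
segmented = segmenter.word_segment(re.sub(pattern, " ", text))
```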
tokenizer.py ADDED
@@ -0,0 +1,349 @@
1
+ '''
2
+ BERT tokenizer customized for math (PhoBERT tokenizer with an added LaTeX vocabulary)
3
+ '''
4
+
5
+ import os
6
+ import re
7
+ from const import *
8
+ from shutil import copyfile  # library for copying a file along with its file info
9
+ from typing import List, Optional, Tuple
10
+
11
+ from transformers.tokenization_utils import PreTrainedTokenizer
12
+ from transformers.utils import logging  # log output instead of having to print
13
+
14
+ logger = logging.get_logger(__name__)
15
+
16
+ VOCAB_FILES_NAMES = {
17
+ 'vocab_file': 'vocab.txt',
18
+ 'merges_file': 'bpe.codes',
19
+ }
20
+
21
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
22
+ "vinai/phobert-base": 256,
23
+ "vinai/phobert-large": 256,
24
+ }
25
+
26
+
27
+ def get_pairs(word):
28
+ '''
29
+ Return set of symbol pairs in a word.
30
+
31
+ Word is represented as tuple of symbols (symbols being variable-length strings).
32
+ '''
33
+
34
+ pairs = set()
35
+ prev_char = word[0]
36
+ for char in word[1:]:
37
+ pairs.add((prev_char, char))
38
+ prev_char = char
39
+
40
+ pairs = set(pairs)
41
+ return pairs
42
+
43
+
44
+ class PhobertTokenizer(PreTrainedTokenizer):
45
+ '''
46
+ Construct a PhoBERT tokenizer. Based on Byte-Pair-Encoding.
47
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
48
+ this superclass for more information regarding those methods.
49
+
50
+
51
+
52
+ Args:
53
+ vocab_file (`str`):
54
+ Path to the vocabulary file.
55
+ merges_file (`str`):
56
+ Path to the merges file.
57
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
59
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
59
+ <Tip>
60
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
61
+ sequence. The token used is the `cls_token`.
62
+ </Tip>
63
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
64
+ The end of sequence token.
65
+ <Tip>
66
+ When building a sequence using special tokens, this is not the token that is used for the end of sequence.
67
+ The token used is the `sep_token`.
68
+ </Tip>
69
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
70
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
71
+ sequence classification or for a text and a question for question answering. It is also used as the last
72
+ token of a sequence built with special tokens.
73
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
74
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
75
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
76
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
77
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
78
+ token instead.
79
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
80
+ The token used for padding, for example when batching sequences of different lengths.
81
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
82
+ The token used for masking values. This is the token used when training this model with masked language
83
+ modeling. This is the token which the model will try to predict.
84
+ '''
85
+ vocab_files_names = VOCAB_FILES_NAMES
86
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
87
+ model_input_names = ["input_ids", "attention_mask"]
88
+
89
+ def __init__(self,
90
+ vocab_file,
91
+ merges_file,
92
+ bos_token="<s>",
93
+ eos_token="</s>",
94
+ sep_token="</s>",
95
+ cls_token="<s>",
96
+ unk_token="<unk>",
97
+ pad_token="<pad>",
98
+ mask_token="<mask>",
99
+ **kwargs):
100
+ super().__init__(
101
+ bos_token=bos_token,
102
+ eos_token=eos_token,
103
+ unk_token=unk_token,
104
+ sep_token=sep_token,
105
+ cls_token=cls_token,
106
+ pad_token=pad_token,
107
+ mask_token=mask_token,
108
+ **kwargs,
109
+ )
110
+
111
+ self.vocab_file = vocab_file
112
+ self.merges_file = merges_file
113
+
114
+ self.encoder = {}
115
+ self.encoder[self.bos_token] = 0
116
+ self.encoder[self.pad_token] = 1
117
+ self.encoder[self.eos_token] = 2
118
+ self.encoder[self.unk_token] = 3
119
+
120
+ self.add_from_file(vocab_file)
121
+ self.encoder[self.mask_token] = len(self.encoder)
122
+
123
+ self.decoder = {v : k for k, v in self.encoder.items()}
124
+
125
+
126
+ with open(merges_file, encoding="utf-8") as merges_handle:
127
+ merges = merges_handle.read().split("\n")[:-1]
128
+ merges = [tuple(merge.split()[:-1]) for merge in merges]
129
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
130
+ self.cache = {}
131
+
132
+ def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
133
+
134
+ """
135
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
136
+ adding special tokens. A PhoBERT sequence has the following format:
137
+ - single sequence: `<s> X </s>`
138
+ - pair of sequences: `<s> A </s></s> B </s>`
139
+ Args:
140
+ token_ids_0 (`List[int]`):
141
+ List of IDs to which the special tokens will be added.
142
+ token_ids_1 (`List[int]`, *optional*):
143
+ Optional second list of IDs for sequence pairs.
144
+ Returns:
145
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
146
+ """
147
+
148
+ if token_ids_1 is None:
149
+ #cls_token_id -> PretrainedTokenizerBase
150
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
151
+ cls = [self.cls_token_id]
152
+ sep = [self.sep_token_id]
153
+ return cls + token_ids_0 + sep + sep + token_ids_1 + sep
154
+
155
+
156
+ def get_special_tokens_mask(self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False) -> List[int]:
157
+
158
+ """
159
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
160
+ special tokens using the tokenizer `prepare_for_model` method.
161
+ Args:
162
+ token_ids_0 (`List[int]`):
163
+ List of IDs.
164
+ token_ids_1 (`List[int]`, *optional*):
165
+ Optional second list of IDs for sequence pairs.
166
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
167
+ Whether or not the token list is already formatted with special tokens for the model.
168
+ Returns:
169
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
170
+ """
171
+
172
+ if already_has_special_tokens:
173
+ return super().get_special_tokens_mask(
174
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
175
+ )
176
+
177
+ if token_ids_1 is None:
178
+ return [1] + ([0] * len(token_ids_0)) + [1]
179
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
180
+
181
+
182
+ def create_token_type_ids_from_sequences(
183
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
184
+ ) -> List[int]:
185
+ """
186
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. PhoBERT does not
187
+ make use of token type ids, therefore a list of zeros is returned.
188
+ Args:
189
+ token_ids_0 (`List[int]`):
190
+ List of IDs.
191
+ token_ids_1 (`List[int]`, *optional*):
192
+ Optional second list of IDs for sequence pairs.
193
+ Returns:
194
+ `List[int]`: List of zeros.
195
+ """
196
+
197
+ sep = [self.sep_token_id]
198
+ cls = [self.cls_token_id]
199
+
200
+ if token_ids_1 is None:
201
+ return len(cls + token_ids_0 + sep) * [0]
202
+ return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
203
+
204
+ @property
205
+ def vocab_size(self):
206
+ return len(self.encoder)
207
+
208
+ def get_vocab(self):
209
+ return dict(self.encoder, **self.added_tokens_encoder)
210
+
211
+
212
+
213
+ def bpe(self, token):
214
+ if token in self.cache:
215
+ return self.cache[token]
216
+ word = tuple(token)
217
+ word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
218
+ pairs = get_pairs(word)
219
+
220
+ if not pairs:
221
+ return token
222
+
223
+ while True:
224
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
225
+ if bigram not in self.bpe_ranks:
226
+ break
227
+ first, second = bigram
228
+ new_word = []
229
+ i = 0
230
+ while i < len(word):
231
+ try:
232
+ j = word.index(first, i)
233
+ except ValueError:
234
+ new_word.extend(word[i:])
235
+ break
236
+ else:
237
+ new_word.extend(word[i:j])
238
+ i = j
239
+
240
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
241
+ new_word.append(first + second)
242
+ i += 2
243
+ else:
244
+ new_word.append(word[i])
245
+ i += 1
246
+ new_word = tuple(new_word)
247
+ word = new_word
248
+ if len(word) == 1:
249
+ break
250
+ else:
251
+ pairs = get_pairs(word)
252
+ word = "@@ ".join(word)
253
+ word = word[:-4]
254
+ self.cache[token] = word
255
+ return word
256
+
257
+ def _tokenize(self, text):
258
+ """Tokenize a string."""
259
+ split_tokens = []
260
+
261
+ words = re.findall(r"\S+\n?", text)
262
+
263
+ for token in words:
264
+ split_tokens.extend([t for t in self.bpe(token).split(" ")])
265
+ return split_tokens
266
+
267
+ def _convert_token_to_id(self, token):
268
+ """Converts a token (str) in an id using the vocab."""
269
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
270
+
271
+ def _convert_id_to_token(self, index):
272
+ """Converts an index (integer) in a token (str) using the vocab."""
273
+ return self.decoder.get(index, self.unk_token)
274
+
275
+ def convert_tokens_to_string(self, tokens):
276
+ """Converts a sequence of tokens (string) in a single string."""
277
+ out_string = " ".join(tokens).replace("@@ ", "").strip()
278
+ return out_string
279
+
280
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
281
+ if not os.path.isdir(save_directory):
282
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
283
+ return
284
+
285
+ out_vocab_file = os.path.join(
286
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
287
+ )
288
+
289
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
290
+ copyfile(self.vocab_file, out_vocab_file)
291
+ elif not os.path.isfile(self.vocab_file):
292
+ with open(out_vocab_file, "w", encoding="utf-8") as fp:
293
+ for token, value in self.encoder.items():
294
+ if token not in self.all_special_tokens:
295
+ fp.write(f"{str(token)} 1\n")
296
+
297
+ out_merges_file = os.path.join(
298
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
299
+ )
300
+
301
+ if os.path.abspath(self.merges_file) != os.path.abspath(out_merges_file) and os.path.isfile(self.merges_file):
302
+ copyfile(self.merges_file, out_merges_file)
303
+ elif not os.path.isfile(self.merges_file):
304
+ index = 0
305
+ with open(out_merges_file, "w", encoding="utf-8") as writer:
306
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
307
+ if index != token_index:
308
+ logger.warning(
309
+ f"Saving vocabulary to {out_merges_file}: BPE merge indices are not consecutive."
310
+ " Please check that the tokenizer is not corrupted!"
311
+ )
312
+ index = token_index
313
+ writer.write(" ".join(bpe_tokens) + " 1\n")
314
+ index += 1
315
+
316
+ return (out_vocab_file, out_merges_file)
317
+
318
+ # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
319
+ # filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens))
320
+ # tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens)
321
+ # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far)
322
+ # return ''.join(tokens_generated_so_far)
323
+
324
+
325
+
326
+ def add_from_file(self, f):
327
+ """
328
+ Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
329
+ """
330
+ if isinstance(f, str):
331
+ try:
332
+ with open(f, "r", encoding="utf-8") as fd:
333
+ self.add_from_file(fd)
334
+ except FileNotFoundError as fnfe:
335
+ raise fnfe
336
+ except UnicodeError:
337
+ raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
338
+ return
339
+
340
+ lines = f.readlines()
341
+ for lineTmp in lines:
342
+ line = lineTmp.strip()
343
+ idx = line.rfind(" ")
344
+ if idx == -1:
345
+ raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
346
+ word = line[:idx]
347
+ self.encoder[word] = len(self.encoder)
348
+ for word in LATEX_VOC:
349
+ self.encoder[word] = len(self.encoder)
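
A hypothetical usage sketch for the custom tokenizer (not part of the commit); the vocab.txt and bpe.codes paths refer to the files added in this upload, and the example input is an assumption:

```python
from tokenizer import PhobertTokenizer

tokenizer = PhobertTokenizer(vocab_file="vocab.txt", merges_file="bpe.codes")

# add_from_file also appended every LATEX_VOC entry to the vocabulary, so LaTeX
# commands such as "\frac" have their own ids alongside the PhoBERT subwords.
encoded = tokenizer("Giải phương trình $\\frac{x}{2} = 1$")  # "Solve the equation ..."
print(encoded["input_ids"])
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))
```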
vocab.txt ADDED
The diff for this file is too large to render. See raw diff