billingsmoore
committed on
Commit
•
85ab26c
1
Parent(s):
00ba240
Update README.md
Browse files
README.md
CHANGED
@@ -67,6 +67,61 @@ However, the model expects, and performs best on, Tibetan text that is translite
|
|
67 |
transliterations that pyewts implements. A Python module to convert Wylie to THL is currently under development. In the meantime, if you are familiar with Perl,
|
68 |
[Roger Espel Llima has released a Perl module for this purpose here.](https://www.thlib.org/reference/transliteration/phconverter.php)
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
### Downstream Use
|
71 |
|
72 |
The model can be further finetuned using the following code:
|
|
|
67 |
transliterations that pyewts implements. A Python module to convert Wylie to THL is currently under development. In the meantime, if you are familiar with Perl,
|
68 |
[Roger Espel Llima has released a Perl module for this purpose here.](https://www.thlib.org/reference/transliteration/phconverter.php)
|
69 |
|
70 |
+
For now, the following script does a reasonable job of converting the Wylie transliteration output by pyewts into something closer to a phonetic rendering.
|
71 |
+
|
72 |
+
```python
|
73 |
+
|
74 |
+
import re

# Each entry is [wylie_clusters, phonetic]: every cluster in the first list is
# rewritten to the phonetic string in the second position.
#
# NOTE: the rules are applied one after another with str.replace, in exactly
# this order, so an earlier rule can consume text that a later (often longer)
# cluster would otherwise have matched. Reordering the table changes the output.
replacements = [
    [['lth', 'rh', 'db'], ' '],
    [['rb', 'sb', 'sbr', 'lb', '’b', "'b"], 'b'],
    [['c', 'cw', 'gc', 'bc', 'lc', 'py', 'lpy', 'spy', 'dpy', 'mch', '’ch', "'ch", 'phy', '’phy', "'phy"], 'ch'],
    [['rd', 'sd', 'gd', 'bd', 'brd', 'bsd', 'zl', 'bzl', 'ld', 'md', '’d', "'d", 'dw'], 'd'],
    [['rgr', 'lgr', 'sgr', 'dgr', 'dbr', 'bsgr', 'rbr', 'lbr', 'sbr', 'mgr', '’gr', "'gr", '’dr', "'dr", '’br', "'br", 'gr', 'br', 'grw'], 'dr'],
    [['rdz', 'gdz', 'brdz', 'mdz', '’dz', "'dz"], 'dz'],
    [['rg', 'lg', 'sg', 'dg', 'bg', 'brg', 'bsg', 'lg', 'mg', '’g', "'g", 'gw'], 'g'],
    [['rgy', 'lgy', 'sgy', 'dgy', 'bgy', 'brgy', 'bsgy', 'mgy', '’gy', "'gy"], 'gy'],
    [['hw'], 'h'],
    [['rby', 'lby', 'sby', 'rj', 'gj', 'brj', 'lj', 'mj', '’j', "'j", '’by', "'by", 'by'], 'j'],
    [['rk', 'lk', 'sk', 'kw', 'dk', 'bk', 'brk', 'bsk'], 'k'],
    [['khw', 'mkh', '’kh', "'kh"], 'kh'],
    [['mkhy', '’khy', "'khy"], 'khy'],
    [['rky', 'lky', 'sky', 'dky', 'bky', 'brky', 'bsky'], 'ky'],
    [['kl', 'gl', 'bl', 'rl', 'sl', 'brl', 'bsl', 'lw'], 'l'],
    [['rm', 'sm', 'dm', 'smr', 'mr'], 'm'],
    [['rn', 'sn', 'gn', 'brn', 'bsn', 'mn'], 'n'],
    [['rng', 'lng', 'sng', 'dng', 'brng', 'bsng', 'mng'], 'ng'],
    [['rny', 'sny', 'gny', 'brny', 'bsny', 'mny', 'nyw', 'rmy', 'smy', 'my'], 'ny'],
    [['sp', 'dp', 'lp', 'ph', '’ph', "'ph"], 'p'],
    [['rw'], 'r'],
    [['sr', 'sw', 'gs', 'bs', 'bsr'], 's'],
    [['shw', 'gsh', 'bsh'], 'sh'],
    # '’th' and "'th" are separate entries; without the comma between them
    # Python would silently concatenate the two literals into one string.
    [['rt', 'lt', 'st', 'tw', 'gt', 'bt', 'brt', 'blt', 'bst', 'bld', 'th', 'mth', '’th', "'th"], 't'],
    [['kr', 'rkr', 'lkr', 'skr', 'pr', 'lpr', 'spr', 'dkr', 'dpr', 'bkr', 'bskr', 'bsr', 'khr', 'thr', 'phr', 'mkhr', '’khr', '’phr'], 'tr'],
    [['rts', 'sts', 'rtsw', 'stsw', 'gts', 'bts', 'brts', 'bsts', 'tsh', 'tshw', 'mtsh', '’tsh', "'tsh"], 'ts'],
    [['db', 'b'], 'w'],
    [['g.y', 'dby'], 'y'],
    [['zw', 'gz', 'bz'], 'z'],
    [['zh', 'zhw', 'gzh', 'bzh'], 'zh'],
]

# Matches every character that is neither an ASCII letter nor whitespace.
_NON_ALPHA = re.compile(r'[^a-zA-Z\s]')


def wylie_to_phonetic(wylie):
    """Convert Wylie-transliterated lines into a rough phonetic rendering.

    Args:
        wylie: Iterable of strings, one transliterated line per item.

    Returns:
        A list with one converted string per non-empty input line. Empty
        strings are skipped, so the result can be shorter than the input.
    """
    phonetic = []
    for line in wylie:
        if line == '':
            continue

        # Perform the basic cluster replacements, in table order.
        result = line
        for clusters, sound in replacements:
            for cluster in clusters:
                result = result.replace(cluster, sound)

        # Remove non-alphabetical characters (apostrophes, punctuation, digits).
        result = _NON_ALPHA.sub('', result)
        phonetic.append(result)
    return phonetic
|
120 |
+
|
121 |
+
# `wylie` is not defined in this snippet; it must be an iterable of Wylie strings (presumably pyewts output — confirm).
phonetic = wylie_to_phonetic(wylie)
|
122 |
+
|
123 |
+
```
|
124 |
+
|
125 |
### Downstream Use
|
126 |
|
127 |
The model can be further finetuned using the following code:
|