|
|
|
from transformers import CanineForTokenClassification, AutoTokenizer |
|
import re |
|
import torch |
|
|
|
|
|
import logging |
|
logger = logging.getLogger(__name__)

logger.setLevel(logging.INFO)


# Character-level CANINE model + tokenizer loaded once at import time.
# Presumably labels each input character with a Hebrew-vowel (nikkud) class
# or "O" (no change) — see diacritics2hebrew_vowels; TODO confirm label set.
model = CanineForTokenClassification.from_pretrained("guymorlan/levanti_diacritics2translit")

tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_diacritics2translit")
|
|
|
|
|
|
|
def diacritics2hebrew_vowels(text, model, tokenizer):
    """Replace Arabic diacritics in *text* with Hebrew vowel (nikkud) signs.

    Runs a character-level token-classification model over the text; every
    character whose predicted label is not "O" is replaced by that label
    (a Hebrew vowel sign), all other characters are kept as-is.

    Args:
        text: Input Arabic string (with diacritics).
        model: Token-classification model whose labels are Hebrew vowels.
        tokenizer: Character-level tokenizer matching *model*.

    Returns:
        The converted string.
    """
    tokens = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        pred = model(**tokens)
    pred = pred.logits.argmax(-1).tolist()

    # Drop the special-token predictions (first/last positions) so the
    # remaining predictions align one-to-one with the input characters.
    pred = pred[0][1:-1]

    # Hoist the "no replacement" label id out of the loop.
    keep_id = model.config.label2id["O"]
    output = []
    for p, c in zip(pred, text):
        if p != keep_id:
            output.append(model.config.id2label[p])
        else:
            output.append(c)
    output = "".join(output)

    # Routine progress message: INFO is the correct level (was WARNING).
    logger.info("Done converting to Hebrew vowels")
    return output
|
|
|
|
|
|
|
|
|
# Transliteration table to Latin characters.  Keys mix Arabic letters and
# harakat with the Hebrew nikkud emitted by diacritics2hebrew_vowels, since
# to_translit runs on that mixed output.  The "SHADDA" value is a sentinel:
# to_translit consumes it by uppercasing the preceding consonant instead of
# emitting any text.
arabic_to_english = {

    "ا": "a", "أ": "a", "إ": "a", "ء": "a", "ئ": "a", "ؤ": "a",

    "آ": "aa", "ى": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j",

    "ح": "h", "خ": "kh", "د": "d", "ذ": "dh", "ر": "r", "ز": "z",

    "س": "s", "ش": "sh", "ص": "s", "ض": "d", "ط": "t", "ظ": "z",

    "ع": "a", "غ": "gh", "ف": "f", "ق": "q", "ك": "k", "ل": "l",

    "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",

    # Arabic short-vowel diacritics: fatha, damma, kasra.
    "َ": "a", "ُ": "u", "ِ": "i",

    # Arabic comma.
    "،": ",",

    # Hebrew nikkud produced by the diacritics model.
    "ֹ": "o",

    "ַ": "a",

    "ִ": "i",

    "ְ": "",

    "ֻ": "u",

    'ֵ': "e",

    # Gemination mark — sentinel handled specially in to_translit.
    "ّ": "SHADDA"

}
|
|
|
# Arabic → Hebrew taatik letter table.  Letters with no Hebrew equivalent use
# a base letter plus geresh (') — e.g. ج → ג'; reorder_hebrew_nikkud later
# fixes nikkud that lands after the geresh.
arabic_to_hebrew = {


    "ا": "א", "أ": "א", "إ": "א", "ء": "א", "ئ": "א", "ؤ": "א",

    "آ": "אא", "ى": "א", "ب": "בּ", "ت": "ת", "ث": "ת'", "ج": "ג'",

    "ح": "ח", "خ": "ח'", "د": "ד", "ذ": "ד'", "ر": "ר", "ز": "ז",

    "س": "ס", "ش": "ש", "ص": "צ", "ض": "צ'", "ط": "ט", "ظ": "ט'",

    "ع": "ע", "غ": "ע'", "ف": "פ", "ق": "ק", "ك": "כּ", "ل": "ל",

    "م": "מ", "ن": "נ", "ه": "ה", "و": "ו", "ي": "י", "ة": "ה",


    # Punctuation and Arabic short vowels mapped to Hebrew nikkud.
    "،": ",", "َ": "ַ", "ُ": "ֻ", "ِ": "ִ",

    "؟": "?", "؛": ";", "ـ": "",


    # Shadda → U+0598 (a Hebrew cantillation mark repurposed as the
    # gemination marker in taatik output).
    "ّ": "\u0598",

}
|
|
|
# Vowel-like characters (Arabic harakat, Hebrew nikkud, commas) used by
# to_translit to decide which character a shadda actually geminates.
vowels = ["،", ",", "َ", "ַ", "ُ", "ֻ", "ِ", "ִ", 'ֵ']


# Arabic letters that take a distinct Hebrew final (sofit) form when they end
# a word — see the word-final check in to_taatik.
final_letters = {

    "ن": "ן", "م": "ם", "ص": "ץ", "ض": "ץ'", "ف": "ף",

}
|
|
|
def reorder_hebrew_nikkud(input_string):
    """Move a nikkud sign that lands after a geresh (') back onto its letter.

    Arabic letters transliterated with a geresh (e.g. ج → ג') can end up as
    letter + "'" + vowel; for correct Hebrew rendering the vowel must attach
    directly to the letter: letter + vowel + "'".

    Args:
        input_string: Hebrew taatik string, possibly with misordered signs.

    Returns:
        The string with every (letter, geresh, sign) triple reordered to
        (letter, sign, geresh).
    """
    # Signs to reorder: holam, patah, hiriq, sheva, qubuts, tsere, and the
    # Arabic shadda (U+0651) kept for parity with the rest of the pipeline.
    # (The original built this set from a dict whose values were never used.)
    signs = "\u05B9\u05B7\u05B4\u05B0\u05BB\u05B5\u0651"

    pattern = r"([\u0590-\u05FF])(\')([" + signs + "])"
    # \1\3\2 swaps the geresh and the sign, keeping the letter first.
    return re.sub(pattern, r"\1\3\2", input_string)
|
|
|
def reverse_holam_shadda_vav(input_string):
    """Rotate every holam + shadda-mark + vav triple so the holam comes last.

    Rewrites U+05B9 U+0598 U+05D5 as U+0598 U+05D5 U+05B9 so the shadda
    marker and vav precede the holam for correct rendering.

    Args:
        input_string: Hebrew taatik string.

    Returns:
        The string with each matching triple rotated.
    """
    # The sequence is a fixed literal, so a plain substring replacement is
    # exactly equivalent to the regex substitution.
    misordered = "\u05B9\u0598\u05D5"   # holam, shadda mark, vav
    corrected = "\u0598\u05D5\u05B9"    # shadda mark, vav, holam
    return input_string.replace(misordered, corrected)
|
|
|
def to_taatik(arabic):
    """Transliterate an Arabic string (with Hebrew vowels) into Hebrew taatik.

    Letters map through ``arabic_to_hebrew``; a letter that ends a word and
    has a distinct Hebrew final form maps through ``final_letters`` instead.
    Characters outside both tables (already-converted nikkud, spaces, Latin
    punctuation) pass through unchanged.  Two post-passes then fix character
    ordering for rendering.

    Args:
        arabic: Input string, typically output of diacritics2hebrew_vowels.

    Returns:
        The Hebrew taatik string.
    """
    taatik = []
    for index, letter in enumerate(arabic):
        # Word-final = last character, or followed by space/period/Arabic
        # comma; such letters take the Hebrew sofit form.
        if (
            (index == len(arabic) - 1 or arabic[index + 1] in {" ", ".", "،"})
            and letter in final_letters
        ):
            taatik.append(final_letters[letter])
        elif letter not in arabic_to_hebrew:
            taatik.append(letter)
        else:
            taatik.append(arabic_to_hebrew[letter])
    # NOTE: this local was previously named `reversed`, shadowing the builtin.
    swapped = reverse_holam_shadda_vav("".join(taatik))
    reordered = reorder_hebrew_nikkud(swapped)

    # Routine progress message: INFO is the correct level (was WARNING).
    logger.info("Done converting to taatik")
    return reordered
|
|
|
|
|
|
|
|
|
def to_translit(arabic):
    """Transliterate Arabic text (with Hebrew vowels) to Latin characters.

    Each character maps through ``arabic_to_english``; unmapped characters
    pass through unchanged.  A shadda (gemination mark) uppercases the
    transliteration of the consonant it follows, looking one position back
    past an intervening vowel sign.

    Args:
        arabic: Input string, typically output of diacritics2hebrew_vowels.

    Returns:
        The transliterated string.
    """
    # Each entry is a [original_char, transliteration] pair; the original
    # char is kept so the shadda logic can test membership in `vowels`.
    translit = []
    for letter in arabic:
        if letter not in arabic_to_english:
            translit.append([letter, letter])
        elif arabic_to_english[letter] == "SHADDA":
            if not translit:
                # Malformed input: shadda with nothing before it previously
                # raised IndexError; ignore it instead.
                continue
            if translit[-1][0] in vowels and len(translit) >= 2:
                # Shadda written after the vowel — geminate the consonant
                # that precedes the vowel.
                translit[-2][1] = translit[-2][1].upper()
            else:
                translit[-1][1] = translit[-1][1].upper()
        else:
            translit.append([letter, arabic_to_english[letter]])

    return "".join(pair[1] for pair in translit)
|
|
|
|
|
|
|
|
|
def taatik(text):
    """Full pipeline: Arabic text → Hebrew vowels (model) → Hebrew taatik."""
    return to_taatik(diacritics2hebrew_vowels(text, model, tokenizer))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|