#%%
from transformers import CanineForTokenClassification, AutoTokenizer
import re
import torch

# instantiate module logger
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

model = CanineForTokenClassification.from_pretrained("guymorlan/levanti_diacritics2translit")
tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_diacritics2translit")


#%%
def diacritics2hebrew_vowels(text, model, tokenizer):
    # Run the character-level token classifier: characters whose predicted label
    # is not "O" are replaced by that label (a Hebrew vowel sign); all other
    # characters are kept unchanged.
    tokens = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        pred = model(**tokens)
    pred = pred.logits.argmax(-1).tolist()
    pred = pred[0][1:-1]  # remove CLS and SEP
    output = []
    for p, c in zip(pred, text):
        if p != model.config.label2id["O"]:
            output.append(model.config.id2label[p])
        else:
            output.append(c)
    output = "".join(output)
    # print("Done converting to Hebrew vowels")
    logger.warning("Done converting to Hebrew vowels")
    return output


#%%
# NOTE: this first mapping (a simpler, digraph-based scheme) is immediately
# overridden by the redefinition that follows; only the second mapping is used.
arabic_to_english = {
    "ا": "ā", "أ": "ʔ", "إ": "a", "ء": "a", "ئ": "a", "ؤ": "a", "آ": "aa", "ى": "a",
    "ب": "b", "ت": "t", "ث": "th", "ج": "j", "ح": "ḥ", "خ": "kh", "د": "d",
    "ذ": "dh", "ر": "r", "ز": "z", "س": "s", "ش": "š", "ص": "s", "ض": "d",
    "ط": "t", "ظ": "z", "ع": "ʕ", "غ": "gh", "ف": "f", "ق": "q", "ك": "k",
    "ل": "l", "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",
    "َ": "a", "ُ": "u", "ِ": "i", "،": ",",
    "ֹ": "o",  # holam
    "ַ": "a",  # patah
    "ִ": "i",  # hiriq
    "ְ": "",   # shva
    "ֻ": "u",  # kubutz
    "ֵ": "e",  # tzere
    "ّ": "SHADDA",  # shadda
}

arabic_to_english = {
    "ا": "ā", "أ": "ʔ", "إ": "ʔ", "ء": "ʔ", "ئ": "ʔ", "ؤ": "ʔ", "آ": "ʔā", "ى": "ā",
    "ب": "b", "ت": "t", "ث": "θ", "ج": "ž", "ح": "ħ", "خ": "x", "د": "d",
    "ذ": "ð", "ر": "r", "ز": "z", "س": "s", "ش": "š", "ص": "ṣ", "ض": "ḍ",
    "ط": "ṭ", "ظ": "ð̣", "ع": "ʕ", "غ": "ɣ", "ف": "f", "ق": "q", "ك": "k",
    "ل": "l", "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",
    "َ": "a", "ُ": "u", "ِ": "i", "،": ",",
    "ֹ": "o",  # holam
    "ַ": "a",  # patah
    "ִ": "i",  # hiriq
    "ְ": "",   # shva
    "ֻ": "u",  # kubutz
    "ֵ": "e",  # tzere
    "ّ": "SHADDA",  # shadda
}

arabic_to_hebrew = {
    # regular letters
    "ا": "א", "أ": "א", "إ": "א", "ء": "א", "ئ": "א", "ؤ": "א", "آ": "אא", "ى": "א",
    "ب": "בּ", "ت": "ת", "ث": "ת'", "ج": "ג'", "ح": "ח", "خ": "ח'", "د": "ד",
    "ذ": "ד'", "ر": "ר", "ز": "ז", "س": "ס", "ش": "ש", "ص": "צ", "ض": "צ'",
    "ط": "ט", "ظ": "ט'", "ع": "ע", "غ": "ע'", "ف": "פ", "ق": "ק", "ك": "כּ",
    "ل": "ל", "م": "מ", "ن": "נ", "ه": "ה", "و": "ו", "ي": "י", "ة": "ה",
    # special characters
    "،": ",",
    "َ": "ַ",
    "ُ": "ֻ",
    "ِ": "ִ",
    "؟": "?",
    "؛": ";",
    "ـ": "",
    # shadda to \u0598
    "ّ": "\u0598",
}

vowels = ["،", ",", "َ", "ַ", "ُ", "ֻ", "ِ", "ִ", "ֵ"]

final_letters = {
    "ن": "ן",
    "م": "ם",
    "ص": "ץ",
    "ض": "ץ'",
    "ف": "ף",
}


def reorder_hebrew_nikkud(input_string):
    # For two-character letter transliterations (letter + apostrophe), move the
    # nikkud back onto the letter itself rather than leaving it on the apostrophe.
    nikkud_signs = {
        "ֹ": "o",  # holam
        "ַ": "a",  # patah
        "ִ": "i",  # hiriq
        "ְ": "",   # shva
        "ֻ": "u",  # kubutz
        "ֵ": "e",  # tsere
        "ّ": "SHADDA",  # shadda
    }
    # Hebrew letter, followed by an apostrophe, followed by one of the nikkud signs
    pattern = r'([\u0590-\u05FF])(\')([' + ''.join(nikkud_signs.keys()) + '])'
    replacement = r'\1\3\2'
    result = re.sub(pattern, replacement, input_string)
    return result


def reverse_holam_shadda_vav(input_string):
    # For better readability, replace (holam, shadda, ו) with (shadda, ו, holam);
    # instead of shadda we use the replacement character \u0598.
    pattern = r'(\u05B9)(\u0598)(\u05D5)'
    replacement = r'\2\3\1'
    result = re.sub(pattern, replacement, input_string)
    return result


def to_taatik(arabic):
    # Map each character to Hebrew script, using final letter forms at the end
    # of a word, then fix the ordering of the shadda and nikkud marks.
    taatik = []
    for index, letter in enumerate(arabic):
        if (
            (index == len(arabic) - 1 or arabic[index + 1] in {" ", ".", "،"})
            and letter in final_letters
        ):
            taatik.append(final_letters[letter])
        elif letter not in arabic_to_hebrew:
            taatik.append(letter)
        else:
            taatik.append(arabic_to_hebrew[letter])
    reversed_taatik = reverse_holam_shadda_vav("".join(taatik))
    reordered = reorder_hebrew_nikkud(reversed_taatik)
    # print("Done converting to taatik")
    logger.warning("Done converting to taatik")
    return reordered


def postprocess_arabic_transliteration(text):
    # Merge vowel-letter sequences (aā, iy, ow, uw, ay, aw) into long vowels,
    # then shorten a long vowel at the end of a word.
    text = re.sub(r'a(ā)(?!ā)', 'ā', text)
    text = re.sub(r'iy(?!y)', 'ī', text)
    text = re.sub(r'ow(?!w)', 'ō', text)
    text = re.sub(r'uw(?!w)', 'ū', text)
    text = re.sub(r'ay(?!y)', 'ē', text)
    text = re.sub(r'aw(?!w)', 'ō', text)
    text = re.sub(
        r'([āīēūō])(\W*$|\W+)',
        lambda m: m.group(1).translate(str.maketrans('āīēūō', 'aieuo')) + m.group(2),
        text,
    )
    return text


def to_translit(arabic):
    # Map each character to its Latin transliteration; a shadda doubles the
    # preceding consonant (skipping over an intervening vowel sign).
    translit = []
    for letter in arabic:
        if letter not in arabic_to_english:
            translit.append([letter, letter])
        else:
            if arabic_to_english[letter] == "SHADDA":
                if translit[-1][0] in vowels:
                    # translit[-2][1] = translit[-2][1].upper()
                    translit[-2][1] = translit[-2][1] + translit[-2][1]
                else:
                    # translit[-1][1] = translit[-1][1].upper()
                    translit[-1][1] = translit[-1][1] + translit[-1][1]
            else:
                translit.append([letter, arabic_to_english[letter]])
    return postprocess_arabic_transliteration("".join([x[1] for x in translit]))


# %%
def taatik(text):
    return to_taatik(diacritics2hebrew_vowels(text, model, tokenizer))


def translit(text):
    return to_translit(diacritics2hebrew_vowels(text, model, tokenizer))


# text = "لَازِم نِعْطِي رَشَّات وِقَائِيِّة لِلشَّجَر "
# heb_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
# #%%
# to_taatik(heb_vowels)
# #%%
# to_translit(heb_vowels)
# # %%
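

# %%
# Usage sketch: runs both entry points on the sample sentence from the
# commented-out test above (assumes the model and tokenizer loaded successfully).
if __name__ == "__main__":
    sample = "لَازِم نِعْطِي رَشَّات وِقَائِيِّة لِلشَّجَر"
    print(taatik(sample))    # Hebrew-script taatik with nikkud
    print(translit(sample))  # Latin transliteration with long-vowel postprocessing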