Spaces:
Running
Running
#%% | |
from transformers import CanineForTokenClassification, AutoTokenizer | |
import re | |
import torch | |
# instantiate module logger | |
import logging | |
# Module logger; INFO and above are allowed through at the logger level.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# The classifier and its tokenizer are both loaded from the same checkpoint id.
_CHECKPOINT = "guymorlan/levanti_diacritics2translit"
model = CanineForTokenClassification.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
#%% | |
def diacritics2hebrew_vowels(text, model, tokenizer):
    """Replace characters of *text* with the labels a classifier predicts.

    The text is tokenized, run through *model* without gradient tracking, and
    the highest-scoring label is taken at every position. A position labelled
    ``"O"`` keeps its original character; any other position is replaced by
    the label string from ``model.config.id2label``.

    Args:
        text: Input string.
        model: Callable returning an object with ``.logits``; must expose
            ``config.label2id`` and ``config.id2label``.
        tokenizer: Callable accepting ``(text, return_tensors="pt")``.

    Returns:
        The converted string.
    """
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**encoded).logits

    # First sequence of the batch, minus the first and last positions
    # (CLS and SEP).
    # NOTE(review): the zip below assumes exactly one prediction per
    # character of `text`; confirm against the tokenizer in use.
    label_ids = logits.argmax(-1).tolist()[0][1:-1]

    config = model.config
    converted = "".join(
        char if label_id == config.label2id["O"] else config.id2label[label_id]
        for label_id, char in zip(label_ids, text)
    )
    logger.warning("Done converting to Hebrew vowels")
    return converted
#%% | |
# Character -> Latin transliteration table used by to_translit.
# Besides Arabic letters it carries Hebrew vowel points, because translit()
# feeds to_translit the output of diacritics2hebrew_vowels.
# Lone combining marks are written as escapes so they stay visible in editors.
arabic_to_english = {
    # Arabic letters
    "ا": "ā", "أ": "ʔ", "إ": "a", "ء": "a", "ئ": "a", "ؤ": "a",
    "آ": "aa", "ى": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j",
    "ح": "ḥ", "خ": "kh", "د": "d", "ذ": "dh", "ر": "r", "ز": "z",
    "س": "s", "ش": "š", "ص": "s", "ض": "d", "ط": "t", "ظ": "z",
    "ع": "ʕ", "غ": "gh", "ف": "f", "ق": "q", "ك": "k", "ل": "l",
    "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",
    # Arabic short-vowel marks
    "\u064E": "a",  # fatha
    "\u064F": "u",  # damma
    "\u0650": "i",  # kasra
    # Punctuation -- same set that arabic_to_hebrew maps, so Arabic
    # punctuation no longer leaks into the Latin output.
    "\u060C": ",",  # Arabic comma
    "\u061F": "?",  # Arabic question mark
    "\u061B": ";",  # Arabic semicolon
    "\u0640": "",   # tatweel (elongation stroke) is dropped
    # Hebrew vowel points
    "\u05B9": "o",  # holam
    "\u05B7": "a",  # patah
    "\u05B4": "i",  # hiriq
    "\u05B0": "",   # shva
    "\u05BB": "u",  # kubutz
    "\u05B5": "e",  # tsere
    # Sentinel value: to_translit upper-cases the preceding consonant
    # instead of emitting text for this mark.
    "\u0651": "SHADDA",  # shadda
}
# Character -> Hebrew-script table used by to_taatik.
# Lone combining marks are written as escapes so they stay visible in editors.
arabic_to_hebrew = {
    # Letters; sounds without a Hebrew counterpart get a trailing apostrophe.
    "ا": "א", "أ": "א", "إ": "א", "ء": "א", "ئ": "א", "ؤ": "א",
    "آ": "אא", "ى": "א", "ب": "בּ", "ت": "ת", "ث": "ת'", "ج": "ג'",
    "ح": "ח", "خ": "ח'", "د": "ד", "ذ": "ד'", "ر": "ר", "ز": "ז",
    "س": "ס", "ش": "ש", "ص": "צ", "ض": "צ'", "ط": "ט", "ظ": "ט'",
    "ع": "ע", "غ": "ע'", "ف": "פ", "ق": "ק", "ك": "כּ", "ل": "ל",
    "م": "מ", "ن": "נ", "ه": "ה", "و": "ו", "ي": "י", "ة": "ה",
    # Punctuation and short-vowel marks
    "\u060C": ",",       # Arabic comma
    "\u064E": "\u05B7",  # fatha -> patah
    "\u064F": "\u05BB",  # damma -> kubutz
    "\u0650": "\u05B4",  # kasra -> hiriq
    "\u061F": "?",       # Arabic question mark
    "\u061B": ";",       # Arabic semicolon
    "\u0640": "",        # tatweel is dropped
    # Shadda is rendered with U+0598 as a stand-in mark.
    "\u0651": "\u0598",
}
# Marks that to_translit steps over when it looks for the consonant that a
# shadda doubles.
vowels = [
    # NOTE(review): the two commas are not vowels (they look carried over
    # from the punctuation row of arabic_to_hebrew); kept so existing
    # behaviour is unchanged.
    "\u060C",  # Arabic comma
    ",",
    "\u064E",  # fatha
    "\u05B7",  # patah
    "\u064F",  # damma
    "\u05BB",  # kubutz
    "\u0650",  # kasra
    "\u05B4",  # hiriq
    "\u05B5",  # tsere
    # Previously missing: without these a shadda following holam upper-cased
    # the vowel itself, and one following shva was silently dropped.
    "\u05B9",  # holam
    "\u05B0",  # shva
]
# Arabic letters whose Hebrew rendering has a distinct word-final form.
final_letters = {
    "ن": "ן",
    "م": "ם",
    "ص": "ץ",
    "ض": "ץ'",
    "ف": "ף",
}
def reorder_hebrew_nikkud(input_string):
    """Move marks that follow an apostrophe back onto the base letter.

    Some letters are rendered as a Hebrew letter plus an apostrophe. A mark
    written after such a pair would attach to the apostrophe, so every
    ``letter ' marks`` sequence is rewritten as ``letter marks '``.

    The mark set includes U+0598, the stand-in that ``arabic_to_hebrew``
    substitutes for shadda, and a whole run of marks is moved at once (for
    example a vowel point followed by the shadda stand-in).

    Args:
        input_string: Text to rewrite.

    Returns:
        The text with the affected marks moved in front of the apostrophe.
    """
    marks = (
        "\u05B9"  # holam
        "\u05B7"  # patah
        "\u05B4"  # hiriq
        "\u05B0"  # shva
        "\u05BB"  # kubutz
        "\u05B5"  # tsere
        "\u0651"  # Arabic shadda
        "\u0598"  # shadda stand-in used in the Hebrew-script output
    )
    # group 1: any character of the Hebrew block, group 2: the apostrophe,
    # group 3: one or more marks to pull forward.
    pattern = r"([\u0590-\u05FF])(')([" + marks + "]+)"
    return re.sub(pattern, r"\1\3\2", input_string)
def reverse_holam_shadda_vav(input_string):
    """Rewrite every (holam, U+0598, vav) triple as (U+0598, vav, holam).

    U+0598 is the character used in place of shadda. The reordering is done
    for readability of the rendered text.

    Args:
        input_string: Text to rewrite.

    Returns:
        The text with each matching triple reordered.
    """
    # group 1: holam, group 2: shadda stand-in, group 3: vav
    triple = re.compile(r'(\u05B9)(\u0598)(\u05D5)')
    return triple.sub(r'\2\3\1', input_string)
# Punctuation after which a word is considered finished when choosing a
# final letter form (ASCII marks plus Arabic comma, semicolon, question mark).
_WORD_END_PUNCTUATION = frozenset(".,:;!?)]}\"\u060C\u061B\u061F")


def to_taatik(arabic):
    """Render *arabic* in Hebrew script.

    Each character is looked up in ``arabic_to_hebrew``; characters missing
    from the table are copied through unchanged. A letter listed in
    ``final_letters`` takes its final form when it is the last character of
    the input or is directly followed by whitespace or punctuation. The
    joined result is then passed through ``reverse_holam_shadda_vav`` and
    ``reorder_hebrew_nikkud``.

    Args:
        arabic: Input string.

    Returns:
        The Hebrew-script string.
    """
    last_index = len(arabic) - 1
    pieces = []
    for index, letter in enumerate(arabic):
        if letter in final_letters and (
            index == last_index
            or arabic[index + 1].isspace()
            or arabic[index + 1] in _WORD_END_PUNCTUATION
        ):
            pieces.append(final_letters[letter])
        else:
            pieces.append(arabic_to_hebrew.get(letter, letter))

    swapped = reverse_holam_shadda_vav("".join(pieces))
    reordered = reorder_hebrew_nikkud(swapped)
    logger.warning("Done converting to taatik")
    return reordered
def to_translit(arabic):
    """Transliterate *arabic* into Latin characters via ``arabic_to_english``.

    Characters missing from the table are copied through unchanged. A shadda
    (table value ``"SHADDA"``) emits nothing itself; it upper-cases the
    transliteration of the character it applies to. That is the previous
    character, or the one before it when the previous character is listed in
    ``vowels``.

    A shadda with nothing to apply to (first character of the input, or
    preceded only by a single vowel mark) is ignored rather than raising
    ``IndexError``.

    Args:
        arabic: Input string.

    Returns:
        The transliterated string.
    """
    pairs = []  # [source character, transliteration], in input order
    for letter in arabic:
        latin = arabic_to_english.get(letter, letter)
        if latin != "SHADDA":
            pairs.append([letter, latin])
            continue
        # Step back over a vowel mark so that the consonant is the one marked.
        steps_back = 2 if pairs and pairs[-1][0] in vowels else 1
        if len(pairs) >= steps_back:
            target = pairs[-steps_back]
            target[1] = target[1].upper()
    return "".join(latin for _, latin in pairs)
# %% | |
def taatik(text):
    """Convert *text* with the module-level model, then render it via to_taatik."""
    with_hebrew_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
    return to_taatik(with_hebrew_vowels)
def translit(text):
    """Convert *text* with the module-level model, then render it via to_translit."""
    with_hebrew_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
    return to_translit(with_hebrew_vowels)
# text = "لَازِم نِعْطِي رَشَّات وِقَائِيِّة لِلشَّجَر " | |
# heb_vowels = diacritics2hebrew_vowels(text, model, tokenizer) | |
# #%% | |
# to_taatik(heb_vowels) | |
# #%% | |
# to_translit(heb_vowels) | |
# # %% | |