#%%
from transformers import RobertaForTokenClassification, AutoTokenizer
model = RobertaForTokenClassification.from_pretrained("guymorlan/levanti_arabic2diacritics")
tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_arabic2diacritics")
#%%
label2diacritic = {
    0: 'ّ',  # SHADDA
    1: 'َ',  # FATHA
    2: 'ِ',  # KASRA
    3: 'ُ',  # DAMMA
    4: 'ْ',  # SUKUN
}
def arabic2diacritics(text, model, tokenizer):
    tokens = tokenizer(text, return_tensors="pt")
    # Multi-label prediction: sigmoid per diacritic class, drop preds for BOS and EOS
    preds = (model(**tokens).logits.sigmoid() > 0.5)[0][1:-1]
    new_text = []
    # zip aligns one prediction row per input character (character-level tokenizer)
    for p, c in zip(preds, text):
        new_text.append(c)
        # append vowel diacritics (fatha, kasra, damma, sukun) first
        for i in range(1, 5):
            if p[i]:
                new_text.append(label2diacritic[i])
        # check shadda (index 0) last
        if p[0]:
            new_text.append(label2diacritic[0])
    return "".join(new_text)
def diacritize(text):
    return arabic2diacritics(text, model, tokenizer)


def diacritize_if_not_already(text):
    # return the text unchanged if it already contains any diacritic mark
    if any(c in label2diacritic.values() for c in text):
        return text
    else:
        return arabic2diacritics(text, model, tokenizer)
#%%
# text = "بديش اروح عالمدرسة بكرا"
# arabic2diacritics(text, model, tokenizer)
# %%
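# Usage sketch, assuming the cells above have been run so `model`, `tokenizer`,
# and the helper functions are defined. The sample sentence is the Levantine
# Arabic example from the commented cell above.
sample = "بديش اروح عالمدرسة بكرا"
print(diacritize(sample))                 # always runs the model
print(diacritize_if_not_already(sample))  # returns input unchanged if already diacritized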