File size: 1,331 Bytes
e35836c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#%%
# Load the pretrained Levanti Arabic diacritization model and its tokenizer.
# NOTE: from_pretrained downloads from the HuggingFace Hub on first run
# (network I/O); subsequent runs hit the local cache.
from transformers import RobertaForTokenClassification, AutoTokenizer
model = RobertaForTokenClassification.from_pretrained("guymorlan/levanti_arabic2diacritics")
tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_arabic2diacritics")

#%%
# Maps the model's per-character label indices (multi-label head, 5 classes)
# to the Unicode combining marks they represent.
label2diacritic = {0: 'ّ', # SHADDA (gemination mark)
                   1: 'َ', # FATHA
                   2: 'ِ', # KASRA
                   3: 'ُ', # DAMMA
                   4: 'ْ'} # SUKKUN


def arabic2diacritics(text, model, tokenizer):
    """Insert predicted diacritics after each character of *text*.

    Runs the token-classification model over *text* and, for every input
    character, appends the diacritics whose sigmoid score exceeds 0.5.
    Vowel marks (labels 1-4) are emitted first, shadda (label 0) last.

    Args:
        text: Undiacritized Arabic string. Assumes the tokenizer maps one
            token per character so predictions align with *text* — TODO
            confirm against the tokenizer's vocab.
        model: Token-classification model with a 5-way multi-label head.
        tokenizer: Matching tokenizer; must support return_tensors="pt".

    Returns:
        The input string with predicted diacritic marks interleaved.
    """
    encoded = tokenizer(text, return_tensors="pt")
    # Sigmoid + 0.5 threshold → independent per-label decisions.
    # [0] selects the single batch item; [1:-1] drops the BOS/EOS
    # special-token positions so rows line up with the raw characters.
    predictions = (model(**encoded).logits.sigmoid() > 0.5)[0][1:-1]
    pieces = []
    for row, char in zip(predictions, text):
        pieces.append(char)
        # Vowel/sukkun marks (labels 1-4) go directly after the letter.
        pieces.extend(label2diacritic[i] for i in range(1, 5) if row[i])
        # Shadda (label 0) is checked last so it follows any vowel mark.
        if row[0]:
            pieces.append(label2diacritic[0])
    return "".join(pieces)


def diacritize(text):
    """Diacritize *text* using the module-level model and tokenizer."""
    return arabic2diacritics(text, model, tokenizer)

def diacritize_if_not_already(text):
    """Diacritize *text* only if it contains no diacritic marks yet.

    If any known diacritic (shadda, fatha, kasra, damma, sukkun) already
    appears anywhere in *text* — including partially diacritized input —
    the string is returned unchanged; otherwise it is run through the model.

    Args:
        text: Arabic string, possibly already (partially) diacritized.

    Returns:
        *text* unchanged, or a newly diacritized copy of it.
    """
    # Build the set once and use a single C-level disjointness check
    # instead of re-scanning dict.values() for every character.
    diacritics = set(label2diacritic.values())
    if diacritics.isdisjoint(text):
        return arabic2diacritics(text, model, tokenizer)
    return text
#%%
# text = "بديش اروح عالمدرسة بكرا"
# arabic2diacritics(text, model, tokenizer)
# %%