|
|
|
from transformers import RobertaForTokenClassification, AutoTokenizer |
|
# Load the pretrained Levanti Arabic->diacritics token-classification model and
# its matching tokenizer from the Hugging Face Hub.
# NOTE(review): this runs at import time and triggers a network download on
# first use — confirm that is acceptable for this module's callers.
model = RobertaForTokenClassification.from_pretrained("guymorlan/levanti_arabic2diacritics")

tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_arabic2diacritics")
|
|
|
|
|
# Maps the model's output class indices to Arabic diacritic characters.
# Classes 1-4 are the vowel/sukun marks; class 0 is shadda, which
# arabic2diacritics appends after the other marks for each character.
label2diacritic = {0: 'ّ',  # shadda (U+0651)

1: 'َ',  # fatha (U+064E)

2: 'ِ',  # kasra (U+0650)

3: 'ُ',  # damma (U+064F)

4: 'ْ'}  # sukun (U+0652)
|
|
|
|
|
def arabic2diacritics(text, model, tokenizer):
    """Predict diacritics for *text* and interleave them into the string.

    The token-classification model's logits are passed through a sigmoid and
    thresholded at 0.5, giving an independent yes/no per diacritic class for
    each position. After each input character, any predicted vowel/sukun marks
    (classes 1-4) are emitted first, then shadda (class 0) if predicted.

    NOTE(review): assumes the tokenizer produces exactly one token per input
    character, plus one special token at each end (stripped via [1:-1]) —
    TODO confirm against the model's tokenizer.
    """
    encoded = tokenizer(text, return_tensors="pt")
    # Boolean mask of shape (chars, classes); special boundary tokens dropped.
    mask = (model(**encoded).logits.sigmoid() > 0.5)[0][1:-1]

    pieces = []
    for char, row in zip(text, mask):
        pieces.append(char)
        # Vowel marks first (classes 1-4) ...
        pieces.extend(label2diacritic[idx] for idx in range(1, 5) if row[idx])
        # ... then shadda (class 0), matching the original output ordering.
        if row[0]:
            pieces.append(label2diacritic[0])

    return "".join(pieces)
|
|
|
|
|
def diacritize(text):
    """Diacritize *text* with the module-level model and tokenizer."""
    return arabic2diacritics(text, model=model, tokenizer=tokenizer)
|
|
|
def diacritize_if_not_already(text):
    """Diacritize *text*, unless it already contains any known diacritic.

    Returns the input unchanged when at least one character is one of the
    diacritics in ``label2diacritic``; otherwise runs the model on it.
    """
    already_diacritized = any(ch in label2diacritic.values() for ch in text)
    if not already_diacritized:
        return arabic2diacritics(text, model, tokenizer)
    return text
|
|
|
|
|
|
|
|
|
|