Spaces:
Running
Running
Guy Mor-Lan
committed on
Commit
•
46f657a
1
Parent(s):
e0e4f9c
add models
Browse files- .gitattributes +2 -0
- app.py +116 -0
- ar_he/README.md +32 -0
- ar_he/config.json +61 -0
- ar_he/generation_config.json +16 -0
- ar_he/model.safetensors +3 -0
- ar_he/source.spm +0 -0
- ar_he/special_tokens_map.json +5 -0
- ar_he/target.spm +0 -0
- ar_he/tokenizer_config.json +39 -0
- ar_he/vocab.json +0 -0
- ar_he_ct2/config.json +10 -0
- ar_he_ct2/model.bin +3 -0
- ar_he_ct2/shared_vocabulary.json +0 -0
- colorize.py +109 -0
- consts.py +114 -0
- diacritize.py +42 -0
- embedding_data.csv +3 -0
- embeddings.npy +3 -0
- he_ar/README.md +35 -0
- he_ar/config.json +61 -0
- he_ar/generation_config.json +16 -0
- he_ar/model.safetensors +3 -0
- he_ar/source.spm +0 -0
- he_ar/special_tokens_map.json +5 -0
- he_ar/target.spm +0 -0
- he_ar/tokenizer_config.json +39 -0
- he_ar/vocab.json +0 -0
- he_ar_ct2/config.json +10 -0
- he_ar_ct2/model.bin +3 -0
- he_ar_ct2/shared_vocabulary.json +0 -0
- requirements.txt +10 -0
- semsearch.py +78 -0
- translate.py +101 -0
- translit.py +158 -0
- tts.py +82 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
embedding_data.csv filter=lfs diff=lfs merge=lfs -text
|
37 |
+
embeddings.npy filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#%%
"""Gradio front-end for the Levanti colloquial-Arabic <-> Hebrew translation demo.

Wires together translation, diacritization, transliteration, semantic search
and TTS helpers into a three-tab Blocks UI (translate / diacritize / about).
UI strings are in Hebrew; the layout is right-to-left.
"""
import gradio as gr
from dotenv import load_dotenv

from translate import run_translate
from diacritize import diacritize, diacritize_if_not_already
from translit import taatik
from semsearch import update_df
from tts import get_audio
from consts import CSS, ABOUT, JS_FUNC

# Pull API keys (e.g. for Azure TTS) into the environment before any handler runs.
load_dotenv()

with gr.Blocks(title = "Levanti - כלי תרגום לערבית מדוברת",
               css=CSS,
               theme="default") as demo:
    # gr.HTML("<h2><span style='color: #2563eb'>Levantine Arabic</span> Translator</h2>")
    gr.HTML("<h2><span dir='rtl'><span style='color: #2563eb'>Levanti</span>ne Translator</span></h2>כלי תרגום לערבית מדוברת")

    # --- Tab 1: translation (Hebrew <-> colloquial Arabic) ---
    with gr.Tab('תרגום', elem_id="tab1"):
        with gr.Row():
            with gr.Column():
                # Free-text input; accepts either Hebrew or Arabic.
                input_text = gr.Textbox(label="קלט",
                                        info = "עברית או ערבית מדוברת",
                                        placeholder="הזינו טקסט בערבית או עברית",
                                        lines=2,
                                        elem_id="input",
                                        rtl=True)

                gr.Examples(["רציתי ללכת אתמול לחנות, אבל ירד גשם", "خلينا ندور على مطعم تاني"],
                            input_text,
                            label="דוגמאות")

                btn = gr.Button("תרגום")
                with gr.Row():
                    # Target dialect selector; only affects Hebrew->Arabic direction.
                    dialect = gr.Radio(["פלסטיני", "סורי", "לבנוני", "מצרי"],
                                       label = "להג",
                                       info="משפיע על תרגום לערבית",
                                       value="פלסטיני")

                # gr.Markdown("Built by [Guy Mor-Lan](mailto:[email protected]). Pronunciation model is specifically tailored to urban Palestinian Arabic. Text-to-speech uses Microsoft Azure's API and may provide different result from the transliterated pronunciation.")
                gr.Markdown("נוצר על ידי [גיא מור-לן](mailto:[email protected]) כחלק מפרויקט [Levanti](https://huggingface.co/datasets/guymorlan/levanti). השמע מופק באמצעות Azure TTS על בסיס הניקוד המנובא ויוריסטיקות.", elem_id="footer", rtl=True)

            with gr.Column():
                with gr.Group(elem_id="grp"):
                    gr.HTML("<div dir='rtl'>תרגום</div>")
                    # gr.Markdown("תרגום", elem_id="diacritized")
                    # Rendered (possibly color-coded) translation output.
                    translation_output = gr.HTML("<br>", visible=True, label="תרגום", elem_id="main")

                # Hidden plain-text copy of the Arabic result; downstream
                # handlers (diacritization, semantic search) listen to it.
                hidden_arabic = gr.Textbox(lines=1, elem_id="trans", visible=False)

                diacritized_output = gr.Textbox(label="ניקוד (ניסיוני)", lines=1, elem_id="diacritized",
                                                rtl=True, interactive=False)
                taatik_output = gr.Textbox(label="תעתיק (ניסיוני)", lines=1, elem_id="taatik",
                                           text_align="right", rtl=True, interactive=False)
                # diacritized_output = gr.HTML("<br>", label="ניקוד")
                # taatik_output = gr.HTML("<br>", label="תעתיק")

                # TTS is generated on demand from the diacritized text.
                audio = gr.Audio(label="שמע (Azure)", interactive=False,
                                 autoplay=True)
                audio_button = gr.Button("צור שמע")
                audio_button.click(get_audio, inputs=[diacritized_output], outputs=[audio])


        # Translate on button click; the JS snippet scrolls the result into view.
        btn.click(run_translate, inputs=[input_text, dialect],
                  outputs=[translation_output, hidden_arabic], api_name="en2ar",
                  js="function jump(x, y){document.getElementById('main').scrollIntoView(); return [x, y];}")

        # Enter key in the textbox triggers the same translation.
        input_text.submit(run_translate, inputs=[input_text, dialect],
                          outputs=[translation_output, hidden_arabic], scroll_to_output=True)
        # Chain: translation -> diacritization -> transliteration.
        hidden_arabic.change(diacritize, inputs=[hidden_arabic], outputs=[diacritized_output])
        diacritized_output.change(taatik, inputs=[diacritized_output], outputs=[taatik_output])
        # pal.change(get_transliteration, inputs=[pal, include], outputs=[pal_translit]);
        # include.change(toggle_visibility, inputs=[include], outputs=[pal_translit, sy, lb, eg])
        with gr.Row():
            # import pandas as pd
            # ex_df = pd.DataFrame({"text": ["שלום", "כיצד ניתן לעזור לך?", "איפה נמצא המסעדה הכי טובה בעיר?"]})
            # ex_df = ex_df.style.set_properties(**{
            #     'font-family': 'Arial, sans-serif',
            #     'text-align': 'right'
            # })
            # bla_df = gr.DataFrame(ex_df, visible=True, elem_id="nearest", wrap =True)
            # Nearest-neighbor results from the Levanti corpus (semantic search).
            nearest_df = gr.DataFrame(headers=["ערבית", "עברית", "מאומת"], visible=False, wrap=True,
                                      elem_id="nearest", label="תוצאות קרובות מתוך קורפוס Levanti", height=300)

        hidden_arabic.change(update_df, inputs=[hidden_arabic], outputs=[nearest_df])

    # --- Tab 2: standalone diacritization / transliteration ---
    with gr.Tab("ניקוד ותעתוק", elem_id="tab2"):
        with gr.Row():
            with gr.Column():
                diac_text = gr.Textbox(label="קלט", placeholder="הזינו טקסט בערבית", lines=1,
                                       info = "בשביל תעתוק בלבד, הזינו טקסט ערבי מנוקד",
                                       elem_id="diac_input", rtl=True)
                gr.Examples(["خلينا ندور على مطعم تاني", "قَدِيْش حَقّ الْبَنْدُورَة؟"], diac_text,
                            label="דוגמאות", elem_id="diac_ex")
                btn2 = gr.Button("שליחה")

            with gr.Column():
                diacritized_output2 = gr.Textbox(label="ניקוד", lines=1,
                                                 elem_id="diacritized2", rtl=True)
                taatik_output2 = gr.Textbox(label="תעתיק", lines=1,
                                            elem_id="taatik2", rtl=True)

        # input_text.submit(run_translate, inputs=[input_text, dialect],
        #                   outputs=[translation_output], scroll_to_output=True)
        # hidden_arabic.change(diacritize, inputs=[hidden_arabic], outputs=[diacritized_output])
        # diacritized_output.change(taatik, inputs=[diacritized_output], outputs=[taatik_output])
        # Input that already carries diacritics is passed through unchanged.
        btn2.click(diacritize_if_not_already, inputs=[diac_text], outputs=[diacritized_output2])
        diac_text.submit(diacritize_if_not_already, inputs=[diac_text], outputs=[diacritized_output2])
        diacritized_output2.change(taatik, inputs=[diacritized_output2], outputs=[taatik_output2])

    # --- Tab 3: about ---
    with gr.Tab("אודות", elem_id="tab3"):
        with gr.Row():
            gr.HTML("<h2>אודות</h2>")
            gr.Markdown(ABOUT, elem_id="about", rtl=True)


# NOTE(review): indentation/nesting above was reconstructed from a scraped diff;
# the context-manager nesting looks consistent but should be confirmed against
# the original file.
demo.launch(ssl_verify=False)
|
ar_he/README.md
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: cc-by-nc-4.0
|
3 |
+
datasets:
|
4 |
+
- guymorlan/levanti
|
5 |
+
language:
|
6 |
+
- ar
|
7 |
+
- he
|
8 |
+
pipeline_tag: translation
|
9 |
+
widget:
|
10 |
+
- text: بدي أروح ع الدكان بكرا
|
11 |
+
---
|
12 |
+
|
13 |
+
# Levanti (colloquial Levantine Arabic -> Hebrew) translator
|
14 |
+
|
15 |
+
Trained on the [Levanti](https://huggingface.co/datasets/guymorlan/levanti) dataset by fine-tuning [Helsinki-NLP/opus-mt-ar-he](https://huggingface.co/Helsinki-NLP/opus-mt-ar-he) for 8 epochs.
|
16 |
+
The model supports Palestinian, Jordanian, Syrian, Lebanese and Egyptian dialects.
|
17 |
+
|
18 |
+
|
19 |
+
# Example usage
|
20 |
+
|
21 |
+
```python
|
22 |
+
from transformers import pipeline
|
23 |
+
trans = pipeline("translation", "guymorlan/levanti_translate_ar_he")
|
24 |
+
trans("بدي أروح ع الدكان بكرا")
|
25 |
+
```
|
26 |
+
```
|
27 |
+
Out[1]: [{'translation_text': 'אני רוצה ללכת לחנות מחר'}]
|
28 |
+
```
|
29 |
+
|
30 |
+
# Attribution
|
31 |
+
Created by Guy Mor-Lan.<br>
|
32 |
+
Contact: guy.mor AT mail.huji.ac.il
|
ar_he/config.json
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "Helsinki-NLP/opus-mt-ar-he",
|
3 |
+
"activation_dropout": 0.0,
|
4 |
+
"activation_function": "swish",
|
5 |
+
"add_bias_logits": false,
|
6 |
+
"add_final_layer_norm": false,
|
7 |
+
"architectures": [
|
8 |
+
"MarianMTModel"
|
9 |
+
],
|
10 |
+
"attention_dropout": 0.0,
|
11 |
+
"bad_words_ids": [
|
12 |
+
[
|
13 |
+
63333
|
14 |
+
]
|
15 |
+
],
|
16 |
+
"bos_token_id": 0,
|
17 |
+
"classif_dropout": 0.0,
|
18 |
+
"classifier_dropout": 0.0,
|
19 |
+
"d_model": 512,
|
20 |
+
"decoder_attention_heads": 8,
|
21 |
+
"decoder_ffn_dim": 2048,
|
22 |
+
"decoder_layerdrop": 0.0,
|
23 |
+
"decoder_layers": 6,
|
24 |
+
"decoder_start_token_id": 63333,
|
25 |
+
"decoder_vocab_size": 63334,
|
26 |
+
"dropout": 0.1,
|
27 |
+
"encoder_attention_heads": 8,
|
28 |
+
"encoder_ffn_dim": 2048,
|
29 |
+
"encoder_layerdrop": 0.0,
|
30 |
+
"encoder_layers": 6,
|
31 |
+
"eos_token_id": 0,
|
32 |
+
"extra_pos_embeddings": 63334,
|
33 |
+
"forced_eos_token_id": 0,
|
34 |
+
"id2label": {
|
35 |
+
"0": "LABEL_0",
|
36 |
+
"1": "LABEL_1",
|
37 |
+
"2": "LABEL_2"
|
38 |
+
},
|
39 |
+
"init_std": 0.02,
|
40 |
+
"is_encoder_decoder": true,
|
41 |
+
"label2id": {
|
42 |
+
"LABEL_0": 0,
|
43 |
+
"LABEL_1": 1,
|
44 |
+
"LABEL_2": 2
|
45 |
+
},
|
46 |
+
"max_length": 512,
|
47 |
+
"max_position_embeddings": 512,
|
48 |
+
"model_type": "marian",
|
49 |
+
"normalize_before": false,
|
50 |
+
"normalize_embedding": false,
|
51 |
+
"num_beams": 4,
|
52 |
+
"num_hidden_layers": 6,
|
53 |
+
"pad_token_id": 63333,
|
54 |
+
"scale_embedding": true,
|
55 |
+
"share_encoder_decoder_embeddings": true,
|
56 |
+
"static_position_embeddings": true,
|
57 |
+
"torch_dtype": "float32",
|
58 |
+
"transformers_version": "4.38.1",
|
59 |
+
"use_cache": true,
|
60 |
+
"vocab_size": 63334
|
61 |
+
}
|
ar_he/generation_config.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bad_words_ids": [
|
3 |
+
[
|
4 |
+
63333
|
5 |
+
]
|
6 |
+
],
|
7 |
+
"bos_token_id": 0,
|
8 |
+
"decoder_start_token_id": 63333,
|
9 |
+
"eos_token_id": 0,
|
10 |
+
"forced_eos_token_id": 0,
|
11 |
+
"max_length": 512,
|
12 |
+
"num_beams": 4,
|
13 |
+
"pad_token_id": 63333,
|
14 |
+
"renormalize_logits": true,
|
15 |
+
"transformers_version": "4.38.1"
|
16 |
+
}
|
ar_he/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:50672992291e20bc714b766615195a72d09b3950362adc5a0097b6bbfc5b630d
|
3 |
+
size 306544408
|
ar_he/source.spm
ADDED
Binary file (899 kB). View file
|
|
ar_he/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eos_token": "</s>",
|
3 |
+
"pad_token": "<pad>",
|
4 |
+
"unk_token": "<unk>"
|
5 |
+
}
|
ar_he/target.spm
ADDED
Binary file (896 kB). View file
|
|
ar_he/tokenizer_config.json
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "</s>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "<unk>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"63333": {
|
20 |
+
"content": "<pad>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"clean_up_tokenization_spaces": true,
|
29 |
+
"eos_token": "</s>",
|
30 |
+
"model_max_length": 512,
|
31 |
+
"pad_token": "<pad>",
|
32 |
+
"return_tensors": "pt",
|
33 |
+
"separate_vocabs": false,
|
34 |
+
"source_lang": "ara",
|
35 |
+
"sp_model_kwargs": {},
|
36 |
+
"target_lang": "heb",
|
37 |
+
"tokenizer_class": "MarianTokenizer",
|
38 |
+
"unk_token": "<unk>"
|
39 |
+
}
|
ar_he/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ar_he_ct2/config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_source_bos": false,
|
3 |
+
"add_source_eos": false,
|
4 |
+
"bos_token": "<s>",
|
5 |
+
"decoder_start_token": "</s>",
|
6 |
+
"eos_token": "</s>",
|
7 |
+
"layer_norm_epsilon": null,
|
8 |
+
"multi_query_attention": false,
|
9 |
+
"unk_token": "<unk>"
|
10 |
+
}
|
ar_he_ct2/model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fa9558eb1d4c737e67e2d8238eb5c56e15721911cf75a23e8906225cc417c58a
|
3 |
+
size 307573250
|
ar_he_ct2/shared_vocabulary.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
colorize.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import matplotlib.pyplot as plt
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
|
5 |
+
def generate_diverging_colors(num_colors, palette='Set3'):  # courtesy of ChatGPT
    """Sample ``num_colors`` visually distinct colors from a matplotlib palette.

    Parameters
    ----------
    num_colors : int
        Number of colors to generate.
    palette : str
        Name of a matplotlib qualitative colormap (e.g. "Set2", "Set3").

    Returns
    -------
    list[str]
        Hex color codes as 6-digit strings WITHOUT a leading '#'
        (callers prepend '#' themselves).
    """
    # plt.cm.get_cmap() was deprecated in matplotlib 3.7 and removed in 3.9;
    # prefer the colormap registry and fall back for older versions.
    try:
        cmap = plt.get_cmap(palette, num_colors)
    except AttributeError:
        import matplotlib
        cmap = matplotlib.colormaps[palette].resampled(num_colors)

    # RGBA rows in [0, 1] for each requested color index.
    colors_rgb = cmap(np.arange(num_colors))

    # Pack each RGB triple into a 24-bit integer and format as 6 hex digits.
    colors_hex = [format(int(color[0]*255)<<16|int(color[1]*255)<<8|int(color[2]*255), '06x') for color in colors_rgb]

    return colors_hex
|
16 |
+
|
17 |
+
|
18 |
+
def align_words(outputs, tokenizer, encoder_input_ids, decoder_input_ids,
                threshold=0.4, skip_first_src=True, skip_second_src=False,
                layer=2, head=6):
    """Render source/target tokens as HTML, color-coding aligned word groups.

    Uses one cross-attention head of a seq2seq model as a soft word aligner:
    target (decoder) token *i* is linked to every source (encoder) position
    whose attention weight exceeds ``threshold``.

    Parameters
    ----------
    outputs : model output with ``cross_attentions`` (tuple of tensors,
        one per layer, shaped [batch, heads, tgt_len, src_len]).
    tokenizer : tokenizer used for both encoder and decoder ids.
    encoder_input_ids, decoder_input_ids : [1, seq_len] id tensors.
    threshold : float, attention cutoff for declaring an alignment.
    skip_first_src, skip_second_src : bool, drop leading source tokens from
        the rendered HTML (e.g. a dialect-marker prefix token).
    layer, head : which cross-attention layer/head to read.

    Returns
    -------
    (srchtml, tgthtml) : tuple[str, str]
        HTML spans for the source and target sentences; aligned groups share
        a color. NOTE: the "src_"/"trg_" label prefixes are swapped relative
        to their meaning (``src_`` marks decoder positions, ``trg_`` encoder
        positions) — kept as-is since the code is internally consistent.
    """
    # Step 1: per-decoder-token alignment candidates above threshold.
    alignment = []
    # threshold = 0.05
    for i, tok in enumerate(outputs.cross_attentions[layer][0][head]):
        alignment.append([[i], (tok > threshold).nonzero().squeeze(-1).tolist()])

    # for i in alignment:
    #     src_tok = [tokenizer.decode(decoder_input_ids[0][x]) for x in i[0]]
    #     trg_tok = [tokenizer.decode(encoder_input_ids[0][x]) for x in i[1]]
    #     print(src_tok, "=>", trg_tok)

    # Step 2: merge consecutive entries into word-level groups. Two entries
    # merge when they attend to an overlapping source position, or when the
    # decoder token is a word-internal subword (no leading "▁").
    merged = []
    for i in alignment:
        token = tokenizer.convert_ids_to_tokens([decoder_input_ids[0][i[0]]])[0]
        # print(token)
        if token not in ["</s>", "<pad>", "<unk>", "<s>"]:
            if merged:
                tomerge = False
                # check overlap with previous entry
                for x in i[1]:
                    if x in merged[-1][1]:# or tokenizer.convert_ids_to_tokens([encoder_input_ids[0][x]])[0][0] != "▁":
                        tomerge = True
                        break
                # if first character is not a "▁"
                if token[0] != "▁":
                    tomerge = True
                if tomerge:
                    merged[-1][0] += i[0]
                    merged[-1][1] += i[1]
                else:
                    merged.append(i)
            else:
                merged.append(i)

    # print("=====MERGED=====")
    # for i in merged:
    #     src_tok = [tokenizer.decode(decoder_input_ids[0][x]) for x in i[0]]
    #     trg_tok = [tokenizer.decode(encoder_input_ids[0][x]) for x in i[1]]
    #     print(src_tok, "=>", trg_tok)

    # Step 3: assign a color id to each group, reusing an existing id when
    # any member token already has one (transitive grouping).
    colordict = {}
    ncolors = 0
    for i in merged:
        src_tok = [f"src_{x}" for x in i[0]]
        trg_tok = [f"trg_{x}" for x in i[1]]
        all_tok = src_tok + trg_tok
        # see if any tokens in entry already have associated color
        newcolor = None
        for t in all_tok:
            if t in colordict:
                newcolor = colordict[t]
                break
        # BUGFIX: was `if not newcolor:`, which treated the valid color id 0
        # as "no color found" and re-assigned a fresh color to its group.
        if newcolor is None:
            newcolor = ncolors
            ncolors += 1
        for t in all_tok:
            if t not in colordict:
                colordict[t] = newcolor

    # Map color ids to concrete hex codes.
    colors = generate_diverging_colors(ncolors, palette="Set2")
    id_to_color = {i: c for i, c in enumerate(colors)}
    for k, v in colordict.items():
        colordict[k] = id_to_color[v]

    # Step 4: render target (decoder) side; uncolored tokens inherit the
    # theme's body text color.
    tgthtml = []
    for i, token in enumerate(decoder_input_ids[0]):
        if f"src_{i}" in colordict:
            label = f"src_{i}"
            tgthtml.append(f"<span style='color: #{colordict[label]}'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
        else:
            tgthtml.append(f"<span style='color: --color-text-body'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
    tgthtml = "".join(tgthtml)
    tgthtml = tgthtml.replace("▁", " ")
    tgthtml = f"<span style='font-size: 25px'>{tgthtml}</span>"

    # Step 5: render source (encoder) side, optionally skipping prefix tokens.
    srchtml = []
    for i, token in enumerate(encoder_input_ids[0]):
        if (i == 0 and skip_first_src) or (i == 1 and skip_second_src):
            continue

        if f"trg_{i}" in colordict:
            label = f"trg_{i}"
            srchtml.append(f"<span style='color: #{colordict[label]}'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
        else:
            srchtml.append(f"<span style='color: --color-text-body'>{tokenizer.convert_ids_to_tokens([token])[0]}</span>")
    srchtml = "".join(srchtml)
    srchtml = srchtml.replace("▁", " ")
    srchtml = f"<span style='font-size: 25px'>{srchtml}</span>"
    return srchtml, tgthtml
|
consts.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Static front-end assets for the Gradio app. All three values are injected
# verbatim into the UI, so their contents must not be reformatted.

# CSS for the Gradio Blocks app: Hebrew/Arabic web fonts, RTL direction for
# the relevant widgets, button theming, and right-aligned DataFrame cells.
CSS = """

@import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Hebrew:wght@400;700&family=Noto+Naskh+Arabic:wght@400;700&display=swap');

#taatik textarea {
font-size: 25px;
font-family: 'Noto Sans Hebrew', 'Noto Naskh Arabic', 'SBL Hebrew', 'David CLM', 'FrankRuehl CLM', 'Narkisim', 'Arial', 'Arial Unicode MS', sans-serif;
}

@font-face {
font-family: 'Noto Sans Hebrew';
src: url('https://fonts.gstatic.com/s/notosanshebrew/v40/or3HQ7v33eiDlKj4557q0OGCZa662.woff2') format('woff2');
unicode-range: U+0590-05FF, U+200C-2010, U+20AA, U+25CC, U+FB1D-FB4F;
}

@font-face {
font-family: 'Noto Naskh Arabic';
src: url('https://fonts.gstatic.com/s/notonaskharabic/v30/RrQ5bpV-9Dd1b1OAGA6M9PkyDuVBePeKNaxcsss0Y7bwvc5Urqjc.woff2') format('woff2');
unicode-range: U+0600-06FF, U+0750-077F, U+0870-088E, U+0890-0891, U+0898-08E1, U+08E3-08FF, U+200C-200E, U+2010-2011, U+204F, U+2E41, U+FB50-FDFF, U+FE70-FE74, U+FE76-FEFC;
}

:root { direction: rtl; }
#liter textarea, #trans textarea { font-size: 25px;}
#grp { padding: 10px; }
#trans textarea { direction: rtl; }
#taatik { direction: rtl; }
#about { direction: rtl; }
#tab1 { direction: rtl; }
#tab2 { direction: rtl; }
#footer { direction: rtl; }
#input {direction: rtl;}
#diac_input {direction: rtl;}
#diacritized { direction: rtl; }
#diacritized2 { direction: rtl; }
#taatik2 { direction: rtl; }
#diacritized textarea { font-size: 25px;}
#diacritized2 textarea { font-size: 25px;}
#taatik2 textarea { font-size: 25px;}
#input textarea { font-size: 20px;}
#diac_input textarea { font-size: 20px;}
#check { border-style: none !important; }
#nearest { font-family: 'SBL Hebrew', 'David CLM', 'FrankRuehl CLM', 'Narkisim', 'Arial'; }
:root {--button-secondary-background-focus: #2563eb !important;
--button-secondary-background-base: #2563eb !important;
--button-secondary-background-hover: linear-gradient(to bottom right, #0692e8, #5859c2);
--button-secondary-text-color-base: white !important;
--button-secondary-text-color-hover: white !important;
--button-secondary-background-focus: rgb(51 122 216 / 70%) !important;
--button-secondary-text-color-focus: white !important}
.dark {--button-secondary-background-base: #2563eb !important;
--button-secondary-background-focus: rgb(51 122 216 / 70%) !important;
--button-secondary-background-hover: linear-gradient(to bottom right, #0692e8, #5859c2)}
.feather-music { stroke: #2563eb; }

.dataframe {
font-family: 'Arial', 'Helvetica', sans-serif !important;
}
.dataframe th, .dataframe td {
font-family: inherit !important;

}

.gradio-container .dataframe {
font-family: Arial, sans-serif !important;

}

/* Target the table header cells */
.table th .cell-wrap {
text-align: right !important;
}

/* Target the span inside the header cells */
.table th .cell-wrap span {
text-align: right !important;
display: block;
font-family: Arial, sans-serif !important;
}

/* Ensure the sort button doesn't interfere with alignment */
.table th .cell-wrap .sort-button {
float: left;
}

/* Target the table body cells */
.table td {
text-align: right !important;
}

/* Target the span inside the body cells */
.table td .cell-wrap span {
text-align: right !important;
display: block;
font-family: Arial, sans-serif !important;
font-size: 20px;
}

"""

# Hebrew markdown shown on the "About" tab (rendered with rtl=True).
ABOUT = """
כלי זה נוצר על ידי גיא מור-לן כחלק מפרויקט Levanti. הכלי מתרגם טקסט בעברית לדיאלקטים השונים של ערבית מדוברת, ולהפך, באמצעות מודלים יעודיים של בינה מלאכותית. כמו כן הכלי תומך בניקוד אוטומטי, המרה של הניקוד לתעתוק והפקת שמע (פיצ'רים ניסיוניים). לפרטים נוספים, גישה לדאטה ולמודלים, ולגרסה האנגלית של כלי ראו את [דף הפרויקט בהאגינג פייס](https://huggingface.co/datasets/guymorlan/levanti).
מעוניינים לתרום לפרויקט? מצאתם טעות? מוזמנים ליצור קשר [כאן](mailto:[email protected]). מוזמנים לבדוק גם את אתר האחות [סרטונים בערבית](https://videosinarabic.com/).
"""

# Client-side JS that forces the Gradio dark theme by adding the
# `__theme=dark` query parameter and reloading the page once.
JS_FUNC = """
function refresh() {
const url = new URL(window.location);

if (url.searchParams.get('__theme') !== 'dark') {
url.searchParams.set('__theme', 'dark');
window.location.href = url.href;
}
}
"""
|
diacritize.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#%%
from transformers import RobertaForTokenClassification, AutoTokenizer

# Token-classification model that predicts, for each position of the input,
# which diacritics should follow it. The zip over raw characters in
# arabic2diacritics assumes one prediction per character (i.e. a
# character-level tokenizer) — TODO confirm against the model card.
model = RobertaForTokenClassification.from_pretrained("guymorlan/levanti_arabic2diacritics")
tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_arabic2diacritics")

#%%
# Classifier label index -> Unicode combining diacritic mark.
label2diacritic = {0: 'ّ', # SHADDA (gemination)
                   1: 'َ', # FATHA
                   2: 'ِ', # KASRA
                   3: 'ُ', # DAMMA
                   4: 'ْ'} # SUKUN (no vowel)
|
12 |
+
|
13 |
+
|
14 |
+
def arabic2diacritics(text, model, tokenizer):
    """Insert predicted diacritic marks after each character of *text*.

    Runs the token-classification model once over the input and, for every
    character, appends the diacritics whose sigmoid score exceeds 0.5
    (multi-label: a character may receive several marks).
    """
    encoded = tokenizer(text, return_tensors="pt")
    # Binarize per-position logits; strip the BOS/EOS predictions so the
    # remaining rows line up one-to-one with the input characters.
    predictions = (model(**encoded).logits.sigmoid() > 0.5)[0][1:-1]

    pieces = []
    for row, char in zip(predictions, text):
        pieces.append(char)
        # Vowel marks and sukun (labels 1-4) are emitted first...
        for label in range(1, 5):
            if row[label]:
                pieces.append(label2diacritic[label])
        # ...and shadda (label 0) last, matching the original output order.
        if row[0]:
            pieces.append(label2diacritic[0])

    return "".join(pieces)
|
29 |
+
|
30 |
+
|
31 |
+
def diacritize(text):
    """Diacritize *text* with the module-level model and tokenizer."""
    result = arabic2diacritics(text, model, tokenizer)
    return result
|
33 |
+
|
34 |
+
def diacritize_if_not_already(text):
    """Return *text* as-is when it already carries any diacritic mark;
    otherwise run the diacritization model on it."""
    marks = set(label2diacritic.values())
    if not marks.isdisjoint(text):
        # At least one diacritic already present — trust the user's input.
        return text
    return arabic2diacritics(text, model, tokenizer)
|
39 |
+
#%%
|
40 |
+
# text = "بديش اروح عالمدرسة بكرا"
|
41 |
+
# arabic2diacritics(text, model, tokenizer)
|
42 |
+
# %%
|
embedding_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2819479f36c2c7667febd5745f148c1c82b2c691ef1e04defec96e9c70c7b71b
|
3 |
+
size 88125517
|
embeddings.npy
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:08d31408285b68d57a39850295e26e616291065ca3953fb6db9494e0b66ae61c
|
3 |
+
size 319545472
|
he_ar/README.md
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: cc-by-nc-4.0
|
3 |
+
datasets:
|
4 |
+
- guymorlan/levanti
|
5 |
+
language:
|
6 |
+
- ar
|
7 |
+
- he
|
8 |
+
pipeline_tag: translation
|
9 |
+
widget:
|
10 |
+
- text: P אני רוצה ללכת מחר לחנות
|
11 |
+
---
|
12 |
+
|
13 |
+
# Levanti (Hebrew -> colloquial Levantine Arabic) translator
|
14 |
+
|
15 |
+
Trained on the [Levanti](https://huggingface.co/datasets/guymorlan/levanti) dataset by fine-tuning [Helsinki-NLP/opus-mt-he-ar](https://huggingface.co/Helsinki-NLP/opus-mt-he-ar) for 8 epochs.
|
16 |
+
This model is trained to support dialect conditional generation by utilizing the first token (followed by a space) as an indicator of the desired dialect:
|
17 |
+
* **P** for Palestinian
|
18 |
+
* **L** for Lebanese
|
19 |
+
* **S** for Syrian
|
20 |
+
* **E** for Egyptian
|
21 |
+
|
22 |
+
# Example usage
|
23 |
+
|
24 |
+
```python
|
25 |
+
from transformers import pipeline
|
26 |
+
trans = pipeline("translation", "guymorlan/levanti_translate_he_ar")
|
27 |
+
trans("P אני רוצה ללכת מחר לחנות")
|
28 |
+
```
|
29 |
+
```
|
30 |
+
Out[1]: [{'translation_text': 'بدي أروح ع الدكان بكرا'}]
|
31 |
+
```
|
32 |
+
|
33 |
+
# Attribution
|
34 |
+
Created by Guy Mor-Lan.<br>
|
35 |
+
Contact: guy.mor AT mail.huji.ac.il
|
he_ar/config.json
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "Helsinki-NLP/opus-mt-he-ar",
|
3 |
+
"activation_dropout": 0.0,
|
4 |
+
"activation_function": "swish",
|
5 |
+
"add_bias_logits": false,
|
6 |
+
"add_final_layer_norm": false,
|
7 |
+
"architectures": [
|
8 |
+
"MarianMTModel"
|
9 |
+
],
|
10 |
+
"attention_dropout": 0.0,
|
11 |
+
"bad_words_ids": [
|
12 |
+
[
|
13 |
+
63303
|
14 |
+
]
|
15 |
+
],
|
16 |
+
"bos_token_id": 0,
|
17 |
+
"classif_dropout": 0.0,
|
18 |
+
"classifier_dropout": 0.0,
|
19 |
+
"d_model": 512,
|
20 |
+
"decoder_attention_heads": 8,
|
21 |
+
"decoder_ffn_dim": 2048,
|
22 |
+
"decoder_layerdrop": 0.0,
|
23 |
+
"decoder_layers": 6,
|
24 |
+
"decoder_start_token_id": 63303,
|
25 |
+
"decoder_vocab_size": 63304,
|
26 |
+
"dropout": 0.1,
|
27 |
+
"encoder_attention_heads": 8,
|
28 |
+
"encoder_ffn_dim": 2048,
|
29 |
+
"encoder_layerdrop": 0.0,
|
30 |
+
"encoder_layers": 6,
|
31 |
+
"eos_token_id": 0,
|
32 |
+
"extra_pos_embeddings": 63304,
|
33 |
+
"forced_eos_token_id": 0,
|
34 |
+
"id2label": {
|
35 |
+
"0": "LABEL_0",
|
36 |
+
"1": "LABEL_1",
|
37 |
+
"2": "LABEL_2"
|
38 |
+
},
|
39 |
+
"init_std": 0.02,
|
40 |
+
"is_encoder_decoder": true,
|
41 |
+
"label2id": {
|
42 |
+
"LABEL_0": 0,
|
43 |
+
"LABEL_1": 1,
|
44 |
+
"LABEL_2": 2
|
45 |
+
},
|
46 |
+
"max_length": 512,
|
47 |
+
"max_position_embeddings": 512,
|
48 |
+
"model_type": "marian",
|
49 |
+
"normalize_before": false,
|
50 |
+
"normalize_embedding": false,
|
51 |
+
"num_beams": 4,
|
52 |
+
"num_hidden_layers": 6,
|
53 |
+
"pad_token_id": 63303,
|
54 |
+
"scale_embedding": true,
|
55 |
+
"share_encoder_decoder_embeddings": true,
|
56 |
+
"static_position_embeddings": true,
|
57 |
+
"torch_dtype": "float32",
|
58 |
+
"transformers_version": "4.38.1",
|
59 |
+
"use_cache": true,
|
60 |
+
"vocab_size": 63304
|
61 |
+
}
|
he_ar/generation_config.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bad_words_ids": [
|
3 |
+
[
|
4 |
+
63303
|
5 |
+
]
|
6 |
+
],
|
7 |
+
"bos_token_id": 0,
|
8 |
+
"decoder_start_token_id": 63303,
|
9 |
+
"eos_token_id": 0,
|
10 |
+
"forced_eos_token_id": 0,
|
11 |
+
"max_length": 512,
|
12 |
+
"num_beams": 4,
|
13 |
+
"pad_token_id": 63303,
|
14 |
+
"renormalize_logits": true,
|
15 |
+
"transformers_version": "4.38.1"
|
16 |
+
}
|
he_ar/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:334b98a0c0db5cd649d23521bf3c7ce41d92238092c7b90811434961390ab7b2
|
3 |
+
size 306482848
|
he_ar/source.spm
ADDED
Binary file (896 kB). View file
|
|
he_ar/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eos_token": "</s>",
|
3 |
+
"pad_token": "<pad>",
|
4 |
+
"unk_token": "<unk>"
|
5 |
+
}
|
he_ar/target.spm
ADDED
Binary file (899 kB). View file
|
|
he_ar/tokenizer_config.json
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "</s>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "<unk>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"63303": {
|
20 |
+
"content": "<pad>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"clean_up_tokenization_spaces": true,
|
29 |
+
"eos_token": "</s>",
|
30 |
+
"model_max_length": 512,
|
31 |
+
"pad_token": "<pad>",
|
32 |
+
"return_tensors": "pt",
|
33 |
+
"separate_vocabs": false,
|
34 |
+
"source_lang": "heb",
|
35 |
+
"sp_model_kwargs": {},
|
36 |
+
"target_lang": "ara",
|
37 |
+
"tokenizer_class": "MarianTokenizer",
|
38 |
+
"unk_token": "<unk>"
|
39 |
+
}
|
he_ar/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
he_ar_ct2/config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_source_bos": false,
|
3 |
+
"add_source_eos": false,
|
4 |
+
"bos_token": "<s>",
|
5 |
+
"decoder_start_token": "</s>",
|
6 |
+
"eos_token": "</s>",
|
7 |
+
"layer_norm_epsilon": null,
|
8 |
+
"multi_query_attention": false,
|
9 |
+
"unk_token": "<unk>"
|
10 |
+
}
|
he_ar_ct2/model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d2ca03c089d8803b9963cfca86b1a760cb56e1f3f776743d21b33aa747bc94e8
|
3 |
+
size 307511690
|
he_ar_ct2/shared_vocabulary.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers==4.42.3
|
2 |
+
torch==1.13.1
|
3 |
+
sentencepiece==0.1.97
|
4 |
+
sacremoses==0.0.53
|
5 |
+
pandas==1.5.1
|
6 |
+
azure-cognitiveservices-speech==1.38.0
|
7 |
+
matplotlib==3.7.0
|
8 |
+
python-dotenv
|
9 |
+
gradio==4.37.2
|
10 |
+
ctranslate2==4.1.0
|
semsearch.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import pandas as pd
|
4 |
+
import translate
|
5 |
+
import gradio as gr
|
6 |
+
|
7 |
+
# Parallel-corpus rows (arabic/hebrew/english/validated) and their precomputed
# sentence embeddings; row i of `data` corresponds to `embeddings[i]`.
data = pd.read_csv("./embedding_data.csv")
embeddings = np.load("./embeddings.npy")
|
9 |
+
|
10 |
+
def normalize_vector(v):
    """Return *v* scaled to unit (L2) length; a zero vector is returned unchanged."""
    magnitude = np.linalg.norm(v)
    return v if magnitude == 0 else v / magnitude
|
15 |
+
|
16 |
+
|
17 |
+
def embed_one(model, tokenizer, text, normalize=True):
    """Mean-pool the encoder's last hidden state for *text* into one vector.

    Runs only the encoder of the seq2seq *model* (no decoding) and averages
    over the token axis. Returns a 1-D numpy array, L2-normalized unless
    *normalize* is False.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        pooled = model.model.encoder(**encoded).last_hidden_state.mean(axis=1)
    vector = pooled.detach().numpy()[0]
    return normalize_vector(vector) if normalize else vector
|
27 |
+
|
28 |
+
|
29 |
+
def knn(query_embedding, embeddings, df, k=5, hebrew=True):
    """Return the *k* rows of *df* most similar to *query_embedding*.

    Similarity is the dot product against each row of *embeddings* (cosine
    similarity when vectors are unit-normalized). Rows come back ordered
    most-similar first; the translation column is Hebrew or English
    depending on *hebrew*.
    """
    scores = np.dot(embeddings, query_embedding.T)
    top = np.argsort(scores, axis=0)[-k:][::-1].ravel()
    columns = ["arabic", "hebrew" if hebrew else "english", "validated"]
    return df.iloc[top][columns]
|
37 |
+
|
38 |
+
def run_knn(text, k=5):
    """Embed *text* with the AR->HE encoder and return its *k* nearest rows.

    Looks up neighbors in the module-level `embeddings`/`data` pair and
    returns the Hebrew-facing columns.

    Fix: removed a leftover `print(text)` debug statement that echoed every
    user query to stdout.
    """
    query_embedding = embed_one(translate.model_from_ar,
                                translate.tokenizer_from_ar, text)
    return knn(query_embedding, embeddings, data, k=k, hebrew=True)
|
43 |
+
|
44 |
+
|
45 |
+
def style_dataframe(df):
    # NOTE(review): dead code -- this definition is shadowed by a second
    # `style_dataframe` defined later in this module; consider removing one.
    """Return a Styler that right-aligns *df* with RTL direction for Hebrew/Arabic."""
    styled_df = df.style.set_properties(**{
        'font-family': 'Arial, sans-serif',
        'font-size': '20px',
        'text-align': 'right',
        'direction': 'rtl',
        'align': 'right'
    }).set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'right')]}
    ])
    return styled_df
|
56 |
+
|
57 |
+
|
58 |
+
def style_dataframe(df):
|
59 |
+
return df.style.set_table_styles([
|
60 |
+
{'selector': 'thead', 'props': [('text-align', 'right')]},
|
61 |
+
{'selector': '.index_name', 'props': [('text-align', 'right')]},
|
62 |
+
]).set_properties(**{
|
63 |
+
'text-align': 'right',
|
64 |
+
}) # Replace 'column_name' with your actual column name
|
65 |
+
|
66 |
+
|
67 |
+
def update_df(hidden_arabic):
    """Run a 100-nearest-neighbor search and return a styled gradio DataFrame."""
    df = run_knn(hidden_arabic, 100)
    # Show validation status as an emoji instead of a raw boolean.
    df["validated"] = df["validated"].apply(lambda ok: "✅" if ok else "❌")
    # Hebrew-facing column headers for the UI.
    df = df.rename(columns={"validated": "מאומת",
                            "arabic": "ערבית",
                            "hebrew": "עברית"})
    return gr.DataFrame(value=style_dataframe(df), visible=True)
|
translate.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from transformers import MarianMTModel, AutoTokenizer
|
3 |
+
import ctranslate2
|
4 |
+
from colorize import align_words
|
5 |
+
import logging
|
6 |
+
|
7 |
+
# Application-wide logger: INFO+ records are appended to app.log.
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # Set to debug to capture all levels of logs
file_handler = logging.FileHandler('app.log', mode='a')  # 'a' mode appends to the file
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# HF Marian models, loaded with attentions so translate() can extract
# cross-attention weights for word-alignment colorization.
model_to_ar = MarianMTModel.from_pretrained("./he_ar/", output_attentions=True)
model_from_ar = MarianMTModel.from_pretrained("./ar_he/", output_attentions=True)
# CTranslate2 engines used for the actual decoding.
model_to_ar_ct2 = ctranslate2.Translator("./he_ar_ct2/")
model_from_ar_ct2 = ctranslate2.Translator("./ar_he_ct2/")

tokenizer_to_ar = AutoTokenizer.from_pretrained("./he_ar/")
tokenizer_from_ar = AutoTokenizer.from_pretrained("./ar_he/")
print("Done loading models")

# UI dialect names (English and Hebrew) -> single-letter tag that is prepended
# to the source text before Hebrew->Arabic translation (see run_translate).
dialect_map = {
    "Palestinian": "P",
    "Syrian": "S",
    "Lebanese": "L",
    "Egyptian": "E",
    "פלסטיני": "P",
    "סורי": "S",
    "לבנוני": "L",
    "מצרי": "E"
}
|
35 |
+
|
36 |
+
|
37 |
+
def translate(text, ct_model, hf_model, tokenizer, to_arabic=True,
              threshold=None, layer=2, head=6):
    """Translate *text* and build word-alignment HTML for both sides.

    The CTranslate2 model produces the translation; the HF model is then
    re-run in teacher-forcing mode on the same token ids only to obtain
    attention weights, which align_words renders as color-coded HTML.

    Returns a (html, arabic) pair, where `arabic` is whichever of
    input/output is in Arabic script.
    """

    logger.info(f"Translating: {text}")
    # CTranslate2 consumes token strings, not ids.
    inp_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
    out_tokens = ct_model.translate_batch([inp_tokens])[0].hypotheses[0]
    out_string = tokenizer.convert_tokens_to_string(out_tokens)

    # Rebuild encoder/decoder id tensors; the decoder side is the generated
    # hypothesis wrapped in <pad> (decoder start) ... </s>.
    encoder_input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(inp_tokens)).unsqueeze(0)
    decoder_input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(["<pad>"] + out_tokens +
                                                                     ['</s>'])).unsqueeze(0)

    # Forward pass only to collect attentions for colorization.
    colorization_output = hf_model(input_ids=encoder_input_ids,
                                   decoder_input_ids=decoder_input_ids)

    # Attention threshold for drawing an alignment edge.
    # NOTE(review): first and last branches are both 0.05, so only the
    # 10-19 token range differs -- confirm this is intentional.
    if not threshold:
        if len(inp_tokens) < 10:
            threshold = 0.05
        elif len(inp_tokens) < 20:
            threshold = 0.10
        else:
            threshold = 0.05

    srchtml, tgthtml = align_words(colorization_output,
                                   tokenizer,
                                   encoder_input_ids,
                                   decoder_input_ids,
                                   threshold,
                                   skip_first_src=to_arabic,  # presumably skips the dialect prefix token -- verify
                                   skip_second_src=False,
                                   layer=layer,
                                   head=head)

    html = f"<div style='direction: rtl'>{srchtml}<br><br>{tgthtml}</div>"

    arabic = out_string if is_arabic(out_string) else text
    return html, arabic
|
74 |
+
|
75 |
+
|
76 |
+
#%%
|
77 |
+
|
78 |
+
|
79 |
+
def is_arabic(text):
    """Return True if more than half of the non-space characters are Arabic.

    Characters are counted Arabic if they fall in the U+0600-U+06FF block.
    Fix: empty or whitespace-only input now returns False instead of raising
    ZeroDivisionError (the previous version divided by len of the stripped
    string unconditionally).
    """
    text = text.replace(" ", "")
    if not text:
        return False
    arabic_chars = sum(1 for c in text if "\u0600" <= c <= "\u06FF")
    return arabic_chars / len(text) > 0.5
|
88 |
+
|
89 |
+
def run_translate(text, dialect=None):
    """Dispatch *text* to the ar->he or he->ar pipeline based on its script.

    *dialect* (e.g. "Palestinian" / "פלסטיני") is mapped to a one-letter tag
    and prepended to the text before Hebrew->Arabic translation.

    NOTE(review): returns "" for empty input but a (html, arabic) tuple
    otherwise -- callers must handle both shapes.
    """
    if not text:
        return ""
    if is_arabic(text):
        return translate(text, model_from_ar_ct2, model_from_ar, tokenizer_from_ar,
                         to_arabic=False, threshold=None, layer=2, head=1)
    else:
        if dialect in dialect_map:
            dialect = dialect_map[dialect]

        text = f"{dialect} {text}" if dialect else text
        return translate(text, model_to_ar_ct2, model_to_ar, tokenizer_to_ar,
                         to_arabic=True, threshold=None, layer=2, head=6)
|
translit.py
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#%%
|
2 |
+
from transformers import CanineForTokenClassification, AutoTokenizer
|
3 |
+
import re
|
4 |
+
import torch
|
5 |
+
|
6 |
+
# instantiate module logger
|
7 |
+
import logging
|
8 |
+
logger = logging.getLogger(__name__)
|
9 |
+
logger.setLevel(logging.INFO)
|
10 |
+
|
11 |
+
# Character-level CANINE token classifier that maps Arabic diacritics to
# Hebrew vowel signs (one prediction per input character).
model = CanineForTokenClassification.from_pretrained("guymorlan/levanti_diacritics2translit")
tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_diacritics2translit")
|
13 |
+
|
14 |
+
#%%
|
15 |
+
|
16 |
+
def diacritics2hebrew_vowels(text, model, tokenizer):
    """Replace Arabic diacritics in *text* with Hebrew vowel signs.

    Runs the character-level classifier over *text*; a character whose
    predicted label is "O" is kept as-is, otherwise it is replaced by the
    predicted label itself (a Hebrew vowel sign).
    """
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**encoded).logits
    labels = logits.argmax(-1).tolist()[0][1:-1]  # strip CLS / SEP positions

    keep = model.config.label2id["O"]
    chars = [
        ch if label == keep else model.config.id2label[label]
        for label, ch in zip(labels, text)
    ]
    result = "".join(chars)

    logger.warning("Done converting to Hebrew vowels")
    return result
|
34 |
+
|
35 |
+
#%%
|
36 |
+
|
37 |
+
|
38 |
+
# Arabic letter/diacritic -> Latin transliteration. The special value
# "SHADDA" is resolved in to_translit (it uppercases the doubled consonant).
arabic_to_english = {
    "ا": "a", "أ": "a", "إ": "a", "ء": "a", "ئ": "a", "ؤ": "a",
    "آ": "aa", "ى": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j",
    "ح": "h", "خ": "kh", "د": "d", "ذ": "dh", "ر": "r", "ز": "z",
    "س": "s", "ش": "sh", "ص": "s", "ض": "d", "ط": "t", "ظ": "z",
    "ع": "a", "غ": "gh", "ف": "f", "ق": "q", "ك": "k", "ل": "l",
    "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",
    "َ": "a", "ُ": "u", "ِ": "i",
    "،": ",",
    "ֹ": "o", # holam
    "ַ": "a", # patah
    "ִ": "i", # hiriq
    "ְ": "", # shva
    "ֻ": "u", # kubutz
    'ֵ': "e",
    "ّ": "SHADDA" # shadda
}

# Arabic letter -> Hebrew letter(s) for the taatik (Hebrew-script) transliteration.
arabic_to_hebrew = {
    # regular letters
    "ا": "א", "أ": "א", "إ": "א", "ء": "א", "ئ": "א", "ؤ": "א",
    "آ": "אא", "ى": "א", "ب": "בּ", "ت": "ת", "ث": "ת'", "ج": "ג'",
    "ح": "ח", "خ": "ח'", "د": "ד", "ذ": "ד'", "ر": "ר", "ز": "ז",
    "س": "ס", "ش": "ש", "ص": "צ", "ض": "צ'", "ط": "ט", "ظ": "ט'",
    "ع": "ע", "غ": "ע'", "ف": "פ", "ق": "ק", "ك": "כּ", "ل": "ל",
    "م": "מ", "ن": "נ", "ه": "ה", "و": "ו", "ي": "י", "ة": "ה",
    # special characters
    "،": ",", "َ": "ַ", "ُ": "ֻ", "ِ": "ִ",
    "؟": "?", "؛": ";", "ـ": "",
    # shadda to \u0598
    "ّ": "\u0598",
}

# Characters treated as vowels when resolving which consonant a shadda doubles.
vowels = ["،", ",", "َ", "ַ", "ُ", "ֻ", "ِ", "ִ", 'ֵ']

# Arabic letters that map to a Hebrew word-final letter form.
final_letters = {
    "ن": "ן", "م": "ם", "ص": "ץ", "ض": "ץ'", "ف": "ף",
}
|
76 |
+
|
77 |
+
def reorder_hebrew_nikkud(input_string):
    """Move a nikkud sign that lands after an apostrophe back onto the letter.

    Two-character transliterations like ח' are written letter+apostrophe;
    when a vowel sign follows the apostrophe, swap them so the sign attaches
    to the Hebrew letter itself.
    """
    # holam, patah, hiriq, shva, kubutz, tsere, and the Arabic shadda mark
    signs = "\u05B9\u05B7\u05B4\u05B0\u05BB\u05B5\u0651"
    pattern = r"([\u0590-\u05FF])(')([" + signs + r"])"
    return re.sub(pattern, r"\1\3\2", input_string)
|
98 |
+
|
99 |
+
def reverse_holam_shadda_vav(input_string):
    """Rewrite (holam, shadda mark, vav) as (shadda mark, vav, holam).

    \u05B9 = holam, \u0598 = the mark used in place of shadda, \u05D5 = vav;
    the reordering makes the combination render more readably.
    """
    return re.sub(r'(\u05B9)(\u0598)(\u05D5)', r'\2\3\1', input_string)
|
107 |
+
|
108 |
+
def to_taatik(arabic):
    """Transliterate Arabic *arabic* into Hebrew script (taatik).

    A letter that ends a word (followed by space/period/Arabic comma, or at
    end of string) and has a Hebrew word-final form is mapped through
    `final_letters`; everything else goes through `arabic_to_hebrew`, with
    unknown characters passed through unchanged. The result is then
    post-processed for readable vowel-sign placement.
    """
    pieces = []
    last = len(arabic) - 1
    for i, ch in enumerate(arabic):
        at_word_end = i == last or arabic[i + 1] in {" ", ".", "،"}
        if at_word_end and ch in final_letters:
            pieces.append(final_letters[ch])
        else:
            pieces.append(arabic_to_hebrew.get(ch, ch))
    swapped = reverse_holam_shadda_vav("".join(pieces))
    reordered = reorder_hebrew_nikkud(swapped)
    logger.warning("Done converting to taatik")
    return reordered
|
125 |
+
|
126 |
+
|
127 |
+
|
128 |
+
|
129 |
+
def to_translit(arabic):
    """Transliterate Arabic *arabic* into Latin characters.

    Keeps a [source_char, latin] pair per emitted character so that a shadda
    (gemination mark) can retroactively uppercase the consonant it doubles.
    """
    translit = []
    for letter in arabic:
        if letter not in arabic_to_english:
            # Unknown characters (spaces, punctuation, ...) pass through.
            translit.append([letter, letter])
        else:
            if arabic_to_english[letter] == "SHADDA":
                # Shadda doubles the previous consonant: if the last emitted
                # char is a vowel, uppercase the one before it instead.
                # NOTE(review): assumes a shadda never appears as the first
                # character -- a leading shadda would raise IndexError.
                if translit[-1][0] in vowels:
                    translit[-2][1] = translit[-2][1].upper()
                else:
                    translit[-1][1] = translit[-1][1].upper()

            else:
                translit.append([letter, arabic_to_english[letter]])

    return "".join([x[1] for x in translit])
|
145 |
+
|
146 |
+
|
147 |
+
# %%
|
148 |
+
|
149 |
+
def taatik(text):
    """Convert diacritized Arabic *text* to a Hebrew-script transliteration.

    Pipeline: the module-level CANINE model first rewrites Arabic diacritics
    as Hebrew vowel signs, then to_taatik maps the letters to Hebrew script.
    """
    return to_taatik(diacritics2hebrew_vowels(text, model, tokenizer))
|
151 |
+
|
152 |
+
# text = "لَازِم نِعْطِي رَشَّات وِقَائِيِّة لِل��َّجَر "
|
153 |
+
# heb_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
|
154 |
+
# #%%
|
155 |
+
# to_taatik(heb_vowels)
|
156 |
+
# #%%
|
157 |
+
# to_translit(heb_vowels)
|
158 |
+
# # %%
|
tts.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#%%
|
2 |
+
import azure.cognitiveservices.speech as speechsdk
|
3 |
+
import re
|
4 |
+
import os
|
5 |
+
import hashlib
|
6 |
+
import random
|
7 |
+
|
8 |
+
# Azure Speech credentials are read from the environment (SPEECH_KEY /
# SPEECH_REGION, e.g. set as Space secrets).
speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'),
                                       region=os.environ.get('SPEECH_REGION'))
|
10 |
+
|
11 |
+
def do_cleanup(dir='wavs', num_files=100):
    """Trim *dir* down to at most *num_files* files, deleting the oldest first.

    Fix: the previous version deleted whatever os.listdir() happened to list
    first -- that order is arbitrary, so recently generated audio could be
    removed while stale files were kept. Files are now sorted by modification
    time so only the oldest are deleted. Paths are joined with os.path.join
    for portability.
    """
    paths = [os.path.join(dir, name) for name in os.listdir(dir)]
    if len(paths) > num_files:
        paths.sort(key=os.path.getmtime)  # oldest first
        for path in paths[:len(paths) - num_files]:
            os.remove(path)
|
16 |
+
|
17 |
+
def add_sukun(text):
    """Append a sukun to word-final Arabic letters.

    A word ending in punctuation gets the sukun on the letter just before
    the punctuation mark; a word ending in a bare Arabic letter gets it
    appended directly. Non-Arabic words pass through unchanged.
    """
    arabic_letters = 'اأإآةبتثجحخدذرزسشصضطظعغفقكلمنهويّ'  # includes shadda
    sukun = 'ْ'
    punctuation = '.,;!?،؛؟'

    def fix(word):
        # Sukun goes on the letter preceding trailing punctuation.
        if word[-1] in punctuation:
            if len(word) > 1 and word[-2] in arabic_letters and word[-2] != sukun:
                return word[:-1] + sukun + word[-1]
            return word
        # Bare Arabic letter at the end: append the sukun.
        if word[-1] in arabic_letters and word[-1] != sukun:
            return word + sukun
        return word

    tokens = re.findall(r'\S+|[.,;!?،؛؟]', text)
    return ' '.join(fix(tok) for tok in tokens)
|
40 |
+
|
41 |
+
def get_ssml(text, voice='de-DE-SeraphinaMultilingualNeural'):
    """Wrap *text* in an Azure SSML envelope that pins pronunciation to ar-SA."""
    return (
        '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="ar-SA">'
        f'<voice name="{voice}"><lang xml:lang="ar-SA">{text}</lang></voice></speak>'
    )
|
43 |
+
|
44 |
+
|
45 |
+
def get_audio(input_text, voice='de-DE-FlorianMultilingualNeural', use_ssml=True):
    """Synthesize *input_text* to a WAV file via Azure TTS and return its path.

    Results are cached under wavs/ keyed by an MD5 of the sukun-annotated
    text, so repeated requests for the same text reuse the same file.
    """

    # Add sukun to word-final letters so the TTS engine does not vocalize them.
    input_text = add_sukun(input_text)
    # Cache key for this exact text. NOTE(review): `hash` shadows the builtin.
    hash = hashlib.md5(input_text.encode()).hexdigest()

    if os.path.exists(f"wavs/{hash}.wav"):
        return f"wavs/{hash}.wav"

    # assumes the wavs/ directory already exists -- TODO confirm
    audio_config = speechsdk.audio.AudioOutputConfig(filename=f"wavs/{hash}.wav")
    # speech_config.speech_synthesis_voice_name=voice
    # speech_config.speech_synthesis_language = "ar-EG"
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
    )

    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,
                                                     audio_config=audio_config)
    if use_ssml:
        # SSML path forces Arabic pronunciation regardless of the voice's default.
        ssml = get_ssml(input_text, voice=voice)
        result = speech_synthesizer.speak_ssml_async(ssml).get()
    else:
        # Plain-text path; NOTE: *voice* is not applied here (only via SSML).
        result = speech_synthesizer.speak_text_async(input_text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}]".format(input_text))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

    # randomly every 50 calls, clean up the wavs folder
    if random.randint(1, 50) == 1:
        do_cleanup()

    return f"wavs/{hash}.wav"
|