Guy Mor-Lan committed on
Commit
e35836c
·
1 Parent(s): 909a784
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #%%
2
+ import gradio as gr
3
+ from dotenv import load_dotenv
4
+
5
+ from translate import run_translate
6
+ from diacritize import diacritize, diacritize_if_not_already
7
+ from translit import taatik, translit
8
+ from semsearch import update_df
9
+ from tts import get_audio
10
+ from consts import CSS, ABOUT, JS_FUNC
11
+ load_dotenv()
12
+
13
+ with gr.Blocks(title = "Levanti - Levantine Arabic Translation Tools",
14
+ css=CSS,
15
+ theme="default") as demo:
16
+ # gr.HTML("<h2><span style='color: #2563eb'>Levantine Arabic</span> Translator</h2>")
17
+ gr.HTML("<h2><span><span style='color: #2563eb'>Levanti</span>ne Translator</span></h2>Levantine Arabic Translation Tools")
18
+ with gr.Tab('Translation', elem_id="tab1"):
19
+ with gr.Row():
20
+ with gr.Column():
21
+ input_text = gr.Textbox(label="Input",
22
+ info = "Colloquial Arabic or English",
23
+ placeholder="Enter text in Arabic or English",
24
+ lines=2,
25
+ elem_id="input")
26
+
27
+ gr.Examples(["I called him two times, he's not picking up", "خلينا ندور على مطعم تاني"],
28
+ input_text,
29
+ label="Examples")
30
+
31
+ btn = gr.Button("Translation")
32
+ with gr.Row():
33
+ dialect = gr.Radio(["Palestinian", "Syrian", "Lebanese", "Egyptian"],
34
+ label = "Dialect",
35
+ info="Affects translation to Arabic",
36
+ value="Palestinian")
37
+
38
+ # gr.Markdown("Built by [Guy Mor-Lan](mailto:[email protected]). Pronunciation model is specifically tailored to urban Palestinian Arabic. Text-to-speech uses Microsoft Azure's API and may provide different result from the transliterated pronunciation.")
39
+ gr.Markdown("Create by [Guy Mor-Lan](mailto:[email protected]) as part of the [Levanti](https://huggingface.co/datasets/guymorlan/levanti) project. Audio is produced using Azure TTS with predicted diacritics and heuristics.", elem_id="footer")
40
+
41
+ with gr.Column():
42
+ with gr.Group(elem_id="grp"):
43
+ gr.HTML("Translation")
44
+ # gr.Markdown("תרגום", elem_id="diacritized")
45
+ translation_output = gr.HTML("<br>", visible=True, label="Translation", elem_id="main")
46
+
47
+ hidden_arabic = gr.Textbox(lines=1, elem_id="trans", visible=False)
48
+
49
+ diacritized_output = gr.Textbox(label="Diacritization (experimental)", lines=1, elem_id="diacritized",
50
+ interactive=False)
51
+ taatik_output = gr.Textbox(label="Transliteration (Experimental)", lines=1, elem_id="taatik",
52
+ text_align="right", interactive=False)
53
+ # diacritized_output = gr.HTML("<br>", label="ניקוד")
54
+ # taatik_output = gr.HTML("<br>", label="תעתיק")
55
+
56
+ audio = gr.Audio(label="Audio (Azure)", interactive=False,
57
+ autoplay=True)
58
+ audio_button = gr.Button("Generate Audio")
59
+ audio_button.click(get_audio, inputs=[diacritized_output], outputs=[audio])
60
+
61
+
62
+ btn.click(run_translate, inputs=[input_text, dialect],
63
+ outputs=[translation_output, hidden_arabic], api_name="en2ar",
64
+ js="function jump(x, y){document.getElementById('main').scrollIntoView(); return [x, y];}")
65
+
66
+ input_text.submit(run_translate, inputs=[input_text, dialect],
67
+ outputs=[translation_output, hidden_arabic], scroll_to_output=True)
68
+ hidden_arabic.change(diacritize, inputs=[hidden_arabic], outputs=[diacritized_output])
69
+ diacritized_output.change(translit, inputs=[diacritized_output], outputs=[taatik_output])
70
+ # with gr.Row():
71
+ # nearest_df = gr.DataFrame(headers=["ערבית", "עברית", "מאומת"], visible=False, wrap=True,
72
+ # elem_id="nearest", label="תוצאות קרובות מתוך קורפוס Levanti", height=300)
73
+
74
+ # hidden_arabic.change(update_df, inputs=[hidden_arabic], outputs=[nearest_df])
75
+
76
+ with gr.Tab("Diacritization and Transliteration", elem_id="tab2"):
77
+ with gr.Row():
78
+ with gr.Column():
79
+ diac_text = gr.Textbox(label="Input", placeholder="Insert text in Arabic", lines=1,
80
+ info = "For transliteration only, insert diacritized text",
81
+ elem_id="diac_input")
82
+ gr.Examples(["خلينا ندور على مطعم تاني", "قَدِيْش حَقّ الْبَنْدُورَة؟"], diac_text,
83
+ label="Examples", elem_id="diac_ex")
84
+ btn2 = gr.Button("Send")
85
+
86
+ with gr.Column():
87
+ diacritized_output2 = gr.Textbox(label="Diacritization", lines=1,
88
+ elem_id="diacritized2")
89
+ taatik_output2 = gr.Textbox(label="Transliteration", lines=1,
90
+ elem_id="taatik2")
91
+
92
+ # input_text.submit(run_translate, inputs=[input_text, dialect],
93
+ # outputs=[translation_output], scroll_to_output=True)
94
+ # hidden_arabic.change(diacritize, inputs=[hidden_arabic], outputs=[diacritized_output])
95
+ # diacritized_output.change(taatik, inputs=[diacritized_output], outputs=[taatik_output])
96
+ btn2.click(diacritize_if_not_already, inputs=[diac_text], outputs=[diacritized_output2])
97
+ diac_text.submit(diacritize_if_not_already, inputs=[diac_text], outputs=[diacritized_output2])
98
+ diacritized_output2.change(translit, inputs=[diacritized_output2], outputs=[taatik_output2])
99
+ with gr.Tab("About", elem_id="tab3"):
100
+ with gr.Row():
101
+ gr.HTML("<h2>About</h2>")
102
+ gr.Markdown(ABOUT, elem_id="about")
103
+
104
+
105
+
106
+ demo.launch(ssl_verify=False)
ar_en ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit b5626c21d9814e83302354362e60d813003f8b97
ar_en_ct2/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_source_bos": false,
3
+ "add_source_eos": false,
4
+ "bos_token": "<s>",
5
+ "decoder_start_token": "</s>",
6
+ "eos_token": "</s>",
7
+ "layer_norm_epsilon": null,
8
+ "multi_query_attention": false,
9
+ "unk_token": "<unk>"
10
+ }
ar_en_ct2/model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e12f837210b68c3e3f915b1de10bd762feee9e0f7c515f56f521f79d0b6dc5c
3
+ size 306547250
ar_en_ct2/shared_vocabulary.json ADDED
The diff for this file is too large to render. See raw diff
 
colorize.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+
4
+
5
def generate_diverging_colors(num_colors, palette='Set3'):  # courtesy of ChatGPT
    """Return ``num_colors`` hex colour codes sampled from a matplotlib palette.

    Args:
        num_colors: Number of colours to sample from the colormap.
        palette: Name of the matplotlib colormap to sample.

    Returns:
        A list of 6-digit lowercase hex strings without a leading ``#``.
    """
    colormap = plt.cm.get_cmap(palette, num_colors)

    hex_codes = []
    for rgba in colormap(np.arange(num_colors)):
        # Channels are scaled by 255 and truncated; any alpha channel is ignored.
        red, green, blue = (int(channel * 255) for channel in rgba[:3])
        hex_codes.append(format(red << 16 | green << 8 | blue, '06x'))
    return hex_codes
16
+
17
+
18
def align_words(outputs, tokenizer, encoder_input_ids, decoder_input_ids,
                threshold=0.4, skip_first_src=True, skip_second_src=False,
                layer=2, head=6):
    """Colour encoder and decoder tokens according to cross-attention alignment.

    Every row ``i`` of ``outputs.cross_attentions[layer][0][head]`` (one row per
    decoder position) is linked to the column indices whose weight exceeds
    ``threshold``. Consecutive decoder positions are merged into one group when
    they share a linked column with the previous group or when their token does
    not start with "▁" (presumably the SentencePiece word-start marker). Each
    group, together with its linked encoder positions, receives one colour.

    Args:
        outputs: Model output exposing ``cross_attentions``.
        tokenizer: Tokenizer providing ``convert_ids_to_tokens``.
        encoder_input_ids: Encoder ids; only row 0 is rendered.
        decoder_input_ids: Decoder ids; only row 0 is rendered.
        threshold: Minimum attention weight for a link.
        skip_first_src: Omit encoder position 0 from the rendered HTML.
        skip_second_src: Omit encoder position 1 from the rendered HTML.
        layer: Index into ``outputs.cross_attentions``.
        head: Attention head index within that layer.

    Returns:
        ``(srchtml, tgthtml)``: HTML for the encoder tokens and for the decoder
        tokens, each wrapped in a ``font-size: 25px`` span.
    """
    # One entry per decoder position: [[decoder_pos], [linked encoder positions]].
    alignment = []
    for i, tok in enumerate(outputs.cross_attentions[layer][0][head]):
        alignment.append([[i], (tok > threshold).nonzero().squeeze(-1).tolist()])

    merged = []
    for i in alignment:
        token = tokenizer.convert_ids_to_tokens([decoder_input_ids[0][i[0]]])[0]
        if token in ["</s>", "<pad>", "<unk>", "<s>"]:
            continue
        if not merged:
            merged.append(i)
            continue

        # Merge when this entry overlaps the previous group's encoder positions...
        tomerge = any(x in merged[-1][1] for x in i[1])
        # ...or when the token does not begin with "▁".
        if token[0] != "▁":
            tomerge = True

        if tomerge:
            merged[-1][0] += i[0]
            merged[-1][1] += i[1]
        else:
            merged.append(i)

    # Map "src_<decoder pos>" / "trg_<encoder pos>" labels to a colour id.
    colordict = {}
    ncolors = 0
    for i in merged:
        src_tok = [f"src_{x}" for x in i[0]]
        trg_tok = [f"trg_{x}" for x in i[1]]
        all_tok = src_tok + trg_tok
        # Reuse the colour of any token in this group that already has one.
        newcolor = None
        for t in all_tok:
            if t in colordict:
                newcolor = colordict[t]
                break
        # Compare against None explicitly: colour id 0 is valid but falsy, and
        # a truthiness test would wrongly allocate a new colour for it.
        if newcolor is None:
            newcolor = ncolors
            ncolors += 1
        for t in all_tok:
            if t not in colordict:
                colordict[t] = newcolor

    colors = generate_diverging_colors(ncolors, palette="Set2")
    id_to_color = {i: c for i, c in enumerate(colors)}
    for k, v in colordict.items():
        colordict[k] = id_to_color[v]

    tgthtml = []
    for i, token in enumerate(decoder_input_ids[0]):
        label = f"src_{i}"
        text = tokenizer.convert_ids_to_tokens([token])[0]
        if label in colordict:
            tgthtml.append(f"<span style='color: #{colordict[label]}'>{text}</span>")
        else:
            tgthtml.append(f"<span style='color: --color-text-body'>{text}</span>")
    tgthtml = "".join(tgthtml)
    tgthtml = tgthtml.replace("▁", " ")
    tgthtml = f"<span style='font-size: 25px'>{tgthtml}</span>"

    srchtml = []
    for i, token in enumerate(encoder_input_ids[0]):
        if (i == 0 and skip_first_src) or (i == 1 and skip_second_src):
            continue

        label = f"trg_{i}"
        text = tokenizer.convert_ids_to_tokens([token])[0]
        if label in colordict:
            srchtml.append(f"<span style='color: #{colordict[label]}'>{text}</span>")
        else:
            srchtml.append(f"<span style='color: --color-text-body'>{text}</span>")
    srchtml = "".join(srchtml)
    srchtml = srchtml.replace("▁", " ")
    srchtml = f"<span style='font-size: 25px'>{srchtml}</span>"
    return srchtml, tgthtml
consts.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CSS = """
2
+
3
+ @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Hebrew:wght@400;700&family=Noto+Naskh+Arabic:wght@400;700&display=swap');
4
+
5
+ #taatik textarea {
6
+ font-size: 25px;
7
+ font-family: 'Noto Sans Hebrew', 'Noto Naskh Arabic', 'SBL Hebrew', 'David CLM', 'FrankRuehl CLM', 'Narkisim', 'Arial', 'Arial Unicode MS', sans-serif;
8
+ }
9
+
10
+ @font-face {
11
+ font-family: 'Noto Sans Hebrew';
12
+ src: url('https://fonts.gstatic.com/s/notosanshebrew/v40/or3HQ7v33eiDlKj4557q0OGCZa662.woff2') format('woff2');
13
+ unicode-range: U+0590-05FF, U+200C-2010, U+20AA, U+25CC, U+FB1D-FB4F;
14
+ }
15
+
16
+ @font-face {
17
+ font-family: 'Noto Naskh Arabic';
18
+ src: url('https://fonts.gstatic.com/s/notonaskharabic/v30/RrQ5bpV-9Dd1b1OAGA6M9PkyDuVBePeKNaxcsss0Y7bwvc5Urqjc.woff2') format('woff2');
19
+ unicode-range: U+0600-06FF, U+0750-077F, U+0870-088E, U+0890-0891, U+0898-08E1, U+08E3-08FF, U+200C-200E, U+2010-2011, U+204F, U+2E41, U+FB50-FDFF, U+FE70-FE74, U+FE76-FEFC;
20
+ }
21
+
22
+ #liter textarea, #trans textarea { font-size: 25px;}
23
+ #grp { padding: 10px; }
24
+ #diac_input textarea {direction: rtl;}
25
+ #diacritized textarea { direction: rtl; }
26
+ #diacritized2 textarea { direction: rtl; }
27
+ #diacritized textarea { font-size: 25px;}
28
+ #diacritized2 textarea { font-size: 25px;}
29
+ #taatik2 textarea { font-size: 25px;}
30
+ #input textarea { font-size: 20px;}
31
+ #diac_input textarea { font-size: 20px;}
32
+ #check { border-style: none !important; }
33
+ #nearest { font-family: 'SBL Hebrew', 'David CLM', 'FrankRuehl CLM', 'Narkisim', 'Arial'; }
34
+ :root {--button-secondary-background-focus: #2563eb !important;
35
+ --button-secondary-background-base: #2563eb !important;
36
+ --button-secondary-background-hover: linear-gradient(to bottom right, #0692e8, #5859c2);
37
+ --button-secondary-text-color-base: white !important;
38
+ --button-secondary-text-color-hover: white !important;
39
+ --button-secondary-background-focus: rgb(51 122 216 / 70%) !important;
40
+ --button-secondary-text-color-focus: white !important}
41
+ .dark {--button-secondary-background-base: #2563eb !important;
42
+ --button-secondary-background-focus: rgb(51 122 216 / 70%) !important;
43
+ --button-secondary-background-hover: linear-gradient(to bottom right, #0692e8, #5859c2)}
44
+ .feather-music { stroke: #2563eb; }
45
+
46
+ .dataframe {
47
+ font-family: 'Arial', 'Helvetica', sans-serif !important;
48
+ }
49
+ .dataframe th, .dataframe td {
50
+ font-family: inherit !important;
51
+
52
+ }
53
+
54
+ .gradio-container .dataframe {
55
+ font-family: Arial, sans-serif !important;
56
+
57
+ }
58
+
59
+ /* Target the table header cells */
60
+ .table th .cell-wrap {
61
+ text-align: right !important;
62
+ }
63
+
64
+ /* Target the span inside the header cells */
65
+ .table th .cell-wrap span {
66
+ text-align: right !important;
67
+ display: block;
68
+ font-family: Arial, sans-serif !important;
69
+ }
70
+
71
+ /* Ensure the sort button doesn't interfere with alignment */
72
+ .table th .cell-wrap .sort-button {
73
+ float: left;
74
+ }
75
+
76
+ /* Target the table body cells */
77
+ .table td {
78
+ text-align: right !important;
79
+ }
80
+
81
+ /* Target the span inside the body cells */
82
+ .table td .cell-wrap span {
83
+ text-align: right !important;
84
+ display: block;
85
+ font-family: Arial, sans-serif !important;
86
+ font-size: 20px;
87
+ }
88
+
89
+ """
90
+
91
+ ABOUT = """
92
+ This tool was created by Guy Mor-Lan as part of the Levanti project. The tool translates text between Hebrew and various Levantine Arabic dialects using specialized AI models. The tool also supports automatic diacritization, conversion of diacritization to transliteration, and audio generation (experimental features). For more information, access to data and models, and the Hebrew version of the tool, see the [project page on Hugging Face](https://huggingface.co/datasets/guymorlan/levanti).
93
+ """
94
+
95
+ JS_FUNC = """
96
+ function refresh() {
97
+ const url = new URL(window.location);
98
+
99
+ if (url.searchParams.get('__theme') !== 'dark') {
100
+ url.searchParams.set('__theme', 'dark');
101
+ window.location.href = url.href;
102
+ }
103
+ }
104
+ """
diacritize.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #%%
2
+ from transformers import RobertaForTokenClassification, AutoTokenizer
3
+ model = RobertaForTokenClassification.from_pretrained("guymorlan/levanti_arabic2diacritics")
4
+ tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_arabic2diacritics")
5
+
6
+ #%%
7
+ label2diacritic = {0: 'ّ', # SHADDA
8
+ 1: 'َ', # FATHA
9
+ 2: 'ِ', # KASRA
10
+ 3: 'ُ', # DAMMA
11
+ 4: 'ْ'} # SUKKUN
12
+
13
+
14
def arabic2diacritics(text, model, tokenizer):
    """Insert predicted diacritics after each character of ``text``.

    The model's logits are passed through a sigmoid and thresholded at 0.5,
    giving per-position boolean flags indexed like ``label2diacritic``. The
    first and last positions (BOS/EOS) are dropped and the remaining rows are
    paired with the characters of ``text`` in order.

    Args:
        text: Input string.
        model: Token-classification model whose output exposes ``logits``.
        tokenizer: Callable returning model inputs for ``text``.
            NOTE(review): pairing rows with characters assumes one prediction
            per character; ``zip`` silently drops any surplus — confirm.

    Returns:
        ``text`` with the predicted diacritic marks inserted.
    """
    import torch  # local import: this module has no top-level torch import

    tokens = tokenizer(text, return_tensors="pt")
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**tokens).logits
    preds = (logits.sigmoid() > 0.5)[0][1:-1]  # remove preds for BOS and EOS

    new_text = []
    for p, c in zip(preds, text):
        new_text.append(c)
        for i in range(1, 5):
            if p[i]:
                new_text.append(label2diacritic[i])
        # Shadda (label 0) is appended after the other marks.
        if p[0]:
            new_text.append(label2diacritic[0])

    return "".join(new_text)
29
+
30
+
31
def diacritize(text):
    """Diacritize ``text`` with the module-level model and tokenizer."""
    diacritized = arabic2diacritics(text, model, tokenizer)
    return diacritized
33
+
34
def diacritize_if_not_already(text):
    """Diacritize ``text`` unless it already contains a known diacritic.

    Returns ``text`` unchanged as soon as any character is one of the marks in
    ``label2diacritic``; otherwise returns the model's diacritized version.
    """
    known_marks = set(label2diacritic.values())
    for char in text:
        if char in known_marks:
            return text
    return arabic2diacritics(text, model, tokenizer)
39
+ #%%
40
+ # text = "بديش اروح عالمدرسة بكرا"
41
+ # arabic2diacritics(text, model, tokenizer)
42
+ # %%
en_ar ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 76d2a612d5c6b4cc8fe16bd7608e7a0809a96ba5
en_ar_ct2/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_source_bos": false,
3
+ "add_source_eos": false,
4
+ "bos_token": "<s>",
5
+ "decoder_start_token": "</s>",
6
+ "eos_token": "</s>",
7
+ "layer_norm_epsilon": null,
8
+ "multi_query_attention": false,
9
+ "unk_token": "<unk>"
10
+ }
en_ar_ct2/model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00ce25a723eef083222e4700514043b654c329c9202e11204e630484810b2235
3
+ size 306481586
en_ar_ct2/shared_vocabulary.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers==4.42.3
2
+ torch==1.13.1
3
+ sentencepiece==0.1.97
4
+ sacremoses==0.0.53
5
+ pandas==1.5.1
6
+ azure-cognitiveservices-speech==1.25.0
7
+ matplotlib==3.7.0
8
+ python-dotenv
9
+ gradio==4.37.2
10
+ ctranslate2==4.1.0
semsearch.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import pandas as pd
4
+ import translate
5
+ import gradio as gr
6
+
7
+ # data = pd.read_csv("./embedding_data.csv")
8
+ # embeddings = np.load("./embeddings.npy")
9
+
10
def normalize_vector(v):
    """Scale ``v`` to unit norm; a zero-norm vector is returned unchanged."""
    magnitude = np.linalg.norm(v)
    return v / magnitude if magnitude != 0 else v
15
+
16
+
17
def embed_one(model, tokenizer, text, normalize=True):
    """Embed ``text`` as the mean of the encoder's last hidden states.

    Args:
        model: Object exposing ``model.encoder``.
        tokenizer: Callable producing the encoder inputs (truncation enabled).
        text: Text to embed.
        normalize: When true, pass the vector through ``normalize_vector``.

    Returns:
        A NumPy vector for the first item of the batch.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        hidden = model.model.encoder(**encoded).last_hidden_state
        pooled = hidden.mean(axis=1)
    vector = pooled.detach().numpy()[0]
    return normalize_vector(vector) if normalize else vector
27
+
28
+
29
def knn(query_embedding, embeddings, df, k=5, hebrew=True):
    """Return the ``k`` rows of ``df`` most similar to ``query_embedding``.

    Similarity is the dot product between each row of ``embeddings`` and the
    query; results are ordered from most to least similar.

    Args:
        query_embedding: Query vector.
        embeddings: Matrix whose rows correspond positionally to ``df`` rows.
        df: DataFrame with "arabic", "validated" and "hebrew"/"english" columns.
        k: Number of neighbours to return.
        hebrew: Select the "hebrew" column when true, otherwise "english".
    """
    similarities = np.dot(embeddings, query_embedding.T)
    ranked = np.argsort(similarities, axis=0)
    top_indices = ranked[-k:][::-1].ravel()
    translation_column = "hebrew" if hebrew else "english"
    return df.iloc[top_indices][["arabic", translation_column, "validated"]]
37
+
38
def run_knn(text, k=5):
    """Embed ``text`` with the Arabic-source model and return its ``k`` neighbours.

    NOTE(review): relies on module-level ``embeddings`` and ``data``; the only
    assignments visible in this file are commented out, so confirm they are
    defined before calling.
    """
    print(text)
    encoder_model = translate.model_from_ar
    encoder_tokenizer = translate.tokenizer_from_ar
    vector = embed_one(encoder_model, encoder_tokenizer, text)
    return knn(vector, embeddings, data, k=k, hebrew=True)
43
+
44
+
45
+ def style_dataframe(df):
46
+ styled_df = df.style.set_properties(**{
47
+ 'font-family': 'Arial, sans-serif',
48
+ 'font-size': '20px',
49
+ 'text-align': 'right',
50
+ 'direction': 'rtl',
51
+ 'align': 'right'
52
+ }).set_table_styles([
53
+ {'selector': 'th', 'props': [('text-align', 'right')]}
54
+ ])
55
+ return styled_df
56
+
57
+
58
+ def style_dataframe(df):
59
+ return df.style.set_table_styles([
60
+ {'selector': 'thead', 'props': [('text-align', 'right')]},
61
+ {'selector': '.index_name', 'props': [('text-align', 'right')]},
62
+ ]).set_properties(**{
63
+ 'text-align': 'right',
64
+ }) # Replace 'column_name' with your actual column name
65
+
66
+
67
def update_df(hidden_arabic):
    """Build a visible, styled Gradio DataFrame of the 100 nearest neighbours.

    The "validated" column is shown as check/cross emoji and the columns are
    renamed to their Hebrew display labels.
    """
    def _as_mark(is_validated):
        return "✅" if is_validated else "❌"

    neighbours = run_knn(hidden_arabic, 100)
    neighbours["validated"] = neighbours["validated"].apply(_as_mark)
    neighbours = neighbours.rename(columns={
        "validated": "מאומת",
        "arabic": "ערבית",
        "hebrew": "עברית",
    })
    return gr.DataFrame(value=style_dataframe(neighbours), visible=True)
translate.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import MarianMTModel, AutoTokenizer
3
+ import ctranslate2
4
+ from colorize import align_words
5
+ import logging
6
+
7
+ # Create a logger
8
+ logger = logging.getLogger()
9
+ logger.setLevel(logging.INFO) # Set to debug to capture all levels of logs
10
+ file_handler = logging.FileHandler('app.log', mode='a') # 'a' mode appends to the file
11
+ file_handler.setLevel(logging.INFO)
12
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
13
+ file_handler.setFormatter(formatter)
14
+ logger.addHandler(file_handler)
15
+
16
+ model_to_ar = MarianMTModel.from_pretrained("./en_ar/", output_attentions=True)
17
+ model_from_ar = MarianMTModel.from_pretrained("./ar_en/", output_attentions=True)
18
+ model_to_ar_ct2 = ctranslate2.Translator("./en_ar_ct2/")
19
+ model_from_ar_ct2 = ctranslate2.Translator("./ar_en_ct2/")
20
+
21
+ tokenizer_to_ar = AutoTokenizer.from_pretrained("./en_ar/")
22
+ tokenizer_from_ar = AutoTokenizer.from_pretrained("./ar_en/")
23
+ print("Done loading models")
24
+
25
+ dialect_map = {
26
+ "Palestinian": "P",
27
+ "Syrian": "S",
28
+ "Lebanese": "L",
29
+ "Egyptian": "E",
30
+ "פלסטיני": "P",
31
+ "סורי": "S",
32
+ "לבנוני": "L",
33
+ "מצרי": "E"
34
+ }
35
+
36
+
37
def translate(text, ct_model, hf_model, tokenizer, to_arabic=True,
              threshold=None, layer=2, head=6):
    """Translate ``text`` and render a colour-aligned HTML view of the result.

    ``ct_model`` produces the translation; ``hf_model`` is then run on the
    same token ids to obtain the attention weights that ``align_words`` uses
    for colouring.

    Args:
        text: Source text.
        ct_model: Translator exposing ``translate_batch``.
        hf_model: Model called with ``input_ids`` and ``decoder_input_ids``.
        tokenizer: Tokenizer shared by both models.
        to_arabic: Forwarded to ``align_words`` as ``skip_first_src``.
        threshold: Alignment threshold; when falsy one is chosen from the
            input length.
        layer: Attention layer used for alignment.
        head: Attention head used for alignment.

    Returns:
        ``(html, arabic)``: the rendered HTML, and whichever of the output or
        input text is Arabic (the output if it is Arabic, else the input).
    """
    logger.info("Translating: %s", text)
    inp_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
    out_tokens = ct_model.translate_batch([inp_tokens])[0].hypotheses[0]
    out_string = tokenizer.convert_tokens_to_string(out_tokens)

    encoder_input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(inp_tokens)).unsqueeze(0)
    decoder_input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(["<pad>"] + out_tokens +
                                                                     ['</s>'])).unsqueeze(0)

    # Only the attention weights are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        colorization_output = hf_model(input_ids=encoder_input_ids,
                                       decoder_input_ids=decoder_input_ids)

    if not threshold:
        if len(inp_tokens) < 10:
            threshold = 0.05
        elif len(inp_tokens) < 20:
            threshold = 0.10
        else:
            threshold = 0.05

    srchtml, tgthtml = align_words(colorization_output,
                                   tokenizer,
                                   encoder_input_ids,
                                   decoder_input_ids,
                                   threshold,
                                   skip_first_src=to_arabic,
                                   skip_second_src=False,
                                   layer=layer,
                                   head=head)

    html = f"<div style='direction: rtl'>{srchtml}<br><br>{tgthtml}</div>"

    # An empty or all-space output has no characters to classify; guard here
    # so the ratio in is_arabic is never computed over zero characters.
    output_is_arabic = bool(out_string.replace(" ", "")) and is_arabic(out_string)
    arabic = out_string if output_is_arabic else text
    return html, arabic
74
+
75
+
76
+ #%%
77
+
78
+
79
def is_arabic(text):
    """Return True if more than half of the non-space characters are Arabic.

    A character counts as Arabic when it lies in U+0600..U+06FF. Empty or
    space-only text returns False instead of dividing by zero.
    """
    stripped = text.replace(" ", "")
    if not stripped:
        return False
    arabic_chars = sum(1 for c in stripped if "\u0600" <= c <= "\u06FF")
    return arabic_chars / len(stripped) > 0.5
88
+
89
def run_translate(text, dialect=None):
    """Translate ``text``, choosing the direction from its script.

    Arabic input goes through the from-Arabic models. Any other input is
    prefixed with the dialect code (looked up in ``dialect_map`` when present)
    and goes through the to-Arabic models.

    Args:
        text: User input; may be empty or None.
        dialect: Dialect name or code, or None for no prefix.

    Returns:
        ``(html, arabic)`` as produced by ``translate``. Empty or
        whitespace-only input yields ``("", "")`` so the return always has two
        values, matching the two outputs bound to this handler.
    """
    if not text or not text.strip():
        return "", ""

    if is_arabic(text):
        return translate(text, model_from_ar_ct2, model_from_ar, tokenizer_from_ar,
                         to_arabic=False, threshold=None, layer=2, head=7)

    if dialect in dialect_map:
        dialect = dialect_map[dialect]

    text = f"{dialect} {text}" if dialect else text
    return translate(text, model_to_ar_ct2, model_to_ar, tokenizer_to_ar,
                     to_arabic=True, threshold=None, layer=2, head=7)
translit.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #%%
2
+ from transformers import CanineForTokenClassification, AutoTokenizer
3
+ import re
4
+ import torch
5
+
6
+ # instantiate module logger
7
+ import logging
8
+ logger = logging.getLogger(__name__)
9
+ logger.setLevel(logging.INFO)
10
+
11
+ model = CanineForTokenClassification.from_pretrained("guymorlan/levanti_diacritics2translit")
12
+ tokenizer = AutoTokenizer.from_pretrained("guymorlan/levanti_diacritics2translit")
13
+
14
+ #%%
15
+
16
def diacritics2hebrew_vowels(text, model, tokenizer):
    """Replace characters of ``text`` with the model's predicted labels.

    For each position (after dropping the CLS and SEP predictions) the
    character is kept when the predicted label is "O"; otherwise it is
    replaced by the predicted label string.
    """
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        result = model(**encoded)
    label_ids = result.logits.argmax(-1).tolist()[0][1:-1]  # remove CLS and SEP

    converted = "".join(
        char if label_id == model.config.label2id["O"] else model.config.id2label[label_id]
        for label_id, char in zip(label_ids, text)
    )

    logger.warning("Done converting to Hebrew vowels")
    return converted
34
+
35
+ #%%
36
+
37
+
38
+ arabic_to_english = {
39
+ "ا": "a", "أ": "a", "إ": "a", "ء": "a", "ئ": "a", "ؤ": "a",
40
+ "آ": "aa", "ى": "a", "ب": "b", "ت": "t", "ث": "th", "ج": "j",
41
+ "ح": "h", "خ": "kh", "د": "d", "ذ": "dh", "ر": "r", "ز": "z",
42
+ "س": "s", "ش": "sh", "ص": "s", "ض": "d", "ط": "t", "ظ": "z",
43
+ "ع": "a", "غ": "gh", "ف": "f", "ق": "q", "ك": "k", "ل": "l",
44
+ "م": "m", "ن": "n", "ه": "h", "و": "w", "ي": "y", "ة": "h",
45
+ "َ": "a", "ُ": "u", "ِ": "i",
46
+ "،": ",",
47
+ "ֹ": "o", # holam
48
+ "ַ": "a", # patah
49
+ "ִ": "i", # hiriq
50
+ "ְ": "", # shva
51
+ "ֻ": "u", # kubutz
52
+ 'ֵ': "e",
53
+ "ّ": "SHADDA" # shadda
54
+ }
55
+
56
+ arabic_to_hebrew = {
57
+ # regular letters
58
+ "ا": "א", "أ": "א", "إ": "א", "ء": "א", "ئ": "א", "ؤ": "א",
59
+ "آ": "אא", "ى": "א", "ب": "בּ", "ت": "ת", "ث": "ת'", "ج": "ג'",
60
+ "ح": "ח", "خ": "ח'", "د": "ד", "ذ": "ד'", "ر": "ר", "ز": "ז",
61
+ "س": "ס", "ش": "ש", "ص": "צ", "ض": "צ'", "ط": "ט", "ظ": "ט'",
62
+ "ع": "ע", "غ": "ע'", "ف": "פ", "ق": "ק", "ك": "כּ", "ل": "ל",
63
+ "م": "מ", "ن": "נ", "ه": "ה", "و": "ו", "ي": "י", "ة": "ה",
64
+ # special characters
65
+ "،": ",", "َ": "ַ", "ُ": "ֻ", "ِ": "ִ",
66
+ "؟": "?", "؛": ";", "ـ": "",
67
+ # shadda to \u0598
68
+ "ّ": "\u0598",
69
+ }
70
+
71
+ vowels = ["،", ",", "َ", "ַ", "ُ", "ֻ", "ِ", "ִ", 'ֵ']
72
+
73
+ final_letters = {
74
+ "ن": "ן", "م": "ם", "ص": "ץ", "ض": "ץ'", "ف": "ף",
75
+ }
76
+
77
def reorder_hebrew_nikkud(input_string):
    """Move a vowel mark that follows an apostrophe back onto its base letter.

    Two-character transliterations end with an apostrophe, so a mark placed
    after them lands on the apostrophe. Every "letter, apostrophe, mark"
    sequence is rewritten as "letter, mark, apostrophe".
    """
    marks = (
        "ֹ",  # holam
        "ַ",  # patah
        "ִ",  # hiriq
        "ְ",  # shva
        "ֻ",  # kubutz
        "ֵ",  # tsere
        "ّ",  # shadda
    )
    mark_class = "[" + "".join(marks) + "]"
    # Letter in the Hebrew block, then an apostrophe, then one of the marks.
    pattern = r"([\u0590-\u05FF])(\')(" + mark_class + ")"
    return re.sub(pattern, r"\1\3\2", input_string)
98
+
99
def reverse_holam_shadda_vav(input_string):
    """Rewrite (holam, U+0598, vav) sequences as (U+0598, vav, holam).

    U+0598 is the stand-in used for shadda; the reordering is for readability.
    """
    holam_shadda_vav = re.compile(r'(\u05B9)(\u0598)(\u05D5)')
    return holam_shadda_vav.sub(r'\2\3\1', input_string)
107
+
108
def to_taatik(arabic):
    """Transliterate ``arabic`` into Hebrew script.

    A letter with a final form is written in that form when it is the last
    character or is followed by a space, a period or an Arabic comma. Other
    characters are mapped through ``arabic_to_hebrew`` or kept as they are.
    The result is then passed through the holam/shadda/vav and nikkud
    reordering helpers.
    """
    word_enders = {" ", ".", "،"}
    last_index = len(arabic) - 1

    pieces = []
    for index, letter in enumerate(arabic):
        at_word_end = index == last_index or arabic[index + 1] in word_enders
        if at_word_end and letter in final_letters:
            pieces.append(final_letters[letter])
        else:
            pieces.append(arabic_to_hebrew.get(letter, letter))

    swapped = reverse_holam_shadda_vav("".join(pieces))
    result = reorder_hebrew_nikkud(swapped)
    logger.warning("Done converting to taatik")
    return result
125
+
126
+
127
+
128
+
129
def to_translit(arabic):
    """Transliterate ``arabic`` into Latin letters.

    Characters found in ``arabic_to_english`` are replaced by their mapping;
    others are kept. A shadda emits nothing itself: it upper-cases the
    previous output piece, or the one before that when the previous character
    is listed in ``vowels``.

    A shadda with no piece to apply to (first in the text, or right after a
    lone leading vowel) is skipped rather than raising IndexError.
    """
    pieces = []  # [source character, Latin output] pairs
    for letter in arabic:
        latin = arabic_to_english.get(letter)
        if latin is None:
            pieces.append([letter, letter])
        elif latin == "SHADDA":
            if not pieces:
                continue
            if pieces[-1][0] in vowels:
                # Skip over the vowel to reach the piece before it.
                if len(pieces) >= 2:
                    pieces[-2][1] = pieces[-2][1].upper()
            else:
                pieces[-1][1] = pieces[-1][1].upper()
        else:
            pieces.append([letter, latin])

    return "".join(piece[1] for piece in pieces)
145
+
146
+
147
+ # %%
148
+
149
def taatik(text):
    """Convert diacritized ``text`` to a Hebrew-script transliteration."""
    with_hebrew_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
    return to_taatik(with_hebrew_vowels)
151
+
152
def translit(text):
    """Convert diacritized ``text`` to a Latin-script transliteration."""
    with_hebrew_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
    return to_translit(with_hebrew_vowels)
154
+
155
+ # text = "لَازِم نِعْطِي رَشَّات وِقَائِيِّة لِلشَّجَر "
156
+ # heb_vowels = diacritics2hebrew_vowels(text, model, tokenizer)
157
+ # #%%
158
+ # to_taatik(heb_vowels)
159
+ # #%%
160
+ # to_translit(heb_vowels)
161
+ # # %%
tts.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #%%
2
+ import azure.cognitiveservices.speech as speechsdk
3
+ import re
4
+ import os
5
+ import hashlib
6
+ import random
7
+ from dotenv import load_dotenv
8
+ load_dotenv(".env")
9
+
10
# Report only whether the key is present: printing the key itself would leak
# the credential into stdout and any logs that capture it.
print("SPEECH_KEY set:", bool(os.environ.get('SPEECH_KEY')))
print(os.environ.get('SPEECH_REGION'))
speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'),
                                       region=os.environ.get('SPEECH_REGION'))
14
+
15
def do_cleanup(dir='wavs', num_files=100):
    """Delete the oldest files in ``dir`` so at most ``num_files`` remain.

    ``os.listdir`` returns entries in arbitrary order, so files are sorted by
    modification time and the oldest are removed first. Subdirectories are
    ignored.

    Args:
        dir: Directory to prune.
        num_files: Maximum number of files to keep.
    """
    paths = (os.path.join(dir, name) for name in os.listdir(dir))
    files = [path for path in paths if os.path.isfile(path)]

    excess = len(files) - num_files
    if excess <= 0:
        return

    files.sort(key=os.path.getmtime)
    for path in files[:excess]:
        os.remove(path)
20
+
21
def add_sukun(text):
    """Append a sukun to each word that ends in a bare Arabic letter or shadda.

    The text is split on whitespace. If a word ends with punctuation, the
    sukun goes before that final punctuation mark (when the preceding
    character qualifies); otherwise it goes at the end of the word. Words are
    re-joined with single spaces.
    """
    sukun = 'ْ'
    # Characters that receive a sukun: the listed letters plus shadda.
    sukun_targets = 'اأإآةبتثجحخدذرزسشصضطظعغفقكلمنهوي' + 'ّ'
    punctuation = '.,;!?،؛؟'

    processed = []
    for word in re.findall(r'\S+|[.,;!?،؛؟]', text):
        if word[-1] in punctuation:
            if len(word) > 1 and word[-2] in sukun_targets:
                word = word[:-1] + sukun + word[-1]
        elif word[-1] in sukun_targets:
            word = word + sukun
        processed.append(word)
    return ' '.join(processed)
44
+
45
def get_ssml(text, voice='de-DE-SeraphinaMultilingualNeural'):
    """Wrap ``text`` in an SSML document that speaks it as ``ar-SA``.

    ``text`` is XML-escaped so that ``&``, ``<`` and ``>`` in the input cannot
    produce malformed SSML.

    Args:
        text: Plain text to speak.
        voice: Voice name placed in the ``<voice>`` element.

    Returns:
        The SSML document as a string.
    """
    from xml.sax.saxutils import escape  # local import: only needed here

    safe_text = escape(text)
    return f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="ar-SA"><voice name="{voice}"><lang xml:lang="ar-SA">{safe_text}</lang></voice></speak>'
47
+
48
+
49
def get_audio(input_text, voice='de-DE-FlorianMultilingualNeural', use_ssml=True):
    """Synthesize speech for ``input_text`` and return the WAV file path.

    The text is passed through ``add_sukun`` and its MD5 digest names the file
    under ``wavs/``, which acts as a cache: an existing file is returned
    without calling the speech service.

    Args:
        input_text: Text to speak.
        voice: Voice name used when ``use_ssml`` is true.
        use_ssml: Send an SSML document instead of plain text.

    Returns:
        Path of the WAV file, or None when synthesis did not complete. In that
        case any partial file is removed so the failure is not cached.
    """
    input_text = add_sukun(input_text)
    digest = hashlib.md5(input_text.encode()).hexdigest()
    wav_path = f"wavs/{digest}.wav"

    if os.path.exists(wav_path):
        return wav_path

    # The output directory may not exist yet on a fresh checkout.
    os.makedirs("wavs", exist_ok=True)

    # Randomly, about once every 50 uncached calls, clean up the wavs folder.
    # This runs before synthesis so the new file can never be the one deleted.
    if random.randint(1, 50) == 1:
        do_cleanup()

    audio_config = speechsdk.audio.AudioOutputConfig(filename=wav_path)
    speech_config.set_speech_synthesis_output_format(
        speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
    )

    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,
                                                     audio_config=audio_config)
    if use_ssml:
        ssml = get_ssml(input_text, voice=voice)
        result = speech_synthesizer.speak_ssml_async(ssml).get()
    else:
        result = speech_synthesizer.speak_text_async(input_text).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}]".format(input_text))
        return wav_path

    if result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

    # Synthesis failed: drop whatever was written so the next request retries
    # instead of serving a broken cached file.
    try:
        os.remove(wav_path)
    except OSError:
        # Best effort: the file may not exist or may still be held open.
        pass
    return None
wavs/1cfdc7d62daa8ab925371ac17ea4e792.wav ADDED
Binary file (133 kB). View file
 
wavs/5fc7e1c64e032cdbfc6435dc9a6a32ce.wav ADDED
Binary file (226 kB). View file
 
wavs/8300935681825cf0e5c467f08fb31325.wav ADDED
Binary file (128 kB). View file