Spaces:
Sleeping
Sleeping
SinaAhmadi
committed on
Commit
•
4f3ec12
1
Parent(s):
e9d8fdb
Update app.py
Browse files
app.py
CHANGED
@@ -17,56 +17,69 @@ from joeynmt.datasets import build_dataset
|
|
17 |
|
18 |
import gradio as gr
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
load_model = load_model if ckpt is None else Path(ckpt)
|
34 |
-
ckpt = resolve_ckpt_path(load_model, model_dir)
|
35 |
-
|
36 |
-
src_vocab, trg_vocab = build_vocab(cfg["data"], model_dir=model_dir)
|
37 |
-
|
38 |
-
model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
|
39 |
-
|
40 |
-
# load model state from disk
|
41 |
-
model_checkpoint = load_checkpoint(ckpt, device=device)
|
42 |
-
model.load_state_dict(model_checkpoint["model_state"])
|
43 |
-
|
44 |
-
if device.type == "cuda":
|
45 |
-
model.to(device)
|
46 |
-
|
47 |
-
tokenizer = build_tokenizer(cfg["data"])
|
48 |
-
sequence_encoder = {
|
49 |
-
src_cfg["lang"]: partial(src_vocab.sentences_to_ids, bos=False, eos=True),
|
50 |
-
trg_cfg["lang"]: None,
|
51 |
}
|
52 |
|
53 |
-
|
54 |
-
test_cfg["batch_type"] = "sentence"
|
55 |
-
|
56 |
-
test_data = build_dataset(
|
57 |
-
dataset_type="stream",
|
58 |
-
path=None,
|
59 |
-
src_lang=src_cfg["lang"],
|
60 |
-
trg_lang=trg_cfg["lang"],
|
61 |
-
split="test",
|
62 |
-
tokenizer=tokenizer,
|
63 |
-
sequence_encoder=sequence_encoder,
|
64 |
-
)
|
65 |
-
# test_data.set_item(INPUT.rstrip())
|
66 |
-
|
67 |
-
|
68 |
-
def _translate_data(test_data, cfg=test_cfg):
|
69 |
"""Translates given dataset, using parameters from outer scope."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
_, _, hypotheses, trg_tokens, trg_scores, _ = predict(
|
71 |
model=model,
|
72 |
data=test_data,
|
@@ -84,7 +97,7 @@ def _translate_data(test_data, cfg=test_cfg):
|
|
84 |
|
85 |
def normalize(text, language_script):
|
86 |
test_data.set_item(text)
|
87 |
-
result = _translate_data(test_data)
|
88 |
return result
|
89 |
|
90 |
|
@@ -106,35 +119,32 @@ description = """
|
|
106 |
For more information, you can check out the project on GitHub too: <a href="https://github.com/sinaahmadi/ScriptNormalization" target="_blank"><strong>https://github.com/sinaahmadi/ScriptNormalization</strong></a>
|
107 |
"""
|
108 |
|
109 |
-
languages_scripts = {
|
110 |
-
"Azeri Turkish in Persian": "AzeriTurkish-Persian",
|
111 |
-
"Central Kurdish in Arabic": "Sorani-Arabic",
|
112 |
-
"Central Kurdish in Persian": "Sorani-Persian",
|
113 |
-
"Gilaki in Persian": "Gilaki-Persian",
|
114 |
-
"Gorani in Arabic": "Gorani-Arabic",
|
115 |
-
"Gorani in Central Kurdish": "Gorani-Sorani",
|
116 |
-
"Gorani in Persian": "Gorani-Persian",
|
117 |
-
"Kashmiri in Urdu": "Kashmiri-Urdu",
|
118 |
-
"Mazandarani in Persian": "Mazandarani-Persian",
|
119 |
-
"Northern Kurdish in Arabic": "Kurmanji-Arabic",
|
120 |
-
"Northern Kurdish in Persian": "Kurmanji-Persian",
|
121 |
-
"Sindhi in Urdu": "Sindhi-Urdu"
|
122 |
-
}
|
123 |
-
|
124 |
examples = [
|
|
|
125 |
["ياخوا تةمةن دريژبيت بوئةم ميللةتة", "Central Kurdish in Arabic"],
|
126 |
-
["
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
]
|
128 |
|
|
|
|
|
129 |
demo = gr.Interface(
|
130 |
title=title,
|
131 |
description=description,
|
132 |
fn=normalize,
|
133 |
inputs = [
|
134 |
-
gr.inputs.Textbox(lines=4, label="Noisy Text"),
|
135 |
gr.Dropdown(label="Language in unconventional script", choices=sorted(list(languages_scripts.keys()))),
|
136 |
],
|
137 |
-
outputs=gr.outputs.Textbox(label="Normalized Text"),
|
138 |
examples=examples
|
139 |
)
|
140 |
|
|
|
17 |
|
18 |
import gradio as gr
|
19 |
|
20 |
+
# Maps the human-readable dropdown label ("<language> in <script>") to the
# name of the model directory under ./models/ that serves that pair.
languages_scripts = {
    "Azeri Turkish in Persian": "AzeriTurkish-Persian",
    "Central Kurdish in Arabic": "Sorani-Arabic",
    "Central Kurdish in Persian": "Sorani-Persian",
    "Gilaki in Persian": "Gilaki-Persian",
    "Gorani in Arabic": "Gorani-Arabic",
    "Gorani in Central Kurdish": "Gorani-Sorani",
    "Gorani in Persian": "Gorani-Persian",
    "Kashmiri in Urdu": "Kashmiri-Urdu",
    "Mazandarani in Persian": "Mazandarani-Persian",
    "Northern Kurdish in Arabic": "Kurmanji-Arabic",
    "Northern Kurdish in Persian": "Kurmanji-Persian",
    "Sindhi in Urdu": "Sindhi-Urdu",
}
|
34 |
|
35 |
+
def _translate_data(test_data, language_script):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
"""Translates given dataset, using parameters from outer scope."""
|
37 |
+
cfg_file = './models/%s/config.yaml'
|
38 |
+
ckpt = "./models/%s/best.ckpt"%languages_scripts[language_script]
|
39 |
+
|
40 |
+
cfg = load_config(Path(cfg_file))
|
41 |
+
# parse and validate cfg
|
42 |
+
model_dir, load_model, device, n_gpu, num_workers, _, fp16 = parse_train_args(
|
43 |
+
cfg["training"], mode="prediction")
|
44 |
+
test_cfg = cfg["testing"]
|
45 |
+
src_cfg = cfg["data"]["src"]
|
46 |
+
trg_cfg = cfg["data"]["trg"]
|
47 |
+
|
48 |
+
load_model = load_model if ckpt is None else Path(ckpt)
|
49 |
+
ckpt = resolve_ckpt_path(load_model, model_dir)
|
50 |
+
|
51 |
+
src_vocab, trg_vocab = build_vocab(cfg["data"], model_dir=model_dir)
|
52 |
+
|
53 |
+
model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
|
54 |
+
|
55 |
+
# load model state from disk
|
56 |
+
model_checkpoint = load_checkpoint(ckpt, device=device)
|
57 |
+
model.load_state_dict(model_checkpoint["model_state"])
|
58 |
+
|
59 |
+
if device.type == "cuda":
|
60 |
+
model.to(device)
|
61 |
+
|
62 |
+
tokenizer = build_tokenizer(cfg["data"])
|
63 |
+
sequence_encoder = {
|
64 |
+
src_cfg["lang"]: partial(src_vocab.sentences_to_ids, bos=False, eos=True),
|
65 |
+
trg_cfg["lang"]: None,
|
66 |
+
}
|
67 |
+
|
68 |
+
test_cfg["batch_size"] = 1 # CAUTION: this will raise an error if n_gpus > 1
|
69 |
+
test_cfg["batch_type"] = "sentence"
|
70 |
+
|
71 |
+
test_data = build_dataset(
|
72 |
+
dataset_type="stream",
|
73 |
+
path=None,
|
74 |
+
src_lang=src_cfg["lang"],
|
75 |
+
trg_lang=trg_cfg["lang"],
|
76 |
+
split="test",
|
77 |
+
tokenizer=tokenizer,
|
78 |
+
sequence_encoder=sequence_encoder,
|
79 |
+
)
|
80 |
+
# test_data.set_item(INPUT.rstrip())
|
81 |
+
|
82 |
+
cfg=test_cfg
|
83 |
_, _, hypotheses, trg_tokens, trg_scores, _ = predict(
|
84 |
model=model,
|
85 |
data=test_data,
|
|
|
97 |
|
98 |
def normalize(text, language_script):
    """Normalize ``text`` for the language/script pair ``language_script``.

    Args:
        text: Input string; passed to ``test_data.set_item``.
        language_script: Selection forwarded unchanged to ``_translate_data``.

    Returns:
        The value returned by ``_translate_data``.
    """
    # NOTE(review): `test_data` is read from the enclosing scope; no
    # module-level definition is visible in this revision -- confirm it
    # still exists, otherwise this raises NameError on the first request.
    test_data.set_item(text)
    return _translate_data(test_data, language_script)
|
102 |
|
103 |
|
|
|
119 |
For more information, you can check out the project on GitHub too: <a href="https://github.com/sinaahmadi/ScriptNormalization" target="_blank"><strong>https://github.com/sinaahmadi/ScriptNormalization</strong></a>
|
120 |
"""
|
121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
122 |
# Sample inputs for the demo, one per supported language/script pair, as
# [noisy text, dropdown label]. The comment above most entries appears to be
# the reference (conventionally written) form of that sentence -- TODO confirm.
examples = [
    # "بۇ شهرین نۆفوسو ، 2014 نجی ایلين نۆفوس ساییمی اساسيندا 41 نفر ایمیش ."
    ["بو شهرین نوفوسو ، 2014 نجی ایلين نوفوس ساییمی اساسيندا 41 نفر ایمیش .", "Azeri Turkish in Persian"],
    ["ياخوا تةمةن دريژبيت بوئةم ميللةتة", "Central Kurdish in Arabic"],
    ["یکیک له جوانیکانی ام شاره جوانه", "Central Kurdish in Persian"],
    ["نمک درهٰ مردوم گيلک ايسن ؤ اوشان زوان ني گيلکي ايسه .", "Gilaki in Persian"],
    # شۆنەو ئانەیەرە گەشت و گێڵی ناچارانەو ئۆجالانی دەستش پنەکەرد
    ["شؤنةو اانةيةرة گةشت و گلي ناجارانةو اؤجالاني دةستش پنةكةرد", "Gorani in Arabic"],
    # ڕوٙو زوانی ئەڎایی چەنی پەیڎابی ؟
    ["ڕوٙو زوانی ئەذایی چەنی پەیذابی ؟", "Gorani in Central Kurdish"],
    # هەنگامەکان وزمیٛ وەرو چەمان ، بەپاو کریٛڵی بیەشان :
    ["هنگامکان ظميٛ ر چمان ، بپا کريٛلي بيشان :", "Gorani in Persian"],
    # ربعی بن افکل ٲسؠ اَکھ صُحابی .
    ["ربعی بن افکل اُسے اَکھ صُحابی .", "Kashmiri in Urdu"],
    # اینتا زوون گِنِشکَرون 85 میلیون نفر هسنه
    ["اینتا زون گنشکرون 85 میلیون نفر هسن", "Mazandarani in Persian"],
    # پەرتوکا هەستێن ژ دل هاتە بەلافکرن
    ["بة رطكا هة صطئن ژ دل هاطة بة لافكرن", "Northern Kurdish in Arabic"],
    # سەرەکی هەمەرەنگ نەرمینێ دڤێت هندەک قوناغێن دی ببڕیت
    ["ثرکى همرنگ نرميني دويت هندک قوناغين دي ببريت", "Northern Kurdish in Persian"],
    # هتي ڪجھ اپ ۽ تمام ڊائون ٽرينون بيھنديون آھن .
    ["ہتی کجھ اپ ۽ تمام دائون ترینون بیھندیون آھن .", "Sindhi in Urdu"],
]
|
136 |
|
137 |
+
|
138 |
+
|
139 |
# Gradio UI definition: a free-text box plus a dropdown for the
# language/script pair as inputs to `normalize`, and a single text box for
# the result.
demo = gr.Interface(
    title=title,
    description=description,
    fn=normalize,
    inputs=[
        # Use the top-level component classes: the `gr.inputs.*` and
        # `gr.outputs.*` namespaces are deprecated, and `gr.Dropdown` below
        # already uses the top-level form.
        gr.Textbox(lines=4, label="Noisy Text \U0001F974"),
        gr.Dropdown(
            label="Language in unconventional script",
            # Iterating a dict yields its keys; sorted() returns a new list.
            choices=sorted(languages_scripts),
        ),
    ],
    outputs=gr.Textbox(label="Normalized Text \U0001F642"),
    examples=examples,
)
|
150 |
|