cointegrated
committed on
Commit
·
193923d
1
Parent(s):
3c7eb3b
evolve from the myv-rus demo
Browse files
- README.md +4 -6
- app.py +1 -1
- requirements.txt +1 -1
- translation.py +14 -30
README.md
CHANGED
@@ -1,12 +1,10 @@
|
|
1 |
---
|
2 |
-
title: NLLB
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.46.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: NLLB-extended translation demo
|
3 |
+
emoji: 🐘
|
4 |
+
colorFrom: cyan
|
5 |
+
colorTo: blue
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.46.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
|
|
|
app.py
CHANGED
@@ -23,7 +23,7 @@ def translate_wrapper(text, src, trg, by_sentence=True, preprocess=True, random=
|
|
23 |
|
24 |
|
25 |
article = f"""
|
26 |
-
This is the demo for a NLLB-200-600M model fine-tuned for
|
27 |
|
28 |
The model itself is available at https://huggingface.co/{MODEL_URL}
|
29 |
|
|
|
23 |
|
24 |
|
25 |
article = f"""
|
26 |
+
This is the demo for a NLLB-200-600M model fine-tuned for a few (mostly new) languages.
|
27 |
|
28 |
The model itself is available at https://huggingface.co/{MODEL_URL}
|
29 |
|
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
transformers==4.
|
2 |
sentencepiece
|
3 |
gradio>=3.18.0
|
4 |
torch
|
|
|
1 |
+
transformers==4.39
|
2 |
sentencepiece
|
3 |
gradio>=3.18.0
|
4 |
torch
|
translation.py
CHANGED
@@ -8,13 +8,23 @@ from sacremoses import MosesPunctNormalizer
|
|
8 |
from sentence_splitter import SentenceSplitter
|
9 |
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
|
10 |
|
11 |
-
MODEL_URL = "slone/nllb-
|
12 |
LANGUAGES = {
|
13 |
-
"
|
14 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
}
|
16 |
L1 = "rus_Cyrl"
|
17 |
-
L2 = "
|
18 |
|
19 |
|
20 |
def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
|
@@ -54,31 +64,6 @@ class TextPreprocessor:
|
|
54 |
return clean
|
55 |
|
56 |
|
57 |
-
def fix_tokenizer(tokenizer, new_lang=L2):
|
58 |
-
"""Add a new language token to the tokenizer vocabulary
|
59 |
-
(this should be done each time after its initialization)
|
60 |
-
"""
|
61 |
-
old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
|
62 |
-
tokenizer.lang_code_to_id[new_lang] = old_len - 1
|
63 |
-
tokenizer.id_to_lang_code[old_len - 1] = new_lang
|
64 |
-
# always move "mask" to the last position
|
65 |
-
tokenizer.fairseq_tokens_to_ids["<mask>"] = (
|
66 |
-
len(tokenizer.sp_model)
|
67 |
-
+ len(tokenizer.lang_code_to_id)
|
68 |
-
+ tokenizer.fairseq_offset
|
69 |
-
)
|
70 |
-
|
71 |
-
tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
|
72 |
-
tokenizer.fairseq_ids_to_tokens = {
|
73 |
-
v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()
|
74 |
-
}
|
75 |
-
if new_lang not in tokenizer._additional_special_tokens:
|
76 |
-
tokenizer._additional_special_tokens.append(new_lang)
|
77 |
-
# clear the added token encoder; otherwise a new token may end up there by mistake
|
78 |
-
tokenizer.added_tokens_encoder = {}
|
79 |
-
tokenizer.added_tokens_decoder = {}
|
80 |
-
|
81 |
-
|
82 |
def sentenize_with_fillers(text, splitter, fix_double_space=True, ignore_errors=False):
|
83 |
"""Apply a sentence splitter and return the sentences and all separators before and after them"""
|
84 |
if fix_double_space:
|
@@ -104,7 +89,6 @@ class Translator:
|
|
104 |
if torch.cuda.is_available():
|
105 |
self.model.cuda()
|
106 |
self.tokenizer = NllbTokenizer.from_pretrained(MODEL_URL)
|
107 |
-
fix_tokenizer(self.tokenizer)
|
108 |
|
109 |
self.splitter = SentenceSplitter("ru")
|
110 |
self.preprocessor = TextPreprocessor()
|
|
|
8 |
from sentence_splitter import SentenceSplitter
|
9 |
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
|
10 |
|
11 |
+
MODEL_URL = "slone/nllb-210-v1"
|
12 |
LANGUAGES = {
|
13 |
+
"Русский | Russian": "rus_Cyrl",
|
14 |
+
"English | Английский": "eng_Latn",
|
15 |
+
"Azərbaycan | Azerbaijani | Азербайджанский": "azj_Latn",
|
16 |
+
"Башҡорт | Bashkir | Башкирский": "bak_Cyrl",
|
17 |
+
"Буряад | Buryat | Бурятский": "bxr_Cyrl",
|
18 |
+
"Чӑваш | Chuvash | Чувашский": "chv_Cyrl",
|
19 |
+
"Хакас | Khakas | Хакасский": "kjh_Cyrl",
|
20 |
+
"Къарачай-малкъар | Karachay-Balkar | Карачаево-балкарский": "krc_Cyrl",
|
21 |
+
"Марий | Meadow Mari | Марийский": "mhr_Cyrl",
|
22 |
+
"Эрзянь | Erzya | Эрзянский": "myv_Cyrl",
|
23 |
+
"Татар | Tatar | Татарский": "tat_Cyrl",
|
24 |
+
"Тыва | Тувинский | Tuvan ": "tyv_Cyrl",
|
25 |
}
|
26 |
L1 = "rus_Cyrl"
|
27 |
+
L2 = "eng_Latn"
|
28 |
|
29 |
|
30 |
def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
|
|
|
64 |
return clean
|
65 |
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
def sentenize_with_fillers(text, splitter, fix_double_space=True, ignore_errors=False):
|
68 |
"""Apply a sentence splitter and return the sentences and all separators before and after them"""
|
69 |
if fix_double_space:
|
|
|
89 |
if torch.cuda.is_available():
|
90 |
self.model.cuda()
|
91 |
self.tokenizer = NllbTokenizer.from_pretrained(MODEL_URL)
|
|
|
92 |
|
93 |
self.splitter = SentenceSplitter("ru")
|
94 |
self.preprocessor = TextPreprocessor()
|