This is a model to translate texts to the Erzya language (`myv`, Cyrillic script) from 11 other languages: ru, fi, de, es, en, hi, zh, tr, uk, fr, ar. See its demo!
It is described in the paper "The first neural machine translation system for the Erzya language".
This model is based on facebook/mbart-large-50, but with updated vocabulary and checkpoint:
- Added an extra language token `myv_XX` and 19K new BPE tokens for the Erzya language;
- Fine-tuned to translate to Erzya: first from Russian, then from all 11 languages.
The following code can be used to run translation using the model:
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
def fix_tokenizer(tokenizer, new_lang='myv_XX'):
    """Register an extra language token in the tokenizer's lookup tables.

    This should be done each time after the tokenizer is initialized.

    Args:
        tokenizer: Tokenizer object to patch in place. It must support
            ``len()`` and expose ``lang_code_to_id``, ``id_to_lang_code``,
            ``fairseq_tokens_to_ids``, ``fairseq_offset``, ``sp_model``,
            ``added_tokens_encoder`` and ``_additional_special_tokens``.
        new_lang: Language code to register. Defaults to ``'myv_XX'``,
            so existing one-argument calls behave as before.

    Returns:
        None. The tokenizer is modified in place.

    NOTE(review): this touches private/internal tokenizer attributes whose
    availability presumably depends on the installed ``transformers``
    version -- confirm against the version in use.
    """
    # If the new code is currently counted among the added tokens, discount
    # it so that ``old_len`` is the tokenizer length without it.
    old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
    new_id = old_len - 1
    tokenizer.lang_code_to_id[new_lang] = new_id
    tokenizer.id_to_lang_code[new_id] = new_lang

    # Recompute the "<mask>" id from the language-code table, which now
    # includes the new code.
    tokenizer.fairseq_tokens_to_ids["<mask>"] = (
        len(tokenizer.sp_model)
        + len(tokenizer.lang_code_to_id)
        + tokenizer.fairseq_offset
    )

    # Keep the forward and reverse token<->id tables in sync with the
    # updated language codes.
    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {
        token_id: token
        for token, token_id in tokenizer.fairseq_tokens_to_ids.items()
    }

    if new_lang not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append(new_lang)

    # Reset the added-token table; the new code is now resolved through
    # the language-code tables filled in above.
    tokenizer.added_tokens_encoder = {}
def translate(text, model, tokenizer, src='ru_RU', trg='myv_XX', max_length='auto', num_beams=3, repetition_penalty=5.0, train_mode=False, n_out=None, **kwargs):
    """Translate ``text`` from language ``src`` into language ``trg``.

    Args:
        text: A string or a batch of strings, passed to ``tokenizer``.
        model: Object providing ``train()``, ``eval()``, ``generate()``
            and a ``device`` attribute.
        tokenizer: Callable tokenizer providing ``src_lang``,
            ``lang_code_to_id`` and ``batch_decode()``.
        src: Source language code, stored in ``tokenizer.src_lang``.
        trg: Target language code; its id is forced as the first
            generated token.
        max_length: Generation length limit, or ``'auto'`` to derive it
            from the encoded input length as ``32 + 1.5 * length``.
        num_beams: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        train_mode: If truthy, ``model.train()`` is called before
            generating; otherwise ``model.eval()``.
        n_out: Number of sequences to return per input; ``None`` means 1.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        A single string when ``text`` is a ``str`` and ``n_out`` is
        ``None``; otherwise the list of decoded strings.
    """
    tokenizer.src_lang = src
    batch = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)

    if max_length == 'auto':
        # Scale the output budget with the number of input tokens.
        max_length = int(32 + 1.5 * batch.input_ids.shape[1])

    # Put the model into the requested mode before generating.
    switch_mode = model.train if train_mode else model.eval
    switch_mode()

    hypotheses = tokenizer.batch_decode(
        model.generate(
            **batch.to(model.device),
            forced_bos_token_id=tokenizer.lang_code_to_id[trg],
            max_length=max_length,
            num_beams=num_beams,
            repetition_penalty=repetition_penalty,
            num_return_sequences=n_out or 1,
            **kwargs
        ),
        skip_special_tokens=True,
    )

    # Unwrap only for the "one string in, one string out" case.
    wants_single = isinstance(text, str) and n_out is None
    return hypotheses[0] if wants_single else hypotheses
# Load the checkpoint and its tokenizer, then register the Erzya language code.
mname = 'slone/mbart-large-51-mul-myv-v1'
model = MBartForConditionalGeneration.from_pretrained(mname)
tokenizer = MBart50Tokenizer.from_pretrained(mname)
fix_tokenizer(tokenizer)

# Each example is (source text, source language code); both are translated
# into Erzya.
for example_text, example_lang in (
    ('Привет, собака!', 'ru_RU'),  # Шумбрат, киска!  (indeed, this is how "dog" is said in Erzya)
    ('Hello, doggy!', 'en_XX'),    # Шумбрат, киска!
):
    print(translate(example_text, model, tokenizer, src=example_lang, trg='myv_XX'))
- Downloads last month
- 119
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.