---
datasets:
- alexjerpelea/AroTranslate-rup-ron-dataset
language:
- ro
- en
license: cc-by-nc-4.0
tags:
- aromanian
- macedo-romanian
---

This model is an extension of [the first coherent Aromanian translator](https://huggingface.co/alexjerpelea/NLLB-aromanian-romanian-v1).
It is a [NLLB-200-600M](https://huggingface.co/facebook/nllb-200-distilled-600M) model fine-tuned for translating between any two languages out of: Aromanian, Romanian & English, using this [dataset](https://huggingface.co/datasets/alexjerpelea/aromanian-romanian-MT-corpus), which was artificially extended with Google Translate API.

Read more about AroTranslate at [this GitHub repository](https://github.com/lolismek/AroTranslate.git).

We present the following results:

| | ron -> rup | rup -> ron | rup -> eng | eng -> rup | ron -> eng | eng -> ron |
|:----|:-----|:-----|:----|:-----|:-----|:-----|
| BLEU | 33.18 | 54.36 | 51.25 | 25.16 | 66.96 | 52.16 |
| ChrF2++ | 59.47 | 68.54 | 66.13 | 52.68 | 78.84 | 70.34 |

Note:
* As Aromanian does not have a standard writing system, please see code below for text normalization.
* This model was trained for production, being able to handle absence of diacritics. We do however recommend using them.

How to use the model:

```py
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, NllbTokenizer
import re

# load model and tokenizer:
model = AutoModelForSeq2SeqLM.from_pretrained('alexjerpelea/NLLB-aromanian-romanian-english')
tokenizer = AutoTokenizer.from_pretrained('alexjerpelea/NLLB-aromanian-romanian-english')


# translate function:
def translate(
    text,
    src_lang='ron_Latn',
    tgt_lang='rup_Latn',
    a=32,
    b=3,
    max_input_length=1024,
    num_beams=4,
    **kwargs
):
    """Translate `text` from `src_lang` to `tgt_lang` with the loaded model.

    Args:
        text: Input passed straight to the tokenizer (presumably a string or
            a list of strings, since padding is enabled).
        src_lang: Source language code, e.g. 'ron_Latn', 'rup_Latn', 'eng_Latn'.
        tgt_lang: Target language code; its token is forced as the first
            generated token.
        a, b: The output budget is `a + b * input_length` new tokens.
        max_input_length: Inputs longer than this many tokens are truncated.
        num_beams: Beam-search width.
        **kwargs: Forwarded unchanged to `model.generate`.

    Returns:
        The decoded translations, one per input sequence.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length
    )
    model.eval()
    result = model.generate(
        **inputs.to(model.device),
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(result, skip_special_tokens=True)


# Replacement tables for clean_text. Each table is applied in order,
# one str.replace per (old, new) pair.

# Romanian: cedilla variants -> the correct comma-below letters.
_RON_REPLACEMENTS = (
    ('Ş', 'Ș'), ('ş', 'ș'),
    ('Ţ', 'Ț'), ('ţ', 'ț'),
)

# Aromanian: map the different orthographies to a single one.
_RUP_REPLACEMENTS = (
    ('ş', 'sh'), ('ș', 'sh'),
    ('ţ', 'ts'), ('ț', 'ts'),
    ('Ş', 'Sh'), ('Ș', 'Sh'),
    ('Ţ', 'Ts'), ('Ț', 'Ts'),
    ('ľ', 'lj'), ('Ľ', 'Lj'),
    ("l'", 'lj'), ('l’', 'lj'),
    ("L'", 'Lj'), ('L’', 'Lj'),
    ('ḑ', 'dz'), ('Ḑ', 'Dz'),
    ('ḍ', 'dz'), ('Ḍ', 'Dz'),
    # TODO: add n'
    ('ń', 'nj'), ('Ń', 'Nj'),
    ('ñ', 'nj'), ('Ñ', 'Nj'),
    ('ă', 'ã'), ('Â', 'Ã'),
    ('â', 'ã'), ('Ă', 'Ã'),
    ('á', 'ã'), ('à', 'ã'),
    ('Á', 'Ã'), ('À', 'Ã'),
    ('Î', 'Ã'), ('î', 'ã'),
    # weird foreign characters
    ('ŭ', 'u'), ('ς', 'c'), ('é', 'e'), ('í', 'i'),
    ('ū', 'u'), ('ì', 'i'), ('ā', 'a'), ('ĭ', 'i'),
    ('γ', 'y'), ('ï', 'i'), ('ó', 'o'), ('θ', 'O'),
)

# Applied to every language: punctuation and stray characters.
_COMMON_REPLACEMENTS = (
    ('—', '-'), ('–', '-'),
    ('…', '...'),
    ('*', ''), ('<', ''), ('>', ''),
    ('„', '"'), ('”', '"'), ('“', '"'),
    ('\xa0', ''), ('\ufeff', ''), ('\n', ''),
)


def clean_text(text, lang):
    """Normalize `text` before translation.

    Args:
        text: The sentence to normalize. Float values are returned unchanged
            (presumably NaN placeholders for missing rows - confirm against
            your data).
        lang: 'ron' applies the Romanian rules; any other value applies the
            Aromanian rules.

    Returns:
        The normalized string.
    """
    if isinstance(text, float):
        return text

    # consecutive spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # old romanian î in the middle of the word
    text = re.sub(r'(?<=\w)î(?=\w)', 'â', text)

    language_rules = _RON_REPLACEMENTS if lang == 'ron' else _RUP_REPLACEMENTS
    for old, new in language_rules:
        text = text.replace(old, new)

    # for both languages:
    for old, new in _COMMON_REPLACEMENTS:
        text = text.replace(old, new)

    return text


# Aromanian to Romanian:
t = '''Trã atsea cãdzu pri mare cripare, shi tutã dzua stãtea ãnvirinat.'''
t = clean_text(t, 'rup')
print(translate(t, 'rup_Latn', 'ron_Latn'))

# Romanian to Aromanian:
t = '''Apoi se opri puțin, o sorbi din ochi, o sărută și - când începu să scâncească, îi cântă iar:'''
t = clean_text(t, 'ron')
print(translate(t, 'ron_Latn', 'rup_Latn'))

# Aromanian to English:
t = '''Cã a ta boatsi e birbil ti suflitu a meu.'''
t = clean_text(t, 'rup')
print(translate(t, 'rup_Latn', 'eng_Latn'))

# English to Aromanian:
t = '''That your voice is the nightingale of my soul.'''
print(translate(t, 'eng_Latn', 'rup_Latn'))
```

## License

Creative Commons License
This work is licensed under a [Creative Commons Attribution-NonCommercial 4.0 International License](https://creativecommons.org/licenses/by-nc/4.0/). When using this work, please refer to it by the name "AroTranslate" and credit the author.