Spaces:

flutter-painter
/

nllb-fra-fuf-v2

Runtime error

App Files Files Community

flutter-painter commited on Jan 27, 2024

Commit

be86b23

verified ·

1 Parent(s): 5a4a746

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -172

app.py CHANGED Viewed

@@ -1,182 +1,53 @@
-import re
-import sys
-import typing as tp
-import unicodedata
-import torch
-from sacremoses import MosesPunctNormalizer
-from sentence_splitter import SentenceSplitter
-from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
-MODEL_URL = "flutter-painter/nllb-fra-fuf-v2"
-LANGUAGES = {
-    "French": "fra_Latn",
-    "Fula": "fuf_Latn",
-}
-def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
-    non_printable_map = {
-        ord(c): replace_by
-        for c in (chr(i) for i in range(sys.maxunicode + 1))
-        # same as \p{C} in perl
-        # see https://www.unicode.org/reports/tr44/#General_Category_Values
-        if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
-    }
-    def replace_non_printing_char(line) -> str:
-        return line.translate(non_printable_map)
-    return replace_non_printing_char
-class TextPreprocessor:
-    """
-    Mimic the text preprocessing made for the NLLB model.
-    This code is adapted from the Stopes repo of the NLLB team:
-    https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L214
-    """
-    def __init__(self, lang="en"):
-        self.mpn = MosesPunctNormalizer(lang=lang)
-        self.mpn.substitutions = [
-            (re.compile(r), sub) for r, sub in self.mpn.substitutions
-        ]
-        self.replace_nonprint = get_non_printing_char_replacer(" ")
-    def __call__(self, text: str) -> str:
-        clean = self.mpn.normalize(text)
-        clean = self.replace_nonprint(clean)
-        # replace 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 by Francesca
-        clean = unicodedata.normalize("NFKC", clean)
-        return clean
-def fix_tokenizer(tokenizer, new_lang="fuf_Latn"):
-    """Add a new language token to the tokenizer vocabulary
-    (this should be done each time after its initialization)
-    """
-    old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
-    tokenizer.lang_code_to_id[new_lang] = old_len - 1
-    tokenizer.id_to_lang_code[old_len - 1] = new_lang
-    # always move "mask" to the last position
-    tokenizer.fairseq_tokens_to_ids["<mask>"] = (
-        len(tokenizer.sp_model)
-        + len(tokenizer.lang_code_to_id)
-        + tokenizer.fairseq_offset
     )
-    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
-    tokenizer.fairseq_ids_to_tokens = {
-        v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()
-    }
-    if new_lang not in tokenizer._additional_special_tokens:
-        tokenizer._additional_special_tokens.append(new_lang)
-    # clear the added token encoder; otherwise a new token may end up there by mistake
-    tokenizer.added_tokens_encoder = {}
-    tokenizer.added_tokens_decoder = {}
-def sentenize_with_fillers(text, splitter, fix_double_space=True, ignore_errors=False):
-    """Apply a sentence splitter and return the sentences and all separators before and after them"""
-    if fix_double_space:
-        text = re.sub(" +", " ", text)
-    sentences = splitter.split(text)
-    fillers = []
-    i = 0
-    for sentence in sentences:
-        start_idx = text.find(sentence, i)
-        if ignore_errors and start_idx == -1:
-            # print(f"sent not found after {i}: `{sentence}`")
-            start_idx = i + 1
-        assert start_idx != -1, f"sent not found after {i}: `{sentence}`"
-        fillers.append(text[i:start_idx])
-        i = start_idx + len(sentence)
-    fillers.append(text[i:])
-    return sentences, fillers
-class Translator:
-    def __init__(self):
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_URL)
-        if torch.cuda.is_available():
-            self.model.cuda()
-        self.tokenizer = NllbTokenizer.from_pretrained(MODEL_URL)
-        fix_tokenizer(self.tokenizer)
-        self.splitter = SentenceSplitter("fr")
-        self.preprocessor = TextPreprocessor()
-        self.languages = LANGUAGES
-    def translate(
-        self,
-        text,
-        src_lang="fra_Latn",
-        tgt_lang="fuf_Latn",
-        max_length="auto",
-        num_beams=4,
-        by_sentence=True,
-        preprocess=True,
-        **kwargs,
-    ):
-        """Translate a text sentence by sentence, preserving the fillers around the sentences."""
-        if by_sentence:
-            sents, fillers = sentenize_with_fillers(
-                text, splitter=self.splitter, ignore_errors=True
-            )
-        else:
-            sents = [text]
-            fillers = ["", ""]
-        if preprocess:
-            sents = [self.preprocessor(sent) for sent in sents]
-        results = []
-        for sent, sep in zip(sents, fillers):
-            results.append(sep)
-            results.append(
-                self.translate_single(
-                    sent,
-                    src_lang=src_lang,
-                    tgt_lang=tgt_lang,
-                    max_length=max_length,
-                    num_beams=num_beams,
-                    **kwargs,
-                )
-            )
-        results.append(fillers[-1])
-        return "".join(results)
-    def translate_single(
-        self,
-        text,
-        src_lang="fra_Latn",
-        tgt_lang="fuf_Latn",
-        max_length="auto",
-        num_beams=4,
-        n_out=None,
-        **kwargs,
-    ):
-        self.tokenizer.src_lang = src_lang
-        encoded = self.tokenizer(
-            text, return_tensors="pt", truncation=True, max_length=512
-        )
-        if max_length == "auto":
-            max_length = int(32 + 2.0 * encoded.input_ids.shape[1])
-        generated_tokens = self.model.generate(
-            **encoded.to(self.model.device),
-            forced_bos_token_id=self.tokenizer.lang_code_to_id[tgt_lang],
-            max_length=max_length,
-            num_beams=num_beams,
-            num_return_sequences=n_out or 1,
-            **kwargs,
-        )
-        out = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-        if isinstance(text, str) and n_out is None:
-            return out[0]
-        return out
-if __name__ == "__main__":
-    print("Initializing a translator to pre-download models...")
     translator = Translator()
-    print("Initialization successful!")

+import gradio as gr
+from translation import Translator, LANGUAGES
+LANGUAGES_LIST = list(LANGUAGES.keys())
+def translate_wrapper(text, src, trg, by_sentence=True, preprocess=True, random=False, num_beams=4):
+    src_lang = LANGUAGES.get(src)
+    tgt_lang = LANGUAGES.get(trg)
+    # if src == trg:
+    #     return 'Please choose two different languages'
+    result = translator.translate(
+        text=text,
+        src_lang=src_lang,
+        tgt_lang=tgt_lang,
+        do_sample=random,
+        num_beams=int(num_beams),
+        by_sentence=by_sentence,
+        preprocess=preprocess,
     )
+    return result
+article = """
+This is a NLLB-200-600M model fine-tuned for translation between French and Fula (Guinea) languages,
+This model is described in https://cointegrated.medium.com/a37fc706b865.
+"""
+interface = gr.Interface(
+    translate_wrapper,
+    [
+        gr.Textbox(label="Text", lines=2, placeholder='text to translate '),
+        gr.Dropdown(LANGUAGES_LIST, type="value", label='source language', value=LANGUAGES_LIST[0]),
+        gr.Dropdown(LANGUAGES_LIST, type="value", label='target language', value=LANGUAGES_LIST[1]),
+        gr.Checkbox(label="by sentence", value=True),
+        gr.Checkbox(label="text preprocesing", value=True),
+        gr.Checkbox(label="randomize", value=False),
+        gr.Dropdown([1, 2, 3, 4, 5], label="number of beams", value=4),
+    ],
+    "text",
+    title='Tyvan-Russian translaton',
+    article=article,
+)
+if __name__ == '__main__':
     translator = Translator()
+    interface.launch()