SCBconsulting committed on
Commit
4615658
Β·
verified Β·
1 Parent(s): 0369c37

Update utils/translator.py

Browse files
Files changed (1) hide show
  1. utils/translator.py +22 -42
utils/translator.py CHANGED
@@ -1,65 +1,45 @@
1
- # utils/translator.py
2
 
3
- import torch
4
- import spacy
5
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
 
7
- # βœ… Load HF translation model (PT-BR)
8
- model_name = "unicamp-dl/translation-en-pt-t5"
9
- tokenizer = AutoTokenizer.from_pretrained(model_name)
10
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
11
 
12
- # βœ… Load spaCy for sentence-aware chunking
13
- try:
14
- nlp = spacy.load("en_core_web_sm")
15
- except OSError:
16
- import spacy.cli
17
- spacy.cli.download("en_core_web_sm")
18
- nlp = spacy.load("en_core_web_sm")
19
 
20
def split_into_chunks(text, max_chunk_len=300):
    """
    Split input into sentence-aware chunks using spaCy.

    Args:
        text: English text to segment (passed through the module-level `nlp`).
        max_chunk_len: Soft character limit per chunk; a single sentence
            longer than this becomes its own chunk (sentences are never split).

    Returns:
        List of non-empty chunk strings.
    """
    doc = nlp(text)
    chunks = []
    current_chunk = ""

    for sent in doc.sents:
        if len(current_chunk) + len(sent.text) < max_chunk_len:
            current_chunk += sent.text + " "
        else:
            # Bug fix: only flush non-empty chunks. Previously, a first
            # sentence longer than max_chunk_len appended "" to `chunks`.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sent.text + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
39
 
40
def translate_text(text):
    """
    Translate contract text from English to Brazilian Portuguese using the
    Hugging Face seq2seq model loaded at module level.

    Cleans whitespace, splits the input into sentence-aware chunks,
    translates each chunk with beam search, and prints elapsed time.

    Args:
        text: English source text; may contain newlines and extra spaces.

    Returns:
        The translated Portuguese text, or "No input to translate."
        for blank input.
    """
    import time  # local import keeps the module import block unchanged

    if not text.strip():
        return "No input to translate."

    # Collapse newlines, tabs, and runs of spaces into single spaces.
    # (The previous `.replace(" ", " ")` was a no-op.)
    text = " ".join(text.split())
    chunks = split_into_chunks(text)
    translated_chunks = []

    start = time.time()
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():  # inference only; no gradients needed
            outputs = model.generate(**inputs, max_length=512, num_beams=4)
        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    elapsed = time.time() - start

    print(f"🕒 Translation took {elapsed:.2f} seconds")
    return " ".join(translated_chunks)
 
1
+ # utils/translate.py
2
 
3
+ from transformers import pipeline
 
 
4
 
5
+ # 🌍 Translation pipeline (English β†’ Portuguese)
6
+ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-pt")
 
 
7
 
8
def clean_text(text):
    """
    Normalize whitespace in *text*.

    Collapses newlines, tabs, and runs of spaces into single spaces and
    strips leading/trailing whitespace.

    Args:
        text: Raw input string.

    Returns:
        The whitespace-normalized string ("" for blank input).
    """
    # str.split() with no argument splits on any whitespace run, so one
    # pass collapses everything. (The previous `.replace(" ", " ")` was
    # a no-op: it replaced a single space with a single space.)
    return " ".join(text.split())
 
 
 
 
 
10
 
11
def chunk_text(text, max_chunk_chars=500):
    """
    Chunk long text into word-boundary segments that fit model constraints.

    Args:
        text: Input string; split on whitespace.
        max_chunk_chars: Soft upper bound on chunk length in characters.
            A single word longer than this becomes its own chunk (words
            are never split mid-word).

    Returns:
        List of non-empty chunk strings ([] for blank input).
    """
    chunks = []
    current_chunk = ""

    for word in text.split():
        if len(current_chunk) + len(word) + 1 <= max_chunk_chars:
            current_chunk += " " + word
        else:
            # Bug fix: only flush non-empty chunks. Previously, a word
            # longer than max_chunk_chars arriving while current_chunk
            # was empty appended "" to `chunks`.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = word

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
29
 
30
def translate_to_portuguese(text):
    """
    Clean, chunk, and translate English text into Portuguese.

    Args:
        text: English source text; may contain newlines and extra spaces.

    Returns:
        The translated Portuguese text joined from per-chunk translations,
        or "No input provided." for blank input.
    """
    if not text.strip():
        return "No input provided."

    chunks = chunk_text(clean_text(text))
    if not chunks:
        return ""

    # Pass all chunks in a single call: Hugging Face pipelines accept a
    # list of inputs and return a list of results, avoiding one model
    # invocation per chunk.
    results = translator(chunks, max_length=512)
    return " ".join(r["translation_text"] for r in results)