Spaces:

SCBconsulting
/

synclm-demo

Running

App Files Files Community

SCBconsulting committed 18 days ago

Commit

be438da

verified ·

1 Parent(s): 15b6b6b

Update utils/translator.py

Browse files

Files changed (1) hide show

utils/translator.py +64 -18

utils/translator.py CHANGED Viewed

@@ -2,35 +2,81 @@
 import os
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")
-# ✅ Better fallback model (Brazilian Portuguese)
 model_name = "unicamp-dl/translation-en-pt-t5"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 def translate_text(text):
     if not text.strip():
         return "No input to translate."
-    try:
-        import requests
-        response = requests.post(
-            "https://api.deepl.com/v2/translate",
-            data={
-                "auth_key": DEEPL_API_KEY,
-                "text": text,
-                "target_lang": "PT-BR"  # 🟢 Brazil-specific
-            },
-        )
-        return response.json()["translations"][0]["text"]
-    except Exception:
-        # 🔁 Use HF fallback model
-        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
         with torch.no_grad():
             outputs = model.generate(**inputs, max_length=512, num_beams=4)
-        return tokenizer.decode(outputs[0], skip_special_tokens=True)

 import os
 import torch
+import spacy
+import requests
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+# Optional DeepL API key
 DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")
+# ✅ Hugging Face fallback model (PT-BR)
 model_name = "unicamp-dl/translation-en-pt-t5"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+# ✅ Load spaCy English model for sentence parsing
+try:
+    nlp = spacy.load("en_core_web_sm")
+except OSError:
+    # For Hugging Face Spaces: auto-download if missing
+    import spacy.cli
+    spacy.cli.download("en_core_web_sm")
+    nlp = spacy.load("en_core_web_sm")
+def split_into_chunks(text, max_chunk_len=500):
+    """
+    Split text into sentence-aligned chunks using spaCy.
+
+    Sentences are accumulated (space-separated) into a chunk while the
+    running length stays below ``max_chunk_len`` characters; the next
+    sentence then starts a new chunk. Sentences are never cut, so a single
+    sentence longer than ``max_chunk_len`` becomes its own oversized chunk.
+
+    Returns a list of non-empty, stripped chunk strings (empty list if the
+    text yields no non-blank sentences).
+    """
+    chunks = []
+    current_chunk = ""
+    for sent in nlp(text).sents:
+        if len(current_chunk) + len(sent.text) < max_chunk_len:
+            current_chunk += sent.text + " "
+            continue
+        # Flush the accumulated chunk. It is blank when the very first
+        # sentence already exceeds the limit, so guard against emitting "".
+        flushed = current_chunk.strip()
+        if flushed:
+            chunks.append(flushed)
+        current_chunk = sent.text + " "
+    # Emit whatever is left, unless it is only whitespace.
+    tail = current_chunk.strip()
+    if tail:
+        chunks.append(tail)
+    return chunks
 def translate_text(text):
+    """
+    Translate text to Brazilian Portuguese.
+
+    DeepL is tried first when DEEPL_API_KEY is set. If the key is missing or
+    the DeepL call fails for any reason (network error, timeout, HTTP error,
+    unexpected payload), the text is split into sentence chunks and
+    translated chunk by chunk with the local Hugging Face model.
+
+    Returns the translated string, or "No input to translate." when the
+    input is empty or whitespace-only.
+    """
     if not text.strip():
         return "No input to translate."
+    # ✅ Try DeepL first if available
+    if DEEPL_API_KEY:
+        try:
+            response = requests.post(
+                "https://api.deepl.com/v2/translate",
+                data={
+                    "auth_key": DEEPL_API_KEY,
+                    "text": text,
+                    "target_lang": "PT-BR"  # 🇧🇷 Specific for Brazil
+                },
+                # requests never times out by default; without this a stalled
+                # connection would hang forever instead of falling back.
+                timeout=30,
+            )
+            # Turn 4xx/5xx into an exception so the log shows the HTTP status
+            # rather than a KeyError from the missing "translations" field.
+            response.raise_for_status()
+            return response.json()["translations"][0]["text"]
+        except Exception as e:
+            print("⚠️ DeepL failed, falling back to Hugging Face:", str(e))
+    # ✅ Use Hugging Face fallback model with spaCy chunking
+    translated_chunks = []
+    for chunk in split_into_chunks(text):
+        if not chunk:
+            # Nothing to translate; avoid running generation on an empty string.
+            continue
+        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
         with torch.no_grad():
             outputs = model.generate(**inputs, max_length=512, num_beams=4)
+        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
+    return " ".join(translated_chunks)