Update utils/translator.py
utils/translator.py  +15 -6  CHANGED
```diff
@@ -2,14 +2,14 @@
 
 import os
 import torch
-import spacy
 import requests
+import spacy
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
-# Optional DeepL API key
+# ✅ Optional DeepL API key
 DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")
 
-# ✅ Hugging Face fallback model (
+# ✅ Hugging Face fallback model (Brazilian Portuguese)
 model_name = "unicamp-dl/translation-en-pt-t5"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
@@ -18,7 +18,6 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 try:
     nlp = spacy.load("en_core_web_sm")
 except OSError:
-    # For Hugging Face Spaces: auto-download if missing
     import spacy.cli
     spacy.cli.download("en_core_web_sm")
     nlp = spacy.load("en_core_web_sm")
@@ -48,13 +47,17 @@ def split_into_chunks(text, max_chunk_len=500):
 def translate_text(text):
     """
     Translate full contract using DeepL (if available) or fallback Hugging Face model.
+    Includes chunking and timing.
     """
     if not text.strip():
         return "No input to translate."
 
-    #
+    import time  # ⏱️
+
+    # ✅ Try DeepL first
     if DEEPL_API_KEY:
         try:
+            start = time.time()
             response = requests.post(
                 "https://api.deepl.com/v2/translate",
                 data={
@@ -63,20 +66,26 @@ def translate_text(text):
                     "target_lang": "PT-BR"  # 🇧🇷 Specific for Brazil
                 },
             )
+            end = time.time()
+            print(f"🕒 DeepL Translation took {end - start:.2f} seconds")
             return response.json()["translations"][0]["text"]
         except Exception as e:
             print("⚠️ DeepL failed, falling back to Hugging Face:", str(e))
 
-    # ✅
+    # ✅ Hugging Face fallback with sentence chunking
     chunks = split_into_chunks(text)
     translated_chunks = []
 
+    start = time.time()
     for chunk in chunks:
         inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
         with torch.no_grad():
             outputs = model.generate(**inputs, max_length=512, num_beams=4)
         translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
         translated_chunks.append(translated)
+    end = time.time()
 
+    print(f"🕒 Hugging Face Translation took {end - start:.2f} seconds")
     return " ".join(translated_chunks)
 
+
```
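The hunks above elide part of the `data` payload (old lines 61–62 fall between hunks). For readers following along, here is a minimal sketch of what that request plausibly contains, based on DeepL's documented v2 form fields; the `auth_key` and `text` fields are assumptions, since only `target_lang` is visible in the diff.

```python
import os

import requests

DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")

def deepl_translate(text: str) -> str:
    """Sketch of the elided DeepL call; field names follow DeepL's v2 API docs."""
    response = requests.post(
        "https://api.deepl.com/v2/translate",
        data={
            "auth_key": DEEPL_API_KEY,  # assumed: legacy form-field authentication
            "text": text,               # assumed: the contract text being translated
            "target_lang": "PT-BR",     # visible in the diff
        },
        timeout=30,
    )
    response.raise_for_status()  # fail loudly before indexing into the JSON
    return response.json()["translations"][0]["text"]
```

The `raise_for_status()` call is an addition over what the diff shows; without it, a non-200 reply surfaces as a confusing `KeyError` on `"translations"` rather than an HTTP error.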
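`split_into_chunks(text, max_chunk_len=500)` appears only in a hunk header, so its body is not part of this commit's context. Given that the module loads spaCy's `en_core_web_sm` and the new comment says "sentence chunking", a plausible reconstruction packs whole sentences greedily up to the character limit; this sketch is an assumption, not the repo's actual implementation.

```python
import spacy

nlp = spacy.load("en_core_web_sm")

def split_into_chunks(text, max_chunk_len=500):
    """Hypothetical sketch: greedy sentence packing under a character budget."""
    sentences = [sent.text.strip() for sent in nlp(text).sents]
    chunks, current = [], ""
    for sentence in sentences:
        # Start a new chunk once appending this sentence would exceed the budget.
        if current and len(current) + len(sentence) + 1 > max_chunk_len:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks
```

Chunking on sentence boundaries matters here because the tokenizer truncates at `max_length=512`; splitting mid-sentence would silently drop or mistranslate clauses, which is costly for contract text.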
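And a quick usage sketch; the import path follows the file path in the commit header, and the sample sentence is illustrative only.

```python
# Assumes the Space's repo root is on sys.path, per the utils/translator.py path above.
from utils.translator import translate_text

print(translate_text("This Agreement shall be governed by the laws of Brazil."))
```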