Update utils/translator.py
utils/translator.py  +15 -6  CHANGED
```diff
@@ -2,14 +2,14 @@
 
 import os
 import torch
-import spacy
 import requests
+import spacy
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
-# Optional DeepL API key
+# ✅ Optional DeepL API key
 DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")
 
-# ✅ Hugging Face fallback model (
+# ✅ Hugging Face fallback model (Brazilian Portuguese)
 model_name = "unicamp-dl/translation-en-pt-t5"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
@@ -18,7 +18,6 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 try:
     nlp = spacy.load("en_core_web_sm")
 except OSError:
-    # For Hugging Face Spaces: auto-download if missing
     import spacy.cli
     spacy.cli.download("en_core_web_sm")
     nlp = spacy.load("en_core_web_sm")
@@ -48,13 +47,17 @@ def split_into_chunks(text, max_chunk_len=500):
 def translate_text(text):
     """
     Translate full contract using DeepL (if available) or fallback Hugging Face model.
+    Includes chunking and timing.
     """
     if not text.strip():
         return "No input to translate."
 
-    #
+    import time  # ⏱️
+
+    # ✅ Try DeepL first
     if DEEPL_API_KEY:
         try:
+            start = time.time()
             response = requests.post(
                 "https://api.deepl.com/v2/translate",
                 data={
@@ -63,20 +66,26 @@ def translate_text(text):
                     "target_lang": "PT-BR"  # 🇧🇷 Specific for Brazil
                 },
             )
+            end = time.time()
+            print(f"🕒 DeepL Translation took {end - start:.2f} seconds")
             return response.json()["translations"][0]["text"]
         except Exception as e:
             print("⚠️ DeepL failed, falling back to Hugging Face:", str(e))
 
-    # ✅
+    # ✅ Hugging Face fallback with sentence chunking
     chunks = split_into_chunks(text)
     translated_chunks = []
 
+    start = time.time()
     for chunk in chunks:
         inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
         with torch.no_grad():
             outputs = model.generate(**inputs, max_length=512, num_beams=4)
         translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
         translated_chunks.append(translated)
+    end = time.time()
 
+    print(f"🕒 Hugging Face Translation took {end - start:.2f} seconds")
     return " ".join(translated_chunks)
 
+
```
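The hunks above elide part of the `data` payload (old lines 61–62 fall between hunks). For readers following along, here is a minimal sketch of what that request plausibly contains, based on DeepL's documented v2 form fields; the `auth_key` and `text` fields are assumptions, since only `target_lang` is visible in the diff.

```python
import os

import requests

DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")

def deepl_translate(text: str) -> str:
    """Sketch of the elided DeepL call; field names follow DeepL's v2 API docs."""
    response = requests.post(
        "https://api.deepl.com/v2/translate",
        data={
            "auth_key": DEEPL_API_KEY,  # assumed: legacy form-field authentication
            "text": text,               # assumed: the contract text being translated
            "target_lang": "PT-BR",     # visible in the diff
        },
        timeout=30,
    )
    response.raise_for_status()  # fail loudly before indexing into the JSON
    return response.json()["translations"][0]["text"]
```

The `raise_for_status()` call is an addition over what the diff shows; without it, a non-200 reply surfaces as a confusing `KeyError` on `"translations"` rather than an HTTP error.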
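`split_into_chunks(text, max_chunk_len=500)` appears only in a hunk header, so its body is not part of this commit's context. Given that the module loads spaCy's `en_core_web_sm` and the new comment says "sentence chunking", a plausible reconstruction packs whole sentences greedily up to the character limit; this sketch is an assumption, not the repo's actual implementation.

```python
import spacy

nlp = spacy.load("en_core_web_sm")

def split_into_chunks(text, max_chunk_len=500):
    """Hypothetical sketch: greedy sentence packing under a character budget."""
    sentences = [sent.text.strip() for sent in nlp(text).sents]
    chunks, current = [], ""
    for sentence in sentences:
        # Start a new chunk once appending this sentence would exceed the budget.
        if current and len(current) + len(sentence) + 1 > max_chunk_len:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks
```

Chunking on sentence boundaries matters here because the tokenizer truncates at `max_length=512`; splitting mid-sentence would silently drop or mistranslate clauses, which is costly for contract text.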
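And a quick usage sketch; the import path follows the file path in the commit header, and the sample sentence is illustrative only.

```python
# Assumes the Space's repo root is on sys.path, per the utils/translator.py path above.
from utils.translator import translate_text

print(translate_text("This Agreement shall be governed by the laws of Brazil."))
```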