SCBconsulting committed on
Commit
be438da
·
verified ·
1 Parent(s): 15b6b6b

Update utils/translator.py

Browse files
Files changed (1) hide show
  1. utils/translator.py +64 -18
utils/translator.py CHANGED
@@ -2,35 +2,81 @@
2
 
3
  import os
4
  import torch
 
 
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
 
 
# DeepL credential, read from the environment (None when the variable is unset).
DEEPL_API_KEY = os.environ.get("DEEPL_API_KEY")

# Fallback translation checkpoint (Brazilian Portuguese), loaded once at import time.
model_name = "unicamp-dl/translation-en-pt-t5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def translate_text(text):
    """Translate *text* to Brazilian Portuguese.

    DeepL is tried first when ``DEEPL_API_KEY`` is set. If the DeepL call
    fails for any reason (missing ``requests``, network error, timeout, HTTP
    error status, unexpected payload) the module-level Hugging Face
    ``tokenizer``/``model`` pair is used instead.

    Args:
        text: Text to translate. ``None``, empty and whitespace-only input
            are treated as "nothing to translate".

    Returns:
        The translated text, or the literal message
        ``"No input to translate."`` when there is nothing to translate.

    Note:
        The fallback tokenizes the whole input in one pass with
        ``truncation=True`` and ``max_length=512``, so anything beyond 512
        tokens is dropped by the tokenizer.
    """
    if not text or not text.strip():
        return "No input to translate."

    # Without a key the request can only fail, so skip the network round trip.
    if DEEPL_API_KEY:
        try:
            import requests

            response = requests.post(
                "https://api.deepl.com/v2/translate",
                # Header-based auth is the documented DeepL scheme.
                headers={"Authorization": "DeepL-Auth-Key " + DEEPL_API_KEY},
                data={
                    "text": text,
                    "target_lang": "PT-BR",  # Brazil-specific variant
                },
                # requests has no default timeout; never hang on a stalled call.
                timeout=30,
            )
            response.raise_for_status()
            return response.json()["translations"][0]["text"]
        except Exception as e:
            # Deliberately broad: any DeepL problem must degrade to the local
            # model instead of failing the translation, but it is reported.
            print("⚠️ DeepL failed, falling back to Hugging Face:", str(e))

    # Hugging Face fallback model.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=512, num_beams=4)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
2
 
3
  import os
4
  import torch
5
+ import spacy
6
+ import requests
7
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
8
 
# DeepL is optional: the key is None when the environment variable is unset.
DEEPL_API_KEY = os.environ.get("DEEPL_API_KEY")

# Hugging Face fallback checkpoint (PT-BR); tokenizer and model are loaded
# once, when this module is imported.
model_name = "unicamp-dl/translation-en-pt-t5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
16
 
# spaCy English pipeline used for sentence parsing.
_SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(_SPACY_MODEL)
except OSError:
    # The pipeline package is missing (e.g. on a fresh Hugging Face Space):
    # download it, then load it.
    import spacy.cli

    spacy.cli.download(_SPACY_MODEL)
    nlp = spacy.load(_SPACY_MODEL)
25
+
26
+
def split_into_chunks(text, max_chunk_len=500):
    """Group the sentences of *text* into chunks of bounded length.

    The text is segmented into sentences by the module-level spaCy pipeline
    ``nlp``. Consecutive sentences are packed greedily into one chunk while
    the chunk stays shorter than ``max_chunk_len`` characters.

    Args:
        text: Text to split.
        max_chunk_len: Soft upper bound on chunk length, in characters. A
            single sentence that does not fit is never cut: it becomes a
            chunk of its own and may exceed this bound.

    Returns:
        A list of non-empty, whitespace-stripped chunks in document order.
        The list is empty when the text contains no sentences.
    """
    chunks = []
    pending = []     # sentences of the chunk currently being built
    pending_len = 0  # their total length, counting one separator each

    for sent in nlp(text).sents:
        sentence = sent.text
        # Close the current chunk only when it actually holds something;
        # otherwise an oversized first sentence would emit an empty chunk.
        if pending and pending_len + len(sentence) >= max_chunk_len:
            chunk = " ".join(pending).strip()
            if chunk:
                chunks.append(chunk)
            pending = []
            pending_len = 0
        pending.append(sentence)
        pending_len += len(sentence) + 1

    tail = " ".join(pending).strip()
    if tail:
        chunks.append(tail)

    return chunks
46
+
47
+
def translate_text(text):
    """Translate a full contract text to Brazilian Portuguese.

    DeepL is tried first when ``DEEPL_API_KEY`` is set. If the DeepL call
    fails for any reason (network error, timeout, HTTP error status,
    unexpected payload) the text is split with ``split_into_chunks`` and each
    chunk is translated by the module-level Hugging Face
    ``tokenizer``/``model`` pair.

    Args:
        text: Text to translate. ``None``, empty and whitespace-only input
            are treated as "nothing to translate".

    Returns:
        The translated text (fallback chunks are joined with single spaces),
        or the literal message ``"No input to translate."`` when there is
        nothing to translate.
    """
    if not text or not text.strip():
        return "No input to translate."

    # Try DeepL first if a key is configured.
    if DEEPL_API_KEY:
        try:
            # DeepL API Free keys end in ":fx" and are only accepted by the
            # api-free host; all other keys go to the regular host.
            if DEEPL_API_KEY.endswith(":fx"):
                host = "api-free.deepl.com"
            else:
                host = "api.deepl.com"
            response = requests.post(
                "https://" + host + "/v2/translate",
                # Header-based auth is the documented DeepL scheme.
                headers={"Authorization": "DeepL-Auth-Key " + DEEPL_API_KEY},
                data={
                    "text": text,
                    "target_lang": "PT-BR",  # Brazil-specific variant
                },
                # requests has no default timeout; never hang on a stalled call.
                timeout=30,
            )
            response.raise_for_status()
            return response.json()["translations"][0]["text"]
        except Exception as e:
            # Deliberately broad: any DeepL problem must degrade to the local
            # model instead of failing the translation.
            print("⚠️ DeepL failed, falling back to Hugging Face:", str(e))

    # Hugging Face fallback model with spaCy chunking.
    translated_chunks = []
    for chunk in split_into_chunks(text):
        if not chunk:
            # Never feed an empty string to the model.
            continue
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=512, num_beams=4)
        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    return " ".join(translated_chunks)
82
+