SCBconsulting committed on
Commit
32c204d
·
verified ·
1 Parent(s): 2ceb2ac

Update utils/translator.py

Browse files
Files changed (1) hide show
  1. utils/translator.py +24 -3
utils/translator.py CHANGED
@@ -1,9 +1,30 @@
1
  # utils/translate.py
2
 
3
- from transformers import pipeline
4
 
5
- # 🌍 Translation pipeline (English → Portuguese)
6
- translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-pt")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def clean_text(text):
9
  return text.replace("\n", " ").replace(" ", " ").strip()
 
1
  # utils/translate.py
2
 
3
+ # utils/translator.py
4
 
5
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
+ import torch
7
+
8
# 🧠 Load the English → Portuguese T5 translation model and its tokenizer once,
# at import time, so every translate_text() call reuses the same instances.
# NOTE(review): the original comment described this as a "formal Brazilian
# Portuguese legal translator"; the checkpoint name itself only indicates
# general en→pt translation — confirm suitability for legal text.
model_name = "unicamp-dl/translation-en-pt-t5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# T5 checkpoints select their task through a text prefix; this is the prefix
# shown in the model card's usage example for this checkpoint.
_TASK_PREFIX = "translate English to Portuguese: "
# Upper bound on the characters of source text sent to the model per call.
_MAX_CHUNK_CHARS = 512
# Token budget used for both tokenizer truncation and generation.
_MAX_TOKENS = 512


def _split_into_chunks(text, max_chars):
    """Pack whitespace-separated words of *text* into chunks of <= max_chars."""
    chunks = []
    current = ""
    for word in text.split():
        # A single "word" longer than the limit cannot be kept whole, so it
        # is the only case where a hard character split is still used.
        while len(word) > max_chars:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_chars])
            word = word[max_chars:]
        if not word:
            continue
        candidate = current + " " + word if current else word
        if len(candidate) > max_chars:
            chunks.append(current)
            current = word
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks


def translate_text(text):
    """Translate English *text* to Portuguese with the module-level model.

    The input is flattened to a single line, split into chunks of at most
    512 characters on word boundaries (the previous fixed-width slicing cut
    words in half at chunk edges), and each chunk is translated
    independently with beam search. The translated chunks are joined with
    single spaces.

    Args:
        text: English source text. ``None``, empty, or whitespace-only input
            is treated as "no input".

    Returns:
        The Portuguese translation, or the literal string
        ``"No input provided."`` when there is nothing to translate.
    """
    if not text or not text.strip():
        return "No input provided."

    translated = []
    for chunk in _split_into_chunks(text, _MAX_CHUNK_CHARS):
        inputs = tokenizer(
            _TASK_PREFIX + chunk,
            return_tensors="pt",
            truncation=True,
            max_length=_MAX_TOKENS,
        )
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=_MAX_TOKENS, num_beams=4)
        translated.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    return " ".join(translated)
28
 
29
  def clean_text(text):
30
  return text.replace("\n", " ").replace(" ", " ").strip()