SCBconsulting committed
Commit 7aa9e67 · verified · 1 Parent(s): d63338c

Update utils/translator.py

Files changed (1)
  1. utils/translator.py +67 -22
utils/translator.py CHANGED
@@ -2,34 +2,26 @@
 
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
+from docx import Document
 
-# 🧠 Load formal Brazilian Portuguese legal translator
-model_name = "unicamp-dl/translation-en-pt-t5"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+# ========== Load EN → PT model ==========
+en_pt_model_name = "unicamp-dl/translation-en-pt-t5"
+tokenizer_en_pt = AutoTokenizer.from_pretrained(en_pt_model_name)
+model_en_pt = AutoModelForSeq2SeqLM.from_pretrained(en_pt_model_name)
 
-def translate_text(text):
-    if not text.strip():
-        return "No input provided."
-
-    text = text.replace("\n", " ").strip()
-    chunks = [text[i:i + 512] for i in range(0, len(text), 512)]
-    translated = []
-
-    for chunk in chunks:
-        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
-        with torch.no_grad():
-            outputs = model.generate(**inputs, max_length=512, num_beams=4)
-        translated.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
+# ========== Load PT → EN model ==========
+pt_en_model_name = "unicamp-dl/translation-pt-en-t5"
+tokenizer_pt_en = AutoTokenizer.from_pretrained(pt_en_model_name)
+model_pt_en = AutoModelForSeq2SeqLM.from_pretrained(pt_en_model_name)
 
-    return " ".join(translated)
+# ========== Text Cleaning & Chunking ==========
 
 def clean_text(text):
     return text.replace("\n", " ").replace("  ", " ").strip()
 
 def chunk_text(text, max_chunk_chars=500):
     """
-    🔪 Chunk long text into segments that fit model constraints.
+    🔪 Break long input into token-safe chunks.
     """
     words = text.split()
     chunks = []
@@ -46,9 +38,32 @@ def chunk_text(text, max_chunk_chars=500):
 
     return chunks
 
+# ========== Translation Functions ==========
+
 def translate_to_portuguese(text):
     """
-    🌐 Clean, chunk, and translate English text into Portuguese.
+    🇺🇸 ➡️ 🇧🇷 Translate English to Portuguese.
+    """
+    if not text.strip():
+        return "No input provided."
+
+    text = clean_text(text)
+    chunks = chunk_text(text)
+
+    translated_chunks = []
+    for chunk in chunks:
+        inputs = tokenizer_en_pt(chunk, return_tensors="pt", truncation=True, padding=True)
+        with torch.no_grad():
+            outputs = model_en_pt.generate(**inputs, max_length=512, num_beams=4)
+        translated = tokenizer_en_pt.decode(outputs[0], skip_special_tokens=True)
+        translated_chunks.append(translated)
+
+    return " ".join(translated_chunks)
+
+
+def translate_to_english(text):
+    """
+    🇧🇷 ➡️ 🇺🇸 Translate Portuguese to English.
     """
     if not text.strip():
         return "No input provided."
@@ -58,7 +73,37 @@ def translate_to_portuguese(text):
 
     translated_chunks = []
     for chunk in chunks:
-        result = translator(chunk, max_length=512)
-        translated_chunks.append(result[0]["translation_text"])
+        inputs = tokenizer_pt_en(chunk, return_tensors="pt", truncation=True, padding=True)
+        with torch.no_grad():
+            outputs = model_pt_en.generate(**inputs, max_length=512, num_beams=4)
+        translated = tokenizer_pt_en.decode(outputs[0], skip_special_tokens=True)
+        translated_chunks.append(translated)
 
     return " ".join(translated_chunks)
+
+# ========== Bilingual Layout ==========
+
+def bilingual_clauses(english_text):
+    """
+    📄 Generate side-by-side bilingual clauses.
+    """
+    clauses_en = chunk_text(clean_text(english_text), max_chunk_chars=300)
+    clauses_pt = [translate_to_portuguese(c) for c in clauses_en]
+
+    bilingual = []
+    for en, pt in zip(clauses_en, clauses_pt):
+        bilingual.append(f"📘 EN: {en}\n📗 PT: {pt}\n" + "-" * 60)
+    return "\n\n".join(bilingual)
+
+# ========== Export to DOCX ==========
+
+def export_to_word(text, filename="translated_contract.docx"):
+    """
+    📝 Export text block to Word document.
+    """
+    doc = Document()
+    doc.add_heading("Legal Translation Output", level=1)
+    for para in text.split("\n\n"):
+        doc.add_paragraph(para)
+    doc.save(filename)
    return filename
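
For reviewers, a minimal smoke-test sketch of how the four helpers added in this commit could be exercised together. This is an illustration, not part of the diff: it assumes the repo root is on sys.path so the module imports as utils.translator, that transformers, torch, and python-docx are installed, and the sample contract text is invented.

# Hypothetical usage sketch -- not part of this commit.
from utils.translator import (
    translate_to_portuguese,
    translate_to_english,
    bilingual_clauses,
    export_to_word,
)

sample = (
    "The Contractor shall deliver all services described in Exhibit A. "
    "Either party may terminate this Agreement upon thirty days' written notice."
)

pt = translate_to_portuguese(sample)   # EN -> PT via the unicamp-dl en-pt T5
back = translate_to_english(pt)        # PT -> EN round trip via the reverse model

layout = bilingual_clauses(sample)     # paired "EN: ... / PT: ..." clause blocks
docx_path = export_to_word(layout)     # writes translated_contract.docx
print(docx_path)

Note that bilingual_clauses re-chunks at 300 characters rather than the default 500, which keeps each EN/PT clause pair comfortably inside the 512-token max_length the models generate against.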