SCBconsulting committed on
Commit
4615658
Β·
verified Β·
1 Parent(s): 0369c37

Update utils/translator.py

Browse files
Files changed (1) hide show
  1. utils/translator.py +22 -42
utils/translator.py CHANGED
@@ -1,65 +1,45 @@
1
- # utils/translator.py
2
 
3
- import torch
4
- import spacy
5
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
 
7
- # βœ… Load HF translation model (PT-BR)
8
- model_name = "unicamp-dl/translation-en-pt-t5"
9
- tokenizer = AutoTokenizer.from_pretrained(model_name)
10
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
11
 
12
- # βœ… Load spaCy for sentence-aware chunking
13
- try:
14
- nlp = spacy.load("en_core_web_sm")
15
- except OSError:
16
- import spacy.cli
17
- spacy.cli.download("en_core_web_sm")
18
- nlp = spacy.load("en_core_web_sm")
19
 
20
def split_into_chunks(text, max_chunk_len=300):
    """
    Split input into sentence-aware chunks using spaCy.

    Args:
        text: English text to segment (passed through the module-level `nlp`).
        max_chunk_len: Soft character limit per chunk; a single sentence
            longer than this becomes its own chunk (sentences are never split).

    Returns:
        List of non-empty chunk strings.
    """
    doc = nlp(text)
    chunks = []
    current_chunk = ""

    for sent in doc.sents:
        if len(current_chunk) + len(sent.text) < max_chunk_len:
            current_chunk += sent.text + " "
        else:
            # Bug fix: only flush non-empty chunks. Previously, a first
            # sentence longer than max_chunk_len appended "" to `chunks`.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sent.text + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
39
 
40
def translate_text(text):
    """
    Translate contract text from English to Brazilian Portuguese using the
    Hugging Face seq2seq model loaded at module level.

    Cleans whitespace, splits the input into sentence-aware chunks,
    translates each chunk with beam search, and prints elapsed time.

    Args:
        text: English source text; may contain newlines and extra spaces.

    Returns:
        The translated Portuguese text, or "No input to translate."
        for blank input.
    """
    import time  # local import keeps the module import block unchanged

    if not text.strip():
        return "No input to translate."

    # Collapse newlines, tabs, and runs of spaces into single spaces.
    # (The previous `.replace(" ", " ")` was a no-op.)
    text = " ".join(text.split())
    chunks = split_into_chunks(text)
    translated_chunks = []

    start = time.time()
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():  # inference only; no gradients needed
            outputs = model.generate(**inputs, max_length=512, num_beams=4)
        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    elapsed = time.time() - start

    print(f"🕒 Translation took {elapsed:.2f} seconds")
    return " ".join(translated_chunks)
 
1
+ # utils/translate.py
2
 
3
+ from transformers import pipeline
 
 
4
 
5
+ # 🌍 Translation pipeline (English β†’ Portuguese)
6
+ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-pt")
 
 
7
 
8
def clean_text(text):
    """
    Normalize whitespace in *text*.

    Collapses newlines, tabs, and runs of spaces into single spaces and
    strips leading/trailing whitespace.

    Args:
        text: Raw input string.

    Returns:
        The whitespace-normalized string ("" for blank input).
    """
    # str.split() with no argument splits on any whitespace run, so one
    # pass collapses everything. (The previous `.replace(" ", " ")` was
    # a no-op: it replaced a single space with a single space.)
    return " ".join(text.split())
 
 
 
 
 
10
 
11
def chunk_text(text, max_chunk_chars=500):
    """
    Chunk long text into word-boundary segments that fit model constraints.

    Args:
        text: Input string; split on whitespace.
        max_chunk_chars: Soft upper bound on chunk length in characters.
            A single word longer than this becomes its own chunk (words
            are never split mid-word).

    Returns:
        List of non-empty chunk strings ([] for blank input).
    """
    chunks = []
    current_chunk = ""

    for word in text.split():
        if len(current_chunk) + len(word) + 1 <= max_chunk_chars:
            current_chunk += " " + word
        else:
            # Bug fix: only flush non-empty chunks. Previously, a word
            # longer than max_chunk_chars arriving while current_chunk
            # was empty appended "" to `chunks`.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = word

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
29
 
30
def translate_to_portuguese(text):
    """
    Clean, chunk, and translate English text into Portuguese.

    Args:
        text: English source text; may contain newlines and extra spaces.

    Returns:
        The translated Portuguese text joined from per-chunk translations,
        or "No input provided." for blank input.
    """
    if not text.strip():
        return "No input provided."

    chunks = chunk_text(clean_text(text))
    if not chunks:
        return ""

    # Pass all chunks in a single call: Hugging Face pipelines accept a
    # list of inputs and return a list of results, avoiding one model
    # invocation per chunk.
    results = translator(chunks, max_length=512)
    return " ".join(r["translation_text"] for r in results)