SCBconsulting committed on
Commit
63fd1be
Β·
verified Β·
1 Parent(s): feddd31

Update utils/translator.py

Browse files
Files changed (1) hide show
  1. utils/translator.py +15 -6
utils/translator.py CHANGED
@@ -2,14 +2,14 @@
2
 
3
  import os
4
  import torch
5
- import spacy
6
  import requests
 
7
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
8
 
9
- # Optional DeepL API key
10
  DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")
11
 
12
- # βœ… Hugging Face fallback model (PT-BR)
13
  model_name = "unicamp-dl/translation-en-pt-t5"
14
  tokenizer = AutoTokenizer.from_pretrained(model_name)
15
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
@@ -18,7 +18,6 @@ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
18
  try:
19
  nlp = spacy.load("en_core_web_sm")
20
  except OSError:
21
- # For Hugging Face Spaces: auto-download if missing
22
  import spacy.cli
23
  spacy.cli.download("en_core_web_sm")
24
  nlp = spacy.load("en_core_web_sm")
@@ -48,13 +47,17 @@ def split_into_chunks(text, max_chunk_len=500):
48
  def translate_text(text):
49
  """
50
  Translate full contract using DeepL (if available) or fallback Hugging Face model.
 
51
  """
52
  if not text.strip():
53
  return "No input to translate."
54
 
55
- # βœ… Try DeepL first if available
 
 
56
  if DEEPL_API_KEY:
57
  try:
 
58
  response = requests.post(
59
  "https://api.deepl.com/v2/translate",
60
  data={
@@ -63,20 +66,26 @@ def translate_text(text):
63
  "target_lang": "PT-BR" # πŸ‡§πŸ‡· Specific for Brazil
64
  },
65
  )
 
 
66
  return response.json()["translations"][0]["text"]
67
  except Exception as e:
68
  print("⚠️ DeepL failed, falling back to Hugging Face:", str(e))
69
 
70
- # βœ… Use Hugging Face fallback model with spaCy chunking
71
  chunks = split_into_chunks(text)
72
  translated_chunks = []
73
 
 
74
  for chunk in chunks:
75
  inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
76
  with torch.no_grad():
77
  outputs = model.generate(**inputs, max_length=512, num_beams=4)
78
  translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
79
  translated_chunks.append(translated)
 
80
 
 
81
  return " ".join(translated_chunks)
82
 
 
 
# Standard library
import os

# Third-party
import requests
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Optional DeepL API key; when unset, translation falls back to the local model.
DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")

# Hugging Face fallback model (English -> Brazilian Portuguese), loaded once at import.
model_name = "unicamp-dl/translation-en-pt-t5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
 
def _load_english_pipeline():
    """Return spaCy's small English pipeline, downloading it if absent."""
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        # Auto-download for fresh environments (e.g. Hugging Face Spaces),
        # then retry the load in the same process.
        import spacy.cli

        spacy.cli.download("en_core_web_sm")
        return spacy.load("en_core_web_sm")


nlp = _load_english_pipeline()
 
def translate_text(text):
    """
    Translate full contract using DeepL (if available) or fallback Hugging Face model.
    Includes chunking and timing.

    Args:
        text: English source text to translate.

    Returns:
        The Brazilian-Portuguese translation as a single string, or a short
        notice string when *text* is empty/whitespace.
    """
    if not text.strip():
        return "No input to translate."

    import time  # ⏱️ local import: only needed when a translation actually runs

    # ✅ Try DeepL first
    if DEEPL_API_KEY:
        try:
            start = time.time()
            response = requests.post(
                "https://api.deepl.com/v2/translate",
                data={
                    # NOTE(review): these two field names were reconstructed from
                    # the DeepL v2 API (the diff view hides them) — confirm
                    # against the original file.
                    "auth_key": DEEPL_API_KEY,
                    "text": text,
                    "target_lang": "PT-BR"  # 🇧🇷 Specific for Brazil
                },
                timeout=30,  # avoid hanging forever on a stalled request
            )
            # Surface HTTP errors explicitly instead of an opaque KeyError
            # when indexing the JSON body below.
            response.raise_for_status()
            end = time.time()
            print(f"🕒 DeepL Translation took {end - start:.2f} seconds")
            return response.json()["translations"][0]["text"]
        except Exception as e:
            # Deliberate best-effort: any DeepL failure (network, auth, quota)
            # falls through to the local model rather than crashing.
            print("⚠️ DeepL failed, falling back to Hugging Face:", str(e))

    # ✅ Hugging Face fallback with sentence chunking
    chunks = split_into_chunks(text)
    translated_chunks = []

    start = time.time()
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():  # inference only — no gradients needed
            outputs = model.generate(**inputs, max_length=512, num_beams=4)
        translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated_chunks.append(translated)
    end = time.time()

    print(f"🕒 Hugging Face Translation took {end - start:.2f} seconds")
    return " ".join(translated_chunks)