Spaces:
Running
Running
Update utils/translator.py
Browse files — utils/translator.py +13 -17
utils/translator.py
CHANGED
@@ -24,7 +24,7 @@ def clean_text(text: str) -> str:
|
|
24 |
|
25 |
def chunk_text(text: str, max_chunk_chars: int = 500):
|
26 |
"""
|
27 |
-
Split long text into
|
28 |
"""
|
29 |
words = text.split()
|
30 |
chunks, current_chunk = [], ""
|
@@ -53,18 +53,12 @@ def translate_chunks(chunks, tokenizer, model):
|
|
53 |
return " ".join(translated)
|
54 |
|
55 |
def translate_to_portuguese(text: str) -> str:
|
56 |
-
"""
|
57 |
-
🇺🇸 ➡️ 🇧🇷 Translate from English to Portuguese.
|
58 |
-
"""
|
59 |
if not text.strip():
|
60 |
return "No input provided."
|
61 |
chunks = chunk_text(clean_text(text))
|
62 |
return translate_chunks(chunks, tokenizer_en_pt, model_en_pt)
|
63 |
|
64 |
def translate_to_english(text: str) -> str:
|
65 |
-
"""
|
66 |
-
🇧🇷 ➡️ 🇺🇸 Translate from Portuguese to English.
|
67 |
-
"""
|
68 |
if not text.strip():
|
69 |
return "No input provided."
|
70 |
chunks = chunk_text(clean_text(text))
|
@@ -72,8 +66,7 @@ def translate_to_english(text: str) -> str:
|
|
72 |
|
73 |
def translate_text(text: str, direction: str = "en-pt") -> str:
|
74 |
"""
|
75 |
-
|
76 |
-
direction = "en-pt" ou "pt-en"
|
77 |
"""
|
78 |
if direction == "en-pt":
|
79 |
return translate_to_portuguese(text)
|
@@ -84,24 +77,27 @@ def translate_text(text: str, direction: str = "en-pt") -> str:
|
|
84 |
|
85 |
# ========== Bilingual View ==========
|
86 |
|
87 |
-
def bilingual_clauses(
|
88 |
"""
|
89 |
-
Create
|
90 |
"""
|
91 |
-
|
92 |
-
|
93 |
|
|
|
94 |
bilingual_output = []
|
95 |
-
|
96 |
-
|
97 |
-
|
|
|
|
|
98 |
return "\n\n".join(bilingual_output)
|
99 |
|
100 |
# ========== Export to DOCX ==========
|
101 |
|
102 |
def export_to_word(text: str, filename: str = "translated_contract.docx") -> str:
|
103 |
"""
|
104 |
-
Export bilingual
|
105 |
"""
|
106 |
doc = Document()
|
107 |
doc.add_heading("Legal Translation Output", level=1)
|
|
|
24 |
|
25 |
def chunk_text(text: str, max_chunk_chars: int = 500):
|
26 |
"""
|
27 |
+
Split long text into chunks based on character count.
|
28 |
"""
|
29 |
words = text.split()
|
30 |
chunks, current_chunk = [], ""
|
|
|
53 |
return " ".join(translated)
|
54 |
|
55 |
def translate_to_portuguese(text: str) -> str:
|
|
|
|
|
|
|
56 |
if not text.strip():
|
57 |
return "No input provided."
|
58 |
chunks = chunk_text(clean_text(text))
|
59 |
return translate_chunks(chunks, tokenizer_en_pt, model_en_pt)
|
60 |
|
61 |
def translate_to_english(text: str) -> str:
|
|
|
|
|
|
|
62 |
if not text.strip():
|
63 |
return "No input provided."
|
64 |
chunks = chunk_text(clean_text(text))
|
|
|
66 |
|
67 |
def translate_text(text: str, direction: str = "en-pt") -> str:
|
68 |
"""
|
69 |
+
direction = 'en-pt' or 'pt-en'
|
|
|
70 |
"""
|
71 |
if direction == "en-pt":
|
72 |
return translate_to_portuguese(text)
|
|
|
77 |
|
78 |
# ========== Bilingual View ==========
|
79 |
|
80 |
+
def bilingual_clauses(text: str) -> str:
|
81 |
"""
|
82 |
+
Create bilingual clause-by-clause output (EN + PT).
|
83 |
"""
|
84 |
+
if not text.strip():
|
85 |
+
return "No input provided."
|
86 |
|
87 |
+
clauses_en = chunk_text(clean_text(text), max_chunk_chars=300)
|
88 |
bilingual_output = []
|
89 |
+
|
90 |
+
for clause in clauses_en:
|
91 |
+
translated = translate_to_portuguese(clause)
|
92 |
+
bilingual_output.append(f"๐ EN: {clause}\n๐ PT: {translated}\n" + "-" * 60)
|
93 |
+
|
94 |
return "\n\n".join(bilingual_output)
|
95 |
|
96 |
# ========== Export to DOCX ==========
|
97 |
|
98 |
def export_to_word(text: str, filename: str = "translated_contract.docx") -> str:
|
99 |
"""
|
100 |
+
Export text (bilingual or full) to Word DOCX.
|
101 |
"""
|
102 |
doc = Document()
|
103 |
doc.add_heading("Legal Translation Output", level=1)
|