# utils/translator.py

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from docx import Document

# ========== Model Loading (Cached Once) ==========

def load_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model

# English → Portuguese
tokenizer_en_pt, model_en_pt = load_model_and_tokenizer("unicamp-dl/translation-en-pt-t5")

# Portuguese → English
tokenizer_pt_en, model_pt_en = load_model_and_tokenizer("unicamp-dl/translation-pt-en-t5")

# ========== Preprocessing ==========

def clean_text(text: str) -> str:
    return text.replace("\n", " ").replace("  ", " ").strip()

def chunk_text(text: str, max_chunk_chars: int = 500):
    """
    Split long text into chunks based on character count.
    """
    words = text.split()
    chunks, current_chunk = [], ""

    for word in words:
        if len(current_chunk) + len(word) + 1 <= max_chunk_chars:
            current_chunk += " " + word
        else:
            chunks.append(current_chunk.strip())
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

# ========== Translation Core Logic ==========

def translate_chunks(chunks, tokenizer, model):
    translated = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=512, num_beams=4)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated.append(decoded)
    return " ".join(translated)

def translate_to_portuguese(text: str) -> str:
    if not text.strip():
        return "No input provided."
    chunks = chunk_text(clean_text(text))
    return translate_chunks(chunks, tokenizer_en_pt, model_en_pt)

def translate_to_english(text: str) -> str:
    if not text.strip():
        return "No input provided."
    chunks = chunk_text(clean_text(text))
    return translate_chunks(chunks, tokenizer_pt_en, model_pt_en)

def translate_text(text: str, direction: str = "en-pt") -> str:
    """
    direction = 'en-pt' or 'pt-en'
    """
    if direction == "en-pt":
        return translate_to_portuguese(text)
    elif direction == "pt-en":
        return translate_to_english(text)
    else:
        return "Unsupported translation direction."

# ========== Bilingual View ==========

def bilingual_clauses(text: str) -> str:
    """
    Create bilingual clause-by-clause output (EN + PT).
    """
    if not text.strip():
        return "No input provided."

    clauses_en = chunk_text(clean_text(text), max_chunk_chars=300)
    bilingual_output = []

    for clause in clauses_en:
        translated = translate_to_portuguese(clause)
        bilingual_output.append(f"📘 EN: {clause}\n📗 PT: {translated}\n" + "-" * 60)

    return "\n\n".join(bilingual_output)

# ========== Export to DOCX ==========

def export_to_word(text: str, filename: str = "translated_contract.docx") -> str:
    """
    Export text (bilingual or full) to Word DOCX.
    """
    doc = Document()
    doc.add_heading("Legal Translation Output", level=1)
    for para in text.split("\n\n"):
        doc.add_paragraph(para)
    doc.save(filename)
    return filename