import os
import re
from datetime import datetime

import docx
import streamlit as st
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Page config
st.set_page_config(
    page_title="Document Translator (NLLB-200)",
    page_icon="📄",
    layout="wide"
)

# Load NLLB model and tokenizer
@st.cache_resource
def load_translation_model():
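    """Download (on first run) and cache the NLLB-200 distilled-600M checkpoint and its tokenizer."""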
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model

# Initialize model
@st.cache_resource
def initialize_models():
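    """Build the model registry once; @st.cache_resource keeps it alive across Streamlit reruns."""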
    tokenizer, model = load_translation_model()
    return {"nllb": (tokenizer, model)}

def split_long_sentence(sentence, max_length=200):
    """Split long sentences into smaller chunks at appropriate break points."""
    if len(sentence) <= max_length:
        return [sentence]
    
    chunks = []
    current_chunk = ""
    words = sentence.split()
    
    for word in words:
        if len(current_chunk) + len(word) + 1 <= max_length:
            current_chunk += (" " + word if current_chunk else word)
        else:
            if current_chunk:  # guard against appending an empty first chunk
                chunks.append(current_chunk)
            current_chunk = word
    
    if current_chunk:
        chunks.append(current_chunk)
    
    return chunks

def preprocess_idioms(text, src_lang, tgt_lang):
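    """Substitute known English idioms with natural Hindi/Marathi renderings before
    translation, since a literal machine translation would lose their meaning."""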
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            # Common English-Hindi idiom mappings
            "no piece of cake": "कोई आसान काम नहीं",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "इसे पूरे मन से हाथ में लेना",
            "fell into place": "ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखाई देना",
            "with a little perseverance": "थोड़े से धैर्य से",
            "break the ice": "बातचीत की शुरुआत करना",
            "on cloud nine": "सातवें आसमान पर होना",
            "once in a blue moon": "कभी-कभार",
            "beating around the bush": "इधर-उधर की बात करना",
            "burning the midnight oil": "रात-रात भर जागकर काम करना",
            "calm before the storm": "तूफान से पहले की शांति",
            "cost an arm and a leg": "बहुत महंगा होना",
            "blessing in disguise": "छुपा हुआ वरदान",
            "kill two birds with one stone": "एक पंथ दो काज",
            "a piece of cake": "बहुत आसान काम",
            "under the weather": "तबीयत ठीक न होना",
            "pull yourself together": "खुद को संभालो",
            "rise and shine": "जल्दी उठो और तैयार हो जाओ",
            "time flies": "समय पंख लगाकर उड़ता है",
            "actions speak louder than words": "कथनी से करनी बड़ी",
            "all ears": "पूरा ध्यान से सुन रहा हूं",
            "back to square one": "वापस शुरुआत में",
            "better late than never": "देर आये दुरुस्त आये",
            "cry over spilled milk": "बीती बात पर पछताना",
            "down to earth": "सरल स्वभाव का",
            "every cloud has a silver lining": "हर मुसीबत में कोई न कोई अच्छाई छिपी होती है",
            "food for thought": "सोचने वाली बात",
            "give someone the benefit of the doubt": "शक का फायदा देना",
            "hit the nail on the head": "सटीक बात कहना",
            "in hot water": "मुसीबत में होना"
        }
        
        # Sort idioms by length (longest first) to handle overlapping phrases
        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
        
        # Replace idioms with their translations
        for idiom in sorted_idioms:
            pattern = r'\b' + re.escape(idiom) + r'\b'
            text = re.sub(pattern, idiom_map[idiom], text, flags=re.IGNORECASE)
            
    elif src_lang == "en" and tgt_lang == "mr":
        idiom_map = {
            "no piece of cake": "सोपं काम नाही",
            "bite the bullet": "कठीण निर्णय घेणे",
            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
            "fell into place": "सगळं व्यवस्थित झालं",
            "see the light at the end of the tunnel": "अंधारातून प्रकाशाकडे जाणे",
            "with a little perseverance": "थोड्या धीराने",
            "break the ice": "संभाषणाची सुरुवात करणे",
            "on cloud nine": "आनंदात असणे",
            "once in a blue moon": "क्वचितच",
            "burning the midnight oil": "रात्रंदिवस मेहनत करणे",
            "better late than never": "उशीर का होईना पण योग्य वेळी"
        }
        # Longest-first, matching the en->hi branch, so longer phrases are
        # replaced before any shorter phrase they contain
        for idiom in sorted(idiom_map.keys(), key=len, reverse=True):
            pattern = r'\b' + re.escape(idiom) + r'\b'
            text = re.sub(pattern, idiom_map[idiom], text, flags=re.IGNORECASE)
    
    return text

def extract_text(file):
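    """Return the plain text of an uploaded PDF, DOCX, or TXT file."""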
    ext = os.path.splitext(file.name)[1].lower()
    
    if ext == ".pdf":
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() may return None for image-only pages
            text += (page.extract_text() or "") + "\n"
        return text
    
    elif ext == ".docx":
        doc = docx.Document(file)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    
    elif ext == ".txt":
        return file.read().decode("utf-8")
    
    else:
        raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")

def translate_text(text, src_lang, tgt_lang, models):
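    """Translate text between English, Hindi, and Marathi with the cached NLLB model,
    applying idiom preprocessing first and translating sentence by sentence."""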
    if src_lang == tgt_lang:
        return text

    # Language codes for NLLB
    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}

    if src_lang not in lang_map or tgt_lang not in lang_map:
        return "Error: Unsupported language combination"

    src_lang_code = lang_map[src_lang]
    tgt_lang_code = lang_map[tgt_lang]

    tokenizer, model = models["nllb"]
    # Tell the tokenizer which language it is encoding from; the NLLB tokenizer
    # defaults to eng_Latn, which would mistranslate Hindi/Marathi input
    tokenizer.src_lang = src_lang_code
    
    # Preprocess for idioms
    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
    
    # Split text into smaller chunks (sentences)
    sentences = re.split(r'(?<=[.!?])\s+', preprocessed_text)
    translated_text = []
    
    for sentence in sentences:
        if sentence.strip():
            chunks = split_long_sentence(sentence, max_length=200)
            
            for chunk in chunks:
                try:
                    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
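                    # NLLB is a many-to-many model, so generation has to be steered
                    # toward the target language (typically via forced_bos_token_id
                    # derived from tgt_lang_code computed above)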
                    translated = model.generate(
                        **inputs