from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
from PyPDF2 import PdfReader
import docx
import os
import re

# Load NLLB model and tokenizer
@st.cache_resource
def load_translation_model():
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model

# Initialize model
@st.cache_resource
def initialize_models():
    tokenizer, model = load_translation_model()
    return {"nllb": (tokenizer, model)}

# Enhanced idiom mapping with more comprehensive translations
def preprocess_idioms(text, src_lang, tgt_lang):
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            # Basic phrases
            "no piece of cake": "कोई आसान काम नहीं",
            "piece of cake": "बहुत आसान काम",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "सीधे मुकाबला करना",
            "fell into place": "सब कुछ ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
            "with a little perseverance": "थोड़े से धैर्य से",
            
            # Additional common idioms
            "break a leg": "बहुत बहुत शुभकामनाएं",
            "hit the nail on the head": "बिल्कुल सही बात कहना",
            "once in a blue moon": "बहुत कम, कभी-कभार",
            "under the weather": "तबीयत ठीक नहीं",
            "cost an arm and a leg": "बहुत महंगा",
            "beating around the bush": "इधर-उधर की बात करना",
            "call it a day": "काम समाप्त करना",
            "burn the midnight oil": "रात-रात भर जागकर काम करना",
            "get the ball rolling": "शुरुआत करना",
            "pull yourself together": "खुद को संभालो",
            "shoot yourself in the foot": "अपना ही नुकसान करना",
            "take it with a grain of salt": "संदेह से लेना",
            "the last straw": "सहनशीलता की आखिरी सीमा",
            "time flies": "समय पंख लगाकर उड़ता है",
            "wrap your head around": "समझने की कोशिश करना",
            "cut corners": "काम में छोटा रास्ता अपनाना",
            "back to square one": "फिर से शुरू से",
            "blessing in disguise": "छिपा हुआ वरदान",
            "cry over spilled milk": "बीती बात पर पछताना",
            "keep your chin up": "हिम्मत रखना",
            
            # Work-related idioms
            "think outside the box": "नए तरीके से सोचना",
            "raise the bar": "मानक ऊंचा करना",
            "learning curve": "सीखने की प्रक्रिया",
            "up and running": "चालू और कार्यरत",
            "back to the drawing board": "फिर से योजना बनाना",
            
            # Project-related phrases
            "running into issues": "समस्याओं का सामना करना",
            "iron out the bugs": "खामियां दूर करना",
            "in the pipeline": "विचाराधीन",
            "moving forward": "आगे बढ़ते हुए",
            "touch base": "संपर्क में रहना",
            
            # Technical phrases
            "user-friendly": "उपयोगकर्ता के अनुकूल",
            "cutting-edge": "अत्याधुनिक",
            "state of the art": "अत्याधुनिक तकनीक",
            "proof of concept": "व्यवहार्यता का प्रमाण",
            "game changer": "खेल बदलने वाला"
        }
        
        # Sort idioms by length (longest first) to handle overlapping phrases
        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
        
        # Create a single regex pattern for all idioms
        pattern = '|'.join(map(re.escape, sorted_idioms))
        
        def replace_idiom(match):
            return idiom_map[match.group(0).lower()]
        
        # Replace all idioms in one pass, case-insensitive
        text = re.sub(pattern, replace_idiom, text, flags=re.IGNORECASE)
    
    return text
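
# Illustrative example (not executed): matched idioms are replaced in place
# and the rest of the sentence is left untouched, so
#   preprocess_idioms("The project was a piece of cake", "en", "hi")
# returns "The project was a बहुत आसान काम" before the text reaches NLLB.
# Sorting longest-first ensures "no piece of cake" wins over "piece of cake".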

# Function to extract text from different file types
def extract_text(file):
    ext = os.path.splitext(file.name)[1].lower()
    
    if ext == ".pdf":
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages; guard against it
            text += (page.extract_text() or "") + "\n"
        return text
    
    elif ext == ".docx":
        doc = docx.Document(file)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    
    elif ext == ".txt":
        return file.read().decode("utf-8")
    
    else:
        raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")

# Translation function with improved chunking and fixed tokenizer issue
def translate_text(text, src_lang, tgt_lang, models):
    if src_lang == tgt_lang:
        return text

    # Language codes for NLLB
    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}

    if src_lang not in lang_map or tgt_lang not in lang_map:
        return "Error: Unsupported language combination"

    tgt_lang_code = lang_map[tgt_lang]
    tokenizer, model = models["nllb"]
    # Set the source language so the tokenizer prepends the correct language
    # token; the NLLB tokenizer otherwise defaults to English input
    tokenizer.src_lang = lang_map[src_lang]
    
    # Preprocess for idioms
    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
    
    # Improved chunking: split on sentence-ending punctuation (including the
    # Devanagari danda "।") while keeping the delimiters via the capture
    # group, so no punctuation is lost between chunks
    chunks = []
    current_chunk = ""
    
    for sentence in re.split(r'([.!?।]+)', preprocessed_text):
        if sentence.strip():
            if len(current_chunk) + len(sentence) < 450:  # Leave room for tokenization
                current_chunk += sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
    
    if current_chunk:
        chunks.append(current_chunk)
    
    translated_text = ""
    
    for chunk in chunks:
        if chunk.strip():
            # Add target language token to the beginning of the input
            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
            
            # Get the token ID for the target language
            tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
            
            translated = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,  # Fixed: Using convert_tokens_to_ids instead of lang_code_to_id
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3
            )
            translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
            translated_text += translated_chunk + " "
    
    return translated_text.strip()
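
# Illustrative usage (a sketch; assumes initialize_models() has run and the
# NLLB weights are available locally):
#   models = initialize_models()
#   hindi = translate_text("Hello, how are you?", "en", "hi", models)
# The codes passed to the model follow the FLORES-200 convention
# ("eng_Latn", "hin_Deva", "mar_Deva"), as defined in lang_map above.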

# Function to save text as a file
def save_text_to_file(text, original_filename, prefix="translated"):
    output_filename = f"{prefix}_{os.path.basename(original_filename)}.txt"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(text)
    return output_filename

# Main processing function
def process_document(file, source_lang, target_lang, models):
    try:
        # Extract text from uploaded file
        text = extract_text(file)
        
        # Translate the text
        translated_text = translate_text(text, source_lang, target_lang, models)
        
        # Save the result
        if translated_text.startswith("Error:"):
            output_file = save_text_to_file(translated_text, file.name, prefix="error")
        else:
            output_file = save_text_to_file(translated_text, file.name)
        
        return output_file, translated_text
    except Exception as e:
        error_message = f"Error: {str(e)}"
        output_file = save_text_to_file(error_message, file.name, prefix="error")
        return output_file, error_message

# Streamlit interface
def main():
    st.title("Document Translator (NLLB-200)")
    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")
    
    # Initialize models
    models = initialize_models()
    
    # File uploader
    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
    
    # Language selection
    col1, col2 = st.columns(2)
    with col1:
        source_lang = st.selectbox("Source Language", ["en", "hi", "mr"], index=0)
    with col2:
        target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)
    
    if uploaded_file is not None and st.button("Translate"):
        with st.spinner("Translating..."):
            output_file, result_text = process_document(uploaded_file, source_lang, target_lang, models)
            
            # Display result
            st.text_area("Translated Text", result_text, height=300)
            
            # Provide download button
            with open(output_file, "rb") as file:
                st.download_button(
                    label="Download Translated Document",
                    data=file,
                    file_name=os.path.basename(output_file),
                    mime="text/plain"
                )

if _name_ == "_main_":
    main()
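
# Running locally (a sketch; "app.py" is a placeholder for this file's name):
#   pip install streamlit transformers torch PyPDF2 python-docx
#   streamlit run app.py
# The first run downloads the facebook/nllb-200-distilled-600M checkpoint
# from the Hugging Face Hub.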