import os
import re
from datetime import datetime

import docx
import streamlit as st
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Page config
st.set_page_config(
    page_title="Document Translator (NLLB-200)",
    page_icon="📄",
    layout="wide"
)

# Load NLLB model and tokenizer
@st.cache_resource
def load_translation_model():
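    """Download (on first run) and cache the NLLB-200 distilled-600M checkpoint and its tokenizer."""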
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model

# Initialize model
@st.cache_resource
def initialize_models():
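    """Build the model registry once; @st.cache_resource keeps it alive across Streamlit reruns."""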
    tokenizer, model = load_translation_model()
    return {"nllb": (tokenizer, model)}

def split_long_sentence(sentence, max_length=200):
    """Split long sentences into smaller chunks at appropriate break points."""
    if len(sentence) <= max_length:
        return [sentence]
    
    chunks = []
    current_chunk = ""
    words = sentence.split()
    
    for word in words:
        if len(current_chunk) + len(word) + 1 <= max_length:
            current_chunk += (" " + word if current_chunk else word)
        else:
            if current_chunk:  # guard against appending an empty first chunk
                chunks.append(current_chunk)
            current_chunk = word
    
    if current_chunk:
        chunks.append(current_chunk)
    
    return chunks

def preprocess_idioms(text, src_lang, tgt_lang):
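    """Substitute known English idioms with natural Hindi/Marathi renderings before
    translation, since a literal machine translation would lose their meaning."""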
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            # Common English-Hindi idiom mappings
            "no piece of cake": "कोई आसान काम नहीं",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "इसे पूरे मन से हाथ में लेना",
            "fell into place": "ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखाई देना",
            "with a little perseverance": "थोड़े से धैर्य से",
            "break the ice": "बातचीत की शुरुआत करना",
            "on cloud nine": "सातवें आसमान पर होना",
            "once in a blue moon": "कभी-कभार",
            "beating around the bush": "इधर-उधर की बात करना",
            "burning the midnight oil": "रात-रात भर जागकर काम करना",
            "calm before the storm": "तूफान से पहले की शांति",
            "cost an arm and a leg": "बहुत महंगा होना",
            "blessing in disguise": "छुपा हुआ वरदान",
            "kill two birds with one stone": "एक पंथ दो काज",
            "a piece of cake": "बहुत आसान काम",
            "under the weather": "तबीयत ठीक न होना",
            "pull yourself together": "खुद को संभालो",
            "rise and shine": "जल्दी उठो और तैयार हो जाओ",
            "time flies": "समय पंख लगाकर उड़ता है",
            "actions speak louder than words": "कथनी से करनी बड़ी",
            "all ears": "पूरा ध्यान से सुन रहा हूं",
            "back to square one": "वापस शुरुआत में",
            "better late than never": "देर आये दुरुस्त आये",
            "cry over spilled milk": "बीती बात पर पछताना",
            "down to earth": "सरल स्वभाव का",
            "every cloud has a silver lining": "हर मुसीबत में कोई न कोई अच्छाई छिपी होती है",
            "food for thought": "सोचने वाली बात",
            "give someone the benefit of the doubt": "शक का फायदा देना",
            "hit the nail on the head": "सटीक बात कहना",
            "in hot water": "मुसीबत में होना"
        }
        
        # Sort idioms by length (longest first) to handle overlapping phrases
        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
        
        # Replace idioms with their translations
        for idiom in sorted_idioms:
            pattern = r'\b' + re.escape(idiom) + r'\b'
            text = re.sub(pattern, idiom_map[idiom], text, flags=re.IGNORECASE)
            
    elif src_lang == "en" and tgt_lang == "mr":
        idiom_map = {
            "no piece of cake": "सोपं काम नाही",
            "bite the bullet": "कठीण निर्णय घेणे",
            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
            "fell into place": "सगळं व्यवस्थित झालं",
            "see the light at the end of the tunnel": "अंधारातून प्रकाशाकडे जाणे",
            "with a little perseverance": "थोड्या धीराने",
            "break the ice": "संभाषणाची सुरुवात करणे",
            "on cloud nine": "आनंदात असणे",
            "once in a blue moon": "क्वचितच",
            "burning the midnight oil": "रात्रंदिवस मेहनत करणे",
            "better late than never": "उशीर का होईना पण योग्य वेळी"
        }
        # Longest-first, matching the en->hi branch, so longer phrases are
        # replaced before any shorter phrase they contain
        for idiom in sorted(idiom_map.keys(), key=len, reverse=True):
            pattern = r'\b' + re.escape(idiom) + r'\b'
            text = re.sub(pattern, idiom_map[idiom], text, flags=re.IGNORECASE)
    
    return text

def extract_text(file):
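    """Return the plain text of an uploaded PDF, DOCX, or TXT file."""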
    ext = os.path.splitext(file.name)[1].lower()
    
    if ext == ".pdf":
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() may return None for image-only pages
            text += (page.extract_text() or "") + "\n"
        return text
    
    elif ext == ".docx":
        doc = docx.Document(file)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    
    elif ext == ".txt":
        return file.read().decode("utf-8")
    
    else:
        raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")

def translate_text(text, src_lang, tgt_lang, models):
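    """Translate text between English, Hindi, and Marathi with the cached NLLB model,
    applying idiom preprocessing first and translating sentence by sentence."""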
    if src_lang == tgt_lang:
        return text

    # Language codes for NLLB
    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}

    if src_lang not in lang_map or tgt_lang not in lang_map:
        return "Error: Unsupported language combination"

    src_lang_code = lang_map[src_lang]
    tgt_lang_code = lang_map[tgt_lang]

    tokenizer, model = models["nllb"]
    # Tell the tokenizer which language it is encoding from; the NLLB tokenizer
    # defaults to eng_Latn, which would mistranslate Hindi/Marathi input
    tokenizer.src_lang = src_lang_code
    
    # Preprocess for idioms
    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
    
    # Split text into smaller chunks (sentences)
    sentences = re.split(r'(?<=[.!?])\s+', preprocessed_text)
    translated_text = []
    
    for sentence in sentences:
        if sentence.strip():
            chunks = split_long_sentence(sentence, max_length=200)
            
            for chunk in chunks:
                try:
                    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
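                    # NLLB is a many-to-many model, so generation has to be steered
                    # toward the target language (typically via forced_bos_token_id
                    # derived from tgt_lang_code computed above)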
                    translated = model.generate(
                        **inputs