from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
from PyPDF2 import PdfReader
import docx
import os
import re

# Load NLLB model and tokenizer
@st.cache_resource
def load_translation_model():
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model

# Initialize model
@st.cache_resource
def initialize_models():
    tokenizer, model = load_translation_model()
    return {"nllb": (tokenizer, model)}

# Enhanced idiom mapping with more comprehensive translations
def preprocess_idioms(text, src_lang, tgt_lang):
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            # Basic phrases
            "no piece of cake": "कोई आसान काम नहीं",
            "piece of cake": "बहुत आसान काम",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "सीधे मुकाबला करना",
            "fell into place": "सब कुछ ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
            "with a little perseverance": "थोड़े से धैर्य से",
            
            # Additional common idioms
            "break a leg": "बहुत बहुत शुभकामनाएं",
            "hit the nail on the head": "बिल्कुल सही बात कहना",
            "once in a blue moon": "बहुत कम, कभी-कभार",
            "under the weather": "तबीयत ठीक नहीं",
            "cost an arm and a leg": "बहुत महंगा",
            "beating around the bush": "इधर-उधर की बात करना",
            "call it a day": "काम समाप्त करना",
            "burn the midnight oil": "रात-रात भर जागकर काम करना",
            "get the ball rolling": "शुरुआत करना",
            "pull yourself together": "खुद को संभालो",
            "shoot yourself in the foot": "अपना ही नुकसान करना",
            "take it with a grain of salt": "संदेह से लेना",
            "the last straw": "सहनशीलता की आखिरी सीमा",
            "time flies": "समय पंख लगाकर उड़ता है",
            "wrap your head around": "समझने की कोशिश करना",
            "cut corners": "काम में छोटा रास्ता अपनाना",
            "back to square one": "फिर से शुरू से",
            "blessing in disguise": "छिपा हुआ वरदान",
            "cry over spilled milk": "बीती बात पर पछताना",
            "keep your chin up": "हिम्मत रखना",
            
            # Work-related idioms
            "think outside the box": "नए तरीके से सोचना",
            "raise the bar": "मानक ऊंचा करना",
            "learning curve": "सीखने की प्रक्रिया",
            "up and running": "चालू और कार्यरत",
            "back to the drawing board": "फिर से योजना बनाना",
            
            # Project-related phrases
            "running into issues": "समस्याओं का सामना करना",
            "iron out the bugs": "खामियां दूर करना",
            "in the pipeline": "विचाराधीन",
            "moving forward": "आगे बढ़ते हुए",
            "touch base": "संपर्क में रहना",
            
            # Technical phrases
            "user-friendly": "उपयोगकर्ता के अनुकूल",
            "cutting-edge": "अत्याधुनिक",
            "state of the art": "अत्याधुनिक तकनीक",
            "proof of concept": "व्यवहार्यता का प्रमाण",
            "game changer": "खेल बदलने वाला"
        }
        
        # Sort idioms by length (longest first) to handle overlapping phrases
        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
        
        # Create a single regex pattern for all idioms
        pattern = '|'.join(map(re.escape, sorted_idioms))
        
        def replace_idiom(match):
            return idiom_map[match.group(0).lower()]
        
        # Replace all idioms in one pass, case-insensitive
        text = re.sub(pattern, replace_idiom, text, flags=re.IGNORECASE)
    
    return text
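
# Illustrative example (not executed): matched idioms are replaced in place
# and the rest of the sentence is left untouched, so
#   preprocess_idioms("The project was a piece of cake", "en", "hi")
# returns "The project was a बहुत आसान काम" before the text reaches NLLB.
# Sorting longest-first ensures "no piece of cake" wins over "piece of cake".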

# Function to extract text from different file types
def extract_text(file):
    ext = os.path.splitext(file.name)[1].lower()
    
    if ext == ".pdf":
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages; guard against it
            text += (page.extract_text() or "") + "\n"
        return text
    
    elif ext == ".docx":
        doc = docx.Document(file)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    
    elif ext == ".txt":
        return file.read().decode("utf-8")
    
    else:
        raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")

# Translation function with improved chunking and fixed tokenizer issue
def translate_text(text, src_lang, tgt_lang, models):
    if src_lang == tgt_lang:
        return text

    # Language codes for NLLB
    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}

    if src_lang not in lang_map or tgt_lang not in lang_map:
        return "Error: Unsupported language combination"

    tgt_lang_code = lang_map[tgt_lang]
    tokenizer, model = models["nllb"]
    # Set the source language so the tokenizer prepends the correct language
    # token; the NLLB tokenizer otherwise defaults to English input
    tokenizer.src_lang = lang_map[src_lang]
    
    # Preprocess for idioms
    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
    
    # Improved chunking: split on sentence-ending punctuation (including the
    # Devanagari danda "।") while keeping the delimiters via the capture
    # group, so no punctuation is lost between chunks
    chunks = []
    current_chunk = ""
    
    for sentence in re.split(r'([.!?।]+)', preprocessed_text):
        if sentence.strip():
            if len(current_chunk) + len(sentence) < 450:  # Leave room for tokenization
                current_chunk += sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
    
    if current_chunk:
        chunks.append(current_chunk)
    
    translated_text = ""
    
    for chunk in chunks:
        if chunk.strip():
            # Add target language token to the beginning of the input
            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
            
            # Get the token ID for the target language
            tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
            
            translated = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,  # Fixed: Using convert_tokens_to_ids instead of lang_code_to_id
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3
            )
            translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
            translated_text += translated_chunk + " "
    
    return translated_text.strip()
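
# Illustrative usage (a sketch; assumes initialize_models() has run and the
# NLLB weights are available locally):
#   models = initialize_models()
#   hindi = translate_text("Hello, how are you?", "en", "hi", models)
# The codes passed to the model follow the FLORES-200 convention
# ("eng_Latn", "hin_Deva", "mar_Deva"), as defined in lang_map above.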

# Function to save text as a file
def save_text_to_file(text, original_filename, prefix="translated"):
    output_filename = f"{prefix}_{os.path.basename(original_filename)}.txt"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(text)
    return output_filename

# Main processing function
def process_document(file, source_lang, target_lang, models):
    try:
        # Extract text from uploaded file
        text = extract_text(file)
        
        # Translate the text
        translated_text = translate_text(text, source_lang, target_lang, models)
        
        # Save the result
        if translated_text.startswith("Error:"):
            output_file = save_text_to_file(translated_text, file.name, prefix="error")
        else:
            output_file = save_text_to_file(translated_text, file.name)
        
        return output_file, translated_text
    except Exception as e:
        error_message = f"Error: {str(e)}"
        output_file = save_text_to_file(error_message, file.name, prefix="error")
        return output_file, error_message

# Streamlit interface
def main():
    st.title("Document Translator (NLLB-200)")
    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")
    
    # Initialize models
    models = initialize_models()
    
    # File uploader
    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
    
    # Language selection
    col1, col2 = st.columns(2)
    with col1:
        source_lang = st.selectbox("Source Language", ["en", "hi", "mr"], index=0)
    with col2:
        target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)
    
    if uploaded_file is not None and st.button("Translate"):
        with st.spinner("Translating..."):
            output_file, result_text = process_document(uploaded_file, source_lang, target_lang, models)
            
            # Display result
            st.text_area("Translated Text", result_text, height=300)
            
            # Provide download button
            with open(output_file, "rb") as file:
                st.download_button(
                    label="Download Translated Document",
                    data=file,
                    file_name=os.path.basename(output_file),
                    mime="text/plain"
                )

if _name_ == "_main_":
    main()
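
# Running locally (a sketch; "app.py" is a placeholder for this file's name):
#   pip install streamlit transformers torch PyPDF2 python-docx
#   streamlit run app.py
# The first run downloads the facebook/nllb-200-distilled-600M checkpoint
# from the Hugging Face Hub.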