import os
import re

import docx
import streamlit as st
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Load NLLB model and tokenizer
@st.cache_resource
def load_translation_model():
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model


# Initialize model
@st.cache_resource
def initialize_models():
    tokenizer, model = load_translation_model()
    return {"nllb": (tokenizer, model)}


# Enhanced idiom mapping: replace English idioms with natural Hindi
# equivalents before machine translation, since NLLB tends to render
# idioms literally
def preprocess_idioms(text, src_lang, tgt_lang):
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            # Basic phrases
            "no piece of cake": "कोई आसान काम नहीं",
            "piece of cake": "बहुत आसान काम",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "सीधे मुकाबला करना",
            "fell into place": "सब कुछ ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
            "with a little perseverance": "थोड़े से धैर्य से",
            # Additional common idioms
            "break a leg": "बहुत बहुत शुभकामनाएं",
            "hit the nail on the head": "बिल्कुल सही बात कहना",
            "once in a blue moon": "बहुत कम, कभी-कभार",
            "under the weather": "तबीयत ठीक नहीं",
            "cost an arm and a leg": "बहुत महंगा",
            "beating around the bush": "इधर-उधर की बात करना",
            "call it a day": "काम समाप्त करना",
            "burn the midnight oil": "रात-रात भर जागकर काम करना",
            "get the ball rolling": "शुरुआत करना",
            "pull yourself together": "खुद को संभालो",
            "shoot yourself in the foot": "अपना ही नुकसान करना",
            "take it with a grain of salt": "संदेह से लेना",
            "the last straw": "सहनशीलता की आखिरी सीमा",
            "time flies": "समय पंख लगाकर उड़ता है",
            "wrap your head around": "समझने की कोशिश करना",
            "cut corners": "काम में छोटा रास्ता अपनाना",
            "back to square one": "फिर से शुरू से",
            "blessing in disguise": "छिपा हुआ वरदान",
            "cry over spilled milk": "बीती बात पर पछताना",
            "keep your chin up": "हिम्मत रखना",
            # Work-related idioms
            "think outside the box": "नए तरीके से सोचना",
            "raise the bar": "मानक ऊंचा करना",
            "learning curve": "सीखने की प्रक्रिया",
            "up and running": "चालू और कार्यरत",
            "back to the drawing board": "फिर से योजना बनाना",
            # Project-related phrases
            "running into issues": "समस्याओं का सामना करना",
            "iron out the bugs": "खामियां दूर करना",
            "in the pipeline": "विचाराधीन",
            "moving forward": "आगे बढ़ते हुए",
            "touch base": "संपर्क में रहना",
            # Technical phrases
            "user-friendly": "उपयोगकर्ता के अनुकूल",
            "cutting-edge": "अत्याधुनिक",
            "state of the art": "अत्याधुनिक तकनीक",
            "proof of concept": "व्यवहार्यता का प्रमाण",
            "game changer": "खेल बदलने वाला",
        }

        # Sort idioms by length (longest first) so longer phrases such as
        # "no piece of cake" win over their substrings ("piece of cake")
        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)

        # Build a single alternation pattern covering all idioms
        pattern = "|".join(map(re.escape, sorted_idioms))

        def replace_idiom(match):
            return idiom_map[match.group(0).lower()]

        # Replace all idioms in one case-insensitive pass
        text = re.sub(pattern, replace_idiom, text, flags=re.IGNORECASE)

    return text


# Function to extract text from different file types
def extract_text(file):
    ext = os.path.splitext(file.name)[1].lower()

    if ext == ".pdf":
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() may return None for image-only pages
            text += (page.extract_text() or "") + "\n"
        return text
    elif ext == ".docx":
        doc = docx.Document(file)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    elif ext == ".txt":
        return file.read().decode("utf-8")
    else:
        raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
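# Illustrative note on the idiom pre-pass used by translate_text below:
# because idioms are matched longest-first in a single case-insensitive
# pass, a call such as
#     preprocess_idioms("This is no piece of cake", "en", "hi")
# replaces the full phrase "no piece of cake" with "कोई आसान काम नहीं"
# rather than first matching the shorter "piece of cake".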
# Translation function with sentence-aware chunking
def translate_text(text, src_lang, tgt_lang, models):
    if src_lang == tgt_lang:
        return text

    # Language codes for NLLB
    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}

    if src_lang not in lang_map or tgt_lang not in lang_map:
        return "Error: Unsupported language combination"

    tgt_lang_code = lang_map[tgt_lang]
    tokenizer, model = models["nllb"]

    # Tell the tokenizer which source language to tag the input with;
    # NLLB tokenizers otherwise default to English
    tokenizer.src_lang = lang_map[src_lang]

    # Preprocess for idioms
    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)

    # Improved chunking: split on sentence-ending punctuation (including
    # the Devanagari danda) while keeping the delimiters, so sentences
    # stay intact across chunk boundaries
    chunks = []
    current_chunk = ""
    for sentence in re.split(r"([.!?।]+)", preprocessed_text):
        if sentence.strip():
            if len(current_chunk) + len(sentence) < 450:  # leave room for tokenization
                current_chunk += sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)

    translated_text = ""
    for chunk in chunks:
        if chunk.strip():
            inputs = tokenizer(chunk, return_tensors="pt", padding=True,
                               truncation=True, max_length=512)

            # Look up the target-language token ID; convert_tokens_to_ids
            # works across transformers versions, unlike the removed
            # lang_code_to_id attribute
            tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)

            translated = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3,
            )
            translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
            translated_text += translated_chunk + " "

    return translated_text.strip()


# Function to save text as a file
def save_text_to_file(text, original_filename, prefix="translated"):
    output_filename = f"{prefix}_{os.path.basename(original_filename)}.txt"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(text)
    return output_filename


# Main processing function
def process_document(file, source_lang, target_lang, models):
    try:
        # Extract text from uploaded file
        text = extract_text(file)

        # Translate the text
        translated_text = translate_text(text, source_lang, target_lang, models)

        # Save the result
        if translated_text.startswith("Error:"):
            output_file = save_text_to_file(translated_text, file.name, prefix="error")
        else:
            output_file = save_text_to_file(translated_text, file.name)

        return output_file, translated_text
    except Exception as e:
        error_message = f"Error: {str(e)}"
        output_file = save_text_to_file(error_message, file.name, prefix="error")
        return output_file, error_message


# Streamlit interface
def main():
    st.title("Document Translator (NLLB-200)")
    st.write("Upload a document (PDF, DOCX, or TXT) and select source and "
             "target languages (English, Hindi, Marathi).")

    # Initialize models
    models = initialize_models()

    # File uploader
    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])

    # Language selection
    col1, col2 = st.columns(2)
    with col1:
        source_lang = st.selectbox("Source Language", ["en", "hi", "mr"], index=0)
    with col2:
        target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)

    if uploaded_file is not None and st.button("Translate"):
        with st.spinner("Translating..."):
            output_file, result_text = process_document(
                uploaded_file, source_lang, target_lang, models
            )

            # Display result
            st.text_area("Translated Text", result_text, height=300)

            # Provide download button
            with open(output_file, "rb") as file:
                st.download_button(
                    label="Download Translated Document",
                    data=file,
                    file_name=os.path.basename(output_file),
                    mime="text/plain",
                )


if __name__ == "__main__":
    main()
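# To try the app locally (assuming this file is saved as app.py and that
# streamlit, transformers, torch, PyPDF2, and python-docx are installed):
#     streamlit run app.py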