"""Streamlit app that translates documents and free text with NLLB-200.

Text is extracted from PDF, DOCX, TXT or image uploads (images go through
EasyOCR), known English idioms are replaced with hand-written Hindi/Marathi
equivalents, and the result is translated chunk by chunk with the
``facebook/nllb-200-distilled-600M`` model.
"""

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
from PyPDF2 import PdfReader
import docx
import os
import re
import asyncio
from concurrent.futures import ThreadPoolExecutor
import torch
# Replace pytesseract with easyocr
import easyocr
from PIL import Image
import numpy as np

# Set up async environment for torch
if torch.cuda.is_available():
    torch.multiprocessing.set_start_method('spawn', force=True)

# Initialize asyncio event loop
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

# UI language name (lower-cased) -> NLLB language tag.
NLLB_LANG_CODES = {
    "english": "eng_Latn",
    "hindi": "hin_Deva",
    "marathi": "mar_Deva",
}

# UI language name (lower-cased) -> two-letter code used by preprocess_idioms.
# An explicit table is required: slicing the name ("marathi"[:2] == "ma")
# does not produce the "mr" code the idiom tables are keyed on.
ISO_LANG_CODES = {
    "english": "en",
    "hindi": "hi",
    "marathi": "mr",
}

# Prefixes of the error strings returned (not raised) by translate_text_async.
_TRANSLATION_ERROR_PREFIXES = ("Error:", "Error during translation:")

# Soft upper bound, in characters, for one chunk handed to the tokenizer.
_MAX_CHUNK_CHARS = 450

# English -> Hindi idiom replacements (keys must be lower-case).
_EN_HI_IDIOMS = {
    "no piece of cake": "कोई आसान काम नहीं",
    "piece of cake": "बहुत आसान काम",
    "bite the bullet": "दांतों तले उंगली दबाना",
    "tackle it head-on": "सीधे मुकाबला करना",
    "fell into place": "सब कुछ ठीक हो गया",
    "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
    "with a little perseverance": "थोड़े से धैर्य से",
    # Additional common idioms
    "break a leg": "बहुत बहुत शुभकामनाएं",
    "hit the nail on the head": "बिल्कुल सही बात कहना",
    "once in a blue moon": "बहुत कम, कभी-कभार",
    "under the weather": "तबीयत ठीक नहीं",
    "cost an arm and a leg": "बहुत महंगा",
    "beating around the bush": "इधर-उधर की बात करना",
    "call it a day": "काम समाप्त करना",
    "burn the midnight oil": "रात-रात भर जागकर काम करना",
    "get the ball rolling": "शुरुआत करना",
    "pull yourself together": "खुद को संभालो",
    "shoot yourself in the foot": "अपना ही नुकसान करना",
    "take it with a grain of salt": "संदेह से लेना",
    "the last straw": "सहनशीलता की आखिरी सीमा",
    "time flies": "समय पंख लगाकर उड़ता है",
    "wrap your head around": "समझने की कोशिश करना",
    "cut corners": "काम में छोटा रास्ता अपनाना",
    "back to square one": "फिर से शुरू से",
    "blessing in disguise": "छिपा हुआ वरदान",
    "cry over spilled milk": "बीती बात पर पछताना",
    "keep your chin up": "हिम्मत रखना",
    # Work-related idioms
    "think outside the box": "नए तरीके से सोचना",
    "raise the bar": "मानक ऊंचा करना",
    "learning curve": "सीखने की प्रक्रिया",
    "up and running": "चालू और कार्यरत",
    "back to the drawing board": "फिर से योजना बनाना",
    # Project-related phrases
    "running into issues": "समस्याओं का सामना करना",
    "iron out the bugs": "खामियां दूर करना",
    "in the pipeline": "विचाराधीन",
    "moving forward": "आगे बढ़ते हुए",
    "touch base": "संपर्क में रहना",
    # Technical phrases
    "user-friendly": "उपयोगकर्ता के अनुकूल",
    "cutting-edge": "अत्याधुनिक",
    "state of the art": "अत्याधुनिक तकनीक",
    "proof of concept": "व्यवहार्यता का प्रमाण",
    "game changer": "खेल बदलने वाला",
}

# English -> Marathi idiom replacements (keys must be lower-case).
_EN_MR_IDIOMS = {
    "no piece of cake": "सोपं काम नाही",
    "piece of cake": "अतिशय सोपं काम",
    "bite the bullet": "कठीण निर्णय घेणे",
    "tackle it head-on": "समस्येला थेट सामोरे जाणे",
    "fell into place": "सगळं व्यवस्थित झालं",
    "see the light at the end of the tunnel": "अंधारातून उजेडाची किरण दिसणे",
    "with a little perseverance": "थोड्या धीराने",
    "break a leg": "खूप शुभेच्छा",
    "hit the nail on the head": "अगदी बरोबर बोललात",
    "once in a blue moon": "क्वचितच, कधीतरी",
    "under the weather": "तब्येत ठीक नसणे",
    "cost an arm and a leg": "खूप महाग",
    "beating around the bush": "गोल गोल फिरवणे",
    "call it a day": "दिवसाचं काम संपवणे",
    "burn the midnight oil": "रात्रंदिवस मेहनत करणे",
    "get the ball rolling": "सुरुवात करणे",
    "pull yourself together": "स्वतःला सावरा",
    "shoot yourself in the foot": "स्वतःचेच पाय स्वतः कापणे",
    "take it with a grain of salt": "साशंक दृष्टीने पाहणे",
    "the last straw": "सहनशक्तीची शेवटची मर्यादा",
    "time flies": "वेळ पंख लावून उडतो",
    "wrap your head around": "समजून घेण्याचा प्रयत्न करणे",
    "cut corners": "कमी वेळात काम उरकणे",
    "back to square one": "पुन्हा सुरुवातीला",
    "blessing in disguise": "आशीर्वाद लपलेला",
    "cry over spilled milk": "झालेल्या गोष्टीसाठी रडत बसणे",
    "keep your chin up": "धीर धरा",
    # Work-related idioms
    "think outside the box": "वेगळ्या पद्धतीने विचार करणे",
    "raise the bar": "पातळी उंचावणे",
    "learning curve": "शिकण्याची प्रक्रिया",
    "up and running": "सुरू आणि कार्यरत",
    "back to the drawing board": "पुन्हा नव्याने योजना आखणे",
    # Project-related phrases
    "running into issues": "अडचणींना सामोरे जाणे",
    "iron out the bugs": "त्रुटी दूर करणे",
    "in the pipeline": "विचाराधीन",
    "moving forward": "पुढे जाताना",
    "touch base": "संपर्कात राहणे",
    # Technical phrases
    "user-friendly": "वापरकर्त्यास सोयीस्कर",
    "cutting-edge": "अत्याधुनिक",
    "state of the art": "सर्वोत्कृष्ट तंत्रज्ञान",
    "proof of concept": "संकल्पनेची सिद्धता",
    "game changer": "खेळ बदलणारी गोष्ट",
}

# (source code, target code) -> idiom table.
_IDIOM_MAPS = {
    ("en", "hi"): _EN_HI_IDIOMS,
    ("en", "mr"): _EN_MR_IDIOMS,
}


# Initialize EasyOCR reader
@st.cache_resource
def load_ocr_reader():
    """Return an English EasyOCR reader, or None if it cannot be created.

    On failure the error is shown with ``st.error`` and None is returned.
    """
    try:
        return easyocr.Reader(['en'])  # Initialize for English
    except Exception as e:
        st.error(f"Error loading OCR reader: {str(e)}")
        return None


def extract_text_from_image(image_file):
    """Run OCR on an image and return the recognised text.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts (path or file object).

    Returns:
        The recognised text fragments joined by newlines, or a fixed
        "no text" message when OCR finds nothing.

    Raises:
        Exception: If the OCR reader is unavailable or the image cannot be
            opened or processed.
    """
    try:
        # Get the OCR reader
        reader = load_ocr_reader()
        if reader is None:
            raise Exception("Failed to initialize OCR reader")

        # Normalise to RGB first: palette images would otherwise be turned
        # into an array of palette indices rather than pixel colours.
        image = Image.open(image_file).convert("RGB")
        image_np = np.array(image)

        # Perform OCR
        results = reader.readtext(image_np)
        if not results:
            return "No text was detected in the image."

        # Each result is indexed as result[1] for its text.
        text = "\n".join(result[1] for result in results)
        return text.strip()
    except Exception as e:
        raise Exception(f"Error extracting text from image: {str(e)}") from e


def _rewind(file):
    """Move *file* back to its start when it supports seeking."""
    seek = getattr(file, "seek", None)
    if callable(seek):
        try:
            seek(0)
        except (OSError, ValueError):
            # Non-seekable stream: read from wherever it currently is.
            pass


def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, TXT or image file.

    Args:
        file: A file-like object with a ``name`` attribute; the extension of
            ``name`` selects the extraction method.

    Returns:
        The extracted text with surrounding whitespace stripped.

    Raises:
        Exception: If the format is unsupported or extraction fails. The
            message is prefixed with "Error extracting text from file: ".
    """
    try:
        ext = os.path.splitext(file.name)[1].lower()

        # The same upload may be read more than once per run (main() previews
        # it, then process_document() extracts it again), so always start
        # from the beginning of the stream.
        _rewind(file)

        if ext == ".pdf":
            try:
                reader = PdfReader(file)
                # Guard against pages that yield no text value.
                pages = [page.extract_text() or "" for page in reader.pages]
                return "\n".join(pages).strip()
            except Exception as e:
                raise Exception(f"Error reading PDF file: {str(e)}") from e
        elif ext == ".docx":
            try:
                doc = docx.Document(file)
                return "\n".join(para.text for para in doc.paragraphs).strip()
            except Exception as e:
                raise Exception(f"Error reading DOCX file: {str(e)}") from e
        elif ext == ".txt":
            try:
                return file.read().decode("utf-8").strip()
            except Exception as e:
                raise Exception(f"Error reading TXT file: {str(e)}") from e
        elif ext in [".jpg", ".jpeg", ".png"]:
            try:
                return extract_text_from_image(file)
            except Exception as e:
                raise Exception(f"Error processing image file: {str(e)}") from e
        else:
            raise ValueError("Unsupported file format. Please upload PDF, DOCX, TXT, or image files (JPG, JPEG, PNG).")
    except Exception as e:
        raise Exception(f"Error extracting text from file: {str(e)}") from e


# Load NLLB model and tokenizer with error handling
@st.cache_resource
def load_translation_model():
    """Load the NLLB tokenizer and model.

    Returns:
        ``(tokenizer, model)``, or ``(None, None)`` after reporting the
        failure with ``st.error``.
    """
    try:
        model_name = "facebook/nllb-200-distilled-600M"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        return tokenizer, model
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return None, None


# Initialize model
@st.cache_resource
def initialize_models():
    """Return ``{"nllb": (tokenizer, model)}``, or None if loading failed."""
    tokenizer, model = load_translation_model()
    if tokenizer is None or model is None:
        st.error("Failed to initialize models")
        return None
    return {"nllb": (tokenizer, model)}


# Enhanced idiom mapping with more comprehensive translations
def preprocess_idioms(text, src_lang, tgt_lang):
    """Replace known English idioms in *text* with target-language phrases.

    Args:
        text: The source text.
        src_lang: Two-letter source code; only "en" has idiom tables.
        tgt_lang: Two-letter target code; "hi" and "mr" have idiom tables.

    Returns:
        *text* with every idiom occurrence (case-insensitive, whole phrases
        only) replaced; unchanged when no table exists for the pair.
    """
    idiom_map = _IDIOM_MAPS.get((src_lang, tgt_lang))
    if not idiom_map:
        return text

    # Longest first so "no piece of cake" wins over "piece of cake".
    sorted_idioms = sorted(idiom_map, key=len, reverse=True)
    alternation = '|'.join(map(re.escape, sorted_idioms))
    # Lookarounds keep idioms from matching inside a longer word
    # (e.g. "cut corners" inside "shortcut corners").
    pattern = r"(?<!\w)(?:" + alternation + r")(?!\w)"

    def replace_idiom(match):
        return idiom_map[match.group(0).lower()]

    return re.sub(pattern, replace_idiom, text, flags=re.IGNORECASE)


def _split_into_chunks(text, max_chars=_MAX_CHUNK_CHARS):
    """Split *text* into chunks of whole sentences under *max_chars*.

    A single sentence longer than *max_chars* becomes its own chunk.
    """
    # With one capture group, re.split alternates: text, delimiter, text, ...
    parts = re.split(r'([.!?।]+)', text)
    sentences = []
    for index in range(0, len(parts), 2):
        terminator = parts[index + 1] if index + 1 < len(parts) else ""
        # Keep the terminator attached so no chunk starts with punctuation.
        sentence = parts[index] + terminator
        if sentence.strip():
            sentences.append(sentence)

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if current_chunk and len(current_chunk) + len(sentence) >= max_chars:
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk += sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


# Async translation function with fixed idiom processing
async def translate_text_async(text, src_lang, tgt_lang, models):
    """Translate *text* between English, Hindi and Marathi.

    Args:
        text: Text to translate.
        src_lang: Source language name ("english", "hindi", "marathi"; any case).
        tgt_lang: Target language name (same choices).
        models: Mapping with key ``"nllb"`` holding ``(tokenizer, model)``.

    Returns:
        The translated text. Failures are returned as strings rather than
        raised: "Error: Unsupported language combination" or
        "Error during translation: ...".
    """
    src_lang_simple = src_lang.lower()
    tgt_lang_simple = tgt_lang.lower()

    if src_lang_simple == tgt_lang_simple:
        return text

    if src_lang_simple not in NLLB_LANG_CODES or tgt_lang_simple not in NLLB_LANG_CODES:
        return "Error: Unsupported language combination"

    try:
        # Process idioms first
        preprocessed_text = preprocess_idioms(
            text,
            ISO_LANG_CODES[src_lang_simple],
            ISO_LANG_CODES[tgt_lang_simple],
        )

        tokenizer, model = models["nllb"]
        # Tell the tokenizer which language the input is in; otherwise its
        # default source tag is used for Hindi and Marathi input as well.
        tokenizer.src_lang = NLLB_LANG_CODES[src_lang_simple]
        tgt_lang_id = tokenizer.convert_tokens_to_ids(NLLB_LANG_CODES[tgt_lang_simple])

        translated_chunks = []
        # Translate each chunk
        for chunk in _split_into_chunks(preprocessed_text):
            if not chunk.strip():
                continue
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            translated = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3,
            )
            translated_chunks.append(
                tokenizer.decode(translated[0], skip_special_tokens=True)
            )

        return " ".join(translated_chunks).strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"


# Synchronous wrapper for translation
def translate_text(text, src_lang, tgt_lang, models):
    """Run ``translate_text_async`` to completion on a fresh event loop."""
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(
            translate_text_async(text, src_lang, tgt_lang, models)
        )
    finally:
        loop.close()


def save_text_to_file(text, original_filename, prefix="translated"):
    """Write *text* to ``<prefix>_<original base name>.txt``.

    The file is created relative to the current working directory; only the
    base name of *original_filename* is used.

    Returns:
        The output file name, or None after reporting a failure with
        ``st.error``.
    """
    try:
        # Get the original base name without directory or extension
        base_name = os.path.splitext(os.path.basename(original_filename))[0]
        output_filename = f"{prefix}_{base_name}.txt"

        # Save all translations as text files for simplicity and build speed
        with open(output_filename, "w", encoding="utf-8") as f:
            f.write(text)
        return output_filename
    except Exception as e:
        st.error(f"Error saving file: {str(e)}")
        return None


# Modified process_document function to handle multiple formats
def process_document(file, source_lang, target_lang, models):
    """Extract, translate and save the contents of an uploaded document.

    Returns:
        ``(output_file, text)`` where *text* is the translation or an error
        message. Errors are saved with the "error" file prefix, and
        *output_file* may be None if even the error file could not be saved.
    """
    try:
        text = extract_text(file)
        translated_text = translate_text(text, source_lang, target_lang, models)

        # translate_text reports failures as strings; recognise both forms so
        # a failed translation is never saved as a "translated_" file.
        if translated_text.startswith(_TRANSLATION_ERROR_PREFIXES):
            output_file = save_text_to_file(translated_text, file.name, prefix="error")
        else:
            output_file = save_text_to_file(translated_text, file.name)

        if output_file is None:
            raise Exception("Failed to save output file")

        return output_file, translated_text
    except Exception as e:
        error_message = f"Error: {str(e)}"
        output_file = save_text_to_file(error_message, file.name, prefix="error")
        return output_file, error_message


# Modified main function to ensure proper language handling
def main():
    """Render the Streamlit UI with document and text translation tabs."""
    st.title("Document Translation Toolkit")

    # Initialize models with error handling
    models = initialize_models()
    if models is None:
        st.error("Failed to initialize translation models. Please try again.")
        return

    # Create tabs for different translation modes
    tab1, tab2 = st.tabs(["Document Translation", "Text Translation"])

    # Document Translation Tab
    with tab1:
        st.subheader("Document Translation")
        st.write("Upload a document (PDF, DOCX, TXT, or Image) and select languages.")

        uploaded_file = st.file_uploader(
            "Upload Document",
            type=["pdf", "docx", "txt", "jpg", "jpeg", "png"],
            key="doc_uploader"
        )

        col1, col2 = st.columns(2)
        with col1:
            source_lang = st.selectbox(
                "Source Language",
                ["English", "Hindi", "Marathi"],
                index=0,
                key="doc_src"
            )
        with col2:
            target_lang = st.selectbox(
                "Target Language",
                ["English", "Hindi", "Marathi"],
                index=1,
                key="doc_tgt"
            )

        if uploaded_file is not None and st.button("Translate Document"):
            try:
                with st.spinner("Translating..."):
                    # Extract and show input text
                    input_text = extract_text(uploaded_file)
                    st.subheader("Input Text")
                    st.text_area("Original Text", input_text, height=200)

                    # Translate and show output text
                    output_file, result_text = process_document(
                        uploaded_file,
                        source_lang.lower(),
                        target_lang.lower(),
                        models
                    )
                    st.subheader("Translated Text")
                    st.text_area("Translation", result_text, height=200)

                    # Provide download button with correct MIME type
                    if output_file and os.path.exists(output_file):
                        with open(output_file, "rb") as file:
                            # Set appropriate MIME type based on file extension
                            ext = os.path.splitext(output_file)[1].lower()
                            mime_types = {
                                '.pdf': 'application/pdf',
                                '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                                '.txt': 'text/plain',
                                '.jpg': 'image/jpeg',
                                '.jpeg': 'image/jpeg',
                                '.png': 'image/png'
                            }
                            mime_type = mime_types.get(ext, 'text/plain')
                            st.download_button(
                                label="Download Translated Document",
                                data=file,
                                file_name=os.path.basename(output_file),
                                mime=mime_type
                            )
                    else:
                        st.error("Failed to generate output file")
            except Exception as e:
                st.error(f"An error occurred during translation: {str(e)}")

    # Text Translation Tab
    with tab2:
        st.subheader("Text Translation")
        st.write("Enter text directly for translation.")

        col1, col2 = st.columns(2)
        with col1:
            text_source_lang = st.selectbox(
                "Source Language",
                ["English", "Hindi", "Marathi"],
                index=0,
                key="text_src"
            )
        with col2:
            text_target_lang = st.selectbox(
                "Target Language",
                ["English", "Hindi", "Marathi"],
                index=1,
                key="text_tgt"
            )

        input_text = st.text_area("Enter text to translate", height=150)

        if input_text and st.button("Translate Text"):
            try:
                with st.spinner("Translating..."):
                    # Translate the input text
                    translated_text = translate_text(
                        input_text,
                        text_source_lang.lower(),
                        text_target_lang.lower(),
                        models
                    )

                    # Show translation result
                    st.text_area("Translation", translated_text, height=150)

                    # Add download button for translated text
                    st.download_button(
                        label="Download Translation",
                        data=translated_text,
                        file_name="translation.txt",
                        mime="text/plain"
                    )
            except Exception as e:
                st.error(f"An error occurred during translation: {str(e)}")


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        st.error(f"Application error: {str(e)}")