Spaces:

gauravchand11
/

try

Build error

try

File size: 21,480 Bytes

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
from PyPDF2 import PdfReader
import docx
import os
import re
import asyncio
from concurrent.futures import ThreadPoolExecutor
import torch
# Replace pytesseract with easyocr
import easyocr
from PIL import Image
import numpy as np

# Set up async environment for torch
# When a CUDA device is visible, select the "spawn" multiprocessing start
# method; force=True replaces any start method that was already chosen.
if torch.cuda.is_available():
    torch.multiprocessing.set_start_method('spawn', force=True)

# Initialize asyncio event loop
# Reuse the current event loop when asyncio can supply one; if that raises
# RuntimeError, create a fresh loop and install it as the current loop.
# NOTE(review): translate_text() creates its own loop on every call, so this
# module-level `loop` appears unused in the visible code -- confirm.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

# Initialize EasyOCR reader
@st.cache_resource
def _create_ocr_reader():
    """Build the English EasyOCR reader; only a successful build is cached.

    Any exception propagates to the caller, so a failed attempt leaves the
    cache empty and the next call tries again.
    """
    return easyocr.Reader(['en'])  # Initialize for English


def load_ocr_reader():
    """Return the shared EasyOCR reader, or ``None`` if it cannot be created.

    The reader itself is cached by ``_create_ocr_reader``. This wrapper is
    deliberately NOT cached: caching it would also cache the ``None`` failure
    result, permanently disabling OCR after one transient error.

    Returns:
        The ``easyocr.Reader`` instance, or ``None`` after reporting the
        failure through ``st.error``.
    """
    try:
        return _create_ocr_reader()
    except Exception as e:
        st.error(f"Error loading OCR reader: {str(e)}")
        return None

# Modified extract_text_from_image function with better error handling
def extract_text_from_image(image_file):
    """Run OCR on an image and return the recognised text.

    Args:
        image_file: A path or binary file-like object accepted by
            ``PIL.Image.open``.

    Returns:
        The recognised text fragments joined with newlines and stripped, or
        the literal message "No text was detected in the image." when the
        OCR reader returns no results.

    Raises:
        Exception: If the OCR reader is unavailable or any step fails. The
            original error is chained as ``__cause__``.
    """
    try:
        # Get the OCR reader
        reader = load_ocr_reader()
        if reader is None:
            raise Exception("Failed to initialize OCR reader")

        # Normalise to 3-channel RGB before handing pixels to the OCR engine:
        # palette ("P"), 1-bit and CMYK images otherwise become arrays of
        # palette indices / booleans / 4 ink channels rather than colours.
        image = Image.open(image_file).convert("RGB")
        image_np = np.array(image)

        # Perform OCR
        results = reader.readtext(image_np)

        if not results:
            return "No text was detected in the image."

        # The text of each detection is taken from index 1 of its result
        text = "\n".join(result[1] for result in results)
        return text.strip()
    except Exception as e:
        raise Exception(f"Error extracting text from image: {str(e)}") from e

# Modified extract_text function to support all file types
def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, TXT or image file.

    The file type is chosen from the extension of ``file.name``
    (case-insensitive).

    Args:
        file: A binary file-like object exposing ``name`` and ``read``.
            If it also exposes ``seek`` it is rewound first, so the same
            object can be passed to this function more than once.

    Returns:
        The extracted text with leading/trailing whitespace removed.

    Raises:
        Exception: For unsupported extensions and for any read/parse
            failure. The message is prefixed with
            "Error extracting text from file: " and the underlying error is
            chained as ``__cause__``.
    """
    try:
        ext = os.path.splitext(file.name)[1].lower()

        # Rewind: a previous read leaves the stream at EOF, which would make
        # a second TXT extraction silently return an empty string.
        seek = getattr(file, "seek", None)
        if callable(seek):
            try:
                seek(0)
            except (OSError, ValueError):
                # Non-seekable or closed stream: fall through and let the
                # reader below report the real problem, if any.
                pass

        if ext == ".pdf":
            try:
                reader = PdfReader(file)
                # `or ""` guards against pages for which no text is returned
                pages = [(page.extract_text() or "") for page in reader.pages]
                return "\n".join(pages).strip()
            except Exception as e:
                raise Exception(f"Error reading PDF file: {str(e)}") from e

        elif ext == ".docx":
            try:
                doc = docx.Document(file)
                return "\n".join(para.text for para in doc.paragraphs).strip()
            except Exception as e:
                raise Exception(f"Error reading DOCX file: {str(e)}") from e

        elif ext == ".txt":
            try:
                # "utf-8-sig" decodes plain UTF-8 and also drops a leading
                # byte-order mark, which str.strip() would not remove.
                return file.read().decode("utf-8-sig").strip()
            except Exception as e:
                raise Exception(f"Error reading TXT file: {str(e)}") from e

        elif ext in [".jpg", ".jpeg", ".png"]:
            try:
                return extract_text_from_image(file)
            except Exception as e:
                raise Exception(f"Error processing image file: {str(e)}") from e

        else:
            raise ValueError("Unsupported file format. Please upload PDF, DOCX, TXT, or image files (JPG, JPEG, PNG).")
    except Exception as e:
        raise Exception(f"Error extracting text from file: {str(e)}") from e

# Load NLLB model and tokenizer with error handling
@st.cache_resource
def _load_nllb_pair(model_name):
    """Load and cache the tokenizer/model pair for ``model_name``.

    Exceptions propagate to the caller, so only a successful load is stored
    in the resource cache and a failed attempt can be retried later.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model


def load_translation_model():
    """Return the NLLB ``(tokenizer, model)`` pair, or ``(None, None)`` on failure.

    The heavy objects are cached by ``_load_nllb_pair``. This wrapper is not
    cached itself, so a transient failure (reported via ``st.error``) is not
    remembered as the permanent result.
    """
    try:
        return _load_nllb_pair("facebook/nllb-200-distilled-600M")
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return None, None

# Initialize model
def initialize_models():
    """Return ``{"nllb": (tokenizer, model)}``, or ``None`` if loading failed.

    Not cached: the underlying model objects are already cached, building the
    small wrapper dict is cheap, and caching here would pin a ``None``
    failure result.
    """
    tokenizer, model = load_translation_model()
    if tokenizer is None or model is None:
        st.error("Failed to initialize models")
        return None
    return {"nllb": (tokenizer, model)}



# Enhanced idiom mapping with more comprehensive translations
def preprocess_idioms(text, src_lang, tgt_lang):
    """Replace known English idioms in ``text`` with target-language phrases.

    Only the pairs ``("en", "hi")`` and ``("en", "mr")`` have an idiom table;
    for any other pair the text is returned unchanged.

    Matching is case-insensitive and restricted to whole phrases: an idiom is
    not replaced when it is merely a prefix/suffix of a longer word (e.g.
    "the last straw" inside "the last strawberry"). Longer idioms are tried
    first so "no piece of cake" wins over "piece of cake".

    Args:
        text: The source text.
        src_lang: Two-letter source language code (e.g. "en").
        tgt_lang: Two-letter target language code ("hi" or "mr").

    Returns:
        The text with every recognised idiom substituted.
    """
    idiom_map = {}

    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            "no piece of cake": "कोई आसान काम नहीं",
            "piece of cake": "बहुत आसान काम",
            # NOTE(review): this Hindi phrase is commonly used for "to be
            # astonished"; confirm it is the intended rendering.
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "सीधे मुकाबला करना",
            "fell into place": "सब कुछ ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
            "with a little perseverance": "थोड़े से धैर्य से",

            # Additional common idioms
            "break a leg": "बहुत बहुत शुभकामनाएं",
            "hit the nail on the head": "बिल्कुल सही बात कहना",
            "once in a blue moon": "बहुत कम, कभी-कभार",
            "under the weather": "तबीयत ठीक नहीं",
            "cost an arm and a leg": "बहुत महंगा",
            "beating around the bush": "इधर-उधर की बात करना",
            "call it a day": "काम समाप्त करना",
            "burn the midnight oil": "रात-रात भर जागकर काम करना",
            "get the ball rolling": "शुरुआत करना",
            "pull yourself together": "खुद को संभालो",
            "shoot yourself in the foot": "अपना ही नुकसान करना",
            "take it with a grain of salt": "संदेह से लेना",
            "the last straw": "सहनशीलता की आखिरी सीमा",
            "time flies": "समय पंख लगाकर उड़ता है",
            "wrap your head around": "समझने की कोशिश करना",
            "cut corners": "काम में छोटा रास्ता अपनाना",
            "back to square one": "फिर से शुरू से",
            "blessing in disguise": "छिपा हुआ वरदान",
            "cry over spilled milk": "बीती बात पर पछताना",
            "keep your chin up": "हिम्मत रखना",

            # Work-related idioms
            "think outside the box": "नए तरीके से सोचना",
            "raise the bar": "मानक ऊंचा करना",
            "learning curve": "सीखने की प्रक्रिया",
            "up and running": "चालू और कार्यरत",
            "back to the drawing board": "फिर से योजना बनाना",

            # Project-related phrases
            "running into issues": "समस्याओं का सामना करना",
            "iron out the bugs": "खामियां दूर करना",
            "in the pipeline": "विचाराधीन",
            "moving forward": "आगे बढ़ते हुए",
            "touch base": "संपर्क में रहना",

            # Technical phrases
            "user-friendly": "उपयोगकर्ता के अनुकूल",
            "cutting-edge": "अत्याधुनिक",
            "state of the art": "अत्याधुनिक तकनीक",
            "proof of concept": "व्यवहार्यता का प्रमाण",
            "game changer": "खेल बदलने वाला"
        }
    elif src_lang == "en" and tgt_lang == "mr":
        idiom_map = {
            "no piece of cake": "सोपं काम नाही",
            "piece of cake": "अतिशय सोपं काम",
            "bite the bullet": "कठीण निर्णय घेणे",
            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
            "fell into place": "सगळं व्यवस्थित झालं",
            "see the light at the end of the tunnel": "अंधारातून उजेडाची किरण दिसणे",
            "with a little perseverance": "थोड्या धीराने",
            "break a leg": "खूप शुभेच्छा",
            "hit the nail on the head": "अगदी बरोबर बोललात",
            "once in a blue moon": "क्वचितच, कधीतरी",
            "under the weather": "तब्येत ठीक नसणे",
            "cost an arm and a leg": "खूप महाग",
            "beating around the bush": "गोल गोल फिरवणे",
            "call it a day": "दिवसाचं काम संपवणे",
            "burn the midnight oil": "रात्रंदिवस मेहनत करणे",
            "get the ball rolling": "सुरुवात करणे",
            "pull yourself together": "स्वतःला सावरा",
            "shoot yourself in the foot": "स्वतःचेच पाय स्वतः कापणे",
            "take it with a grain of salt": "साशंक दृष्टीने पाहणे",
            "the last straw": "सहनशक्तीची शेवटची मर्यादा",
            "time flies": "वेळ पंख लावून उडतो",
            "wrap your head around": "समजून घेण्याचा प्रयत्न करणे",
            "cut corners": "कमी वेळात काम उरकणे",
            "back to square one": "पुन्हा सुरुवातीला",
            "blessing in disguise": "आशीर्वाद लपलेला",
            "cry over spilled milk": "झालेल्या गोष्टीसाठी रडत बसणे",
            "keep your chin up": "धीर धरा",

            # Work-related idioms
            "think outside the box": "वेगळ्या पद्धतीने विचार करणे",
            "raise the bar": "पातळी उंचावणे",
            "learning curve": "शिकण्याची प्रक्रिया",
            "up and running": "सुरू आणि कार्यरत",
            "back to the drawing board": "पुन्हा नव्याने योजना आखणे",

            # Project-related phrases
            "running into issues": "अडचणींना सामोरे जाणे",
            "iron out the bugs": "त्रुटी दूर करणे",
            "in the pipeline": "विचाराधीन",
            "moving forward": "पुढे जाताना",
            "touch base": "संपर्कात राहणे",

            # Technical phrases
            "user-friendly": "वापरकर्त्यास सोयीस्कर",
            "cutting-edge": "अत्याधुनिक",
            "state of the art": "सर्वोत्कृष्ट तंत्रज्ञान",
            "proof of concept": "संकल्पनेची सिद्धता",
            "game changer": "खेळ बदलणारी गोष्ट"
        }

    if not idiom_map:
        return text

    # Longest alternatives first so overlapping idioms resolve to the most
    # specific one; the lookarounds stop matches inside longer words.
    alternatives = '|'.join(
        re.escape(idiom) for idiom in sorted(idiom_map, key=len, reverse=True)
    )
    pattern = re.compile(rf'(?<!\w)(?:{alternatives})(?!\w)', re.IGNORECASE)

    def replace_idiom(match):
        matched = match.group(0)
        # IGNORECASE can match characters (e.g. 'ſ' for 's') whose lower()
        # form is not the ASCII key; casefold() covers those, and the
        # fallback keeps the original text instead of raising KeyError.
        return idiom_map.get(matched.casefold(), matched)

    return pattern.sub(replace_idiom, text)

# Async translation function with fixed idiom processing
async def translate_text_async(text, src_lang, tgt_lang, models):
    """Translate ``text`` between English, Hindi and Marathi with the NLLB model.

    The coroutine contains no ``await``; it runs the whole translation
    synchronously when driven by an event loop.

    Args:
        text: Text to translate.
        src_lang: Source language name ("english", "hindi" or "marathi",
            any letter case).
        tgt_lang: Target language name, same vocabulary as ``src_lang``.
        models: Mapping whose ``"nllb"`` entry is a ``(tokenizer, model)``
            pair.

    Returns:
        The translated text. When both languages are the same the input is
        returned unchanged. Failures are reported in-band as strings starting
        with "Error: " (unsupported language) or "Error during translation: ".
    """
    # Updated language mapping handling
    src_lang_simple = src_lang.lower()
    tgt_lang_simple = tgt_lang.lower()

    # Compare the normalised names so "English" -> "english" is a no-op too
    if src_lang_simple == tgt_lang_simple:
        return text

    lang_map = {"english": "eng_Latn", "hindi": "hin_Deva", "marathi": "mar_Deva"}
    # Codes understood by preprocess_idioms(). Slicing the name to two
    # letters would yield "ma" for Marathi, which never matches its "mr".
    idiom_codes = {"english": "en", "hindi": "hi", "marathi": "mr"}

    if src_lang_simple not in lang_map or tgt_lang_simple not in lang_map:
        return "Error: Unsupported language combination"

    try:
        # Process idioms first
        preprocessed_text = preprocess_idioms(
            text, idiom_codes[src_lang_simple], idiom_codes[tgt_lang_simple]
        )

        tokenizer, model = models["nllb"]
        # Tag the input with its real source language instead of whatever
        # the tokenizer was last configured with.
        # NOTE(review): this mutates the shared tokenizer object; confirm
        # translations are not run concurrently on the same instance.
        tokenizer.src_lang = lang_map[src_lang_simple]
        # Loop-invariant: the id of the target-language token forced as BOS
        tgt_lang_id = tokenizer.convert_tokens_to_ids(lang_map[tgt_lang_simple])

        chunks = []
        current_chunk = ""

        # Split text into chunks while preserving sentences. The capturing
        # group keeps the terminators as separate pieces, so they are
        # appended back onto the chunk being built.
        for sentence in re.split(r'([.!?।]+)', preprocessed_text):
            if sentence.strip():
                if len(current_chunk) + len(sentence) < 450:
                    current_chunk += sentence
                else:
                    if current_chunk:
                        chunks.append(current_chunk)
                    current_chunk = sentence

        if current_chunk:
            chunks.append(current_chunk)

        translated_chunks = []

        # Translate each chunk
        for chunk in chunks:
            if chunk.strip():
                inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)

                translated = model.generate(
                    **inputs,
                    forced_bos_token_id=tgt_lang_id,
                    max_length=512,
                    num_beams=5,
                    length_penalty=1.0,
                    no_repeat_ngram_size=3
                )

                translated_chunks.append(
                    tokenizer.decode(translated[0], skip_special_tokens=True)
                )

        return " ".join(translated_chunks).strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"

# Synchronous wrapper for translation
def translate_text(text, src_lang, tgt_lang, models):
    """Synchronously run ``translate_text_async`` on a private event loop.

    Args:
        text: Text to translate.
        src_lang: Source language name.
        tgt_lang: Target language name.
        models: Model mapping forwarded to ``translate_text_async``.

    Returns:
        Whatever ``translate_text_async`` returns (translated text or an
        in-band error string).
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(translate_text_async(text, src_lang, tgt_lang, models))
    finally:
        # Uninstall the loop before closing it; otherwise the thread keeps a
        # closed loop as its "current" loop and later asyncio users fail with
        # "Event loop is closed".
        asyncio.set_event_loop(None)
        loop.close()

def save_text_to_file(text, original_filename, prefix="translated"):
    """Write ``text`` to ``<prefix>_<base name>.txt`` in the working directory.

    Args:
        text: Content to write (encoded as UTF-8).
        original_filename: Name or path of the source document; only its
            base name without the final extension is reused.
        prefix: Leading tag for the output file name.

    Returns:
        The output file name, or ``None`` if anything went wrong (the error
        is shown through ``st.error``).
    """
    try:
        # Output is always plain text, regardless of the input format
        stem, _ = os.path.splitext(os.path.basename(original_filename))
        target = f"{prefix}_{stem}.txt"
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(text)
    except Exception as err:
        st.error(f"Error saving file: {str(err)}")
        return None
    return target

# Modified process_document function to handle multiple formats
def process_document(file, source_lang, target_lang, models):
    """Extract, translate and save an uploaded document.

    Args:
        file: Uploaded file object (needs ``name``; passed to
            ``extract_text``).
        source_lang: Source language name.
        target_lang: Target language name.
        models: Model mapping forwarded to ``translate_text``.

    Returns:
        ``(output_file, text)``. ``text`` is the translation, or an error
        message starting with "Error". ``output_file`` is the saved file
        name (``error_*`` for failures, ``translated_*`` otherwise) and may
        be ``None`` if even the error report could not be written.
    """
    try:
        text = extract_text(file)
        translated_text = translate_text(text, source_lang, target_lang, models)

        # The translator reports failures in-band with either of these two
        # prefixes; both must be filed under the "error" name.
        failed = translated_text.startswith(("Error:", "Error during translation:"))
        prefix = "error" if failed else "translated"
        output_file = save_text_to_file(translated_text, file.name, prefix=prefix)

        if output_file is None:
            raise Exception("Failed to save output file")

        return output_file, translated_text
    except Exception as e:
        error_message = f"Error: {str(e)}"
        output_file = save_text_to_file(error_message, file.name, prefix="error")
        return output_file, error_message


# Modified main function to ensure proper language handling
def main():
    """Render the Streamlit UI with a document tab and a free-text tab.

    Loads the translation models first and stops with an error message if
    they are unavailable. The document tab extracts the upload's text, shows
    it, translates it through ``process_document`` and offers the saved
    result for download. The text tab translates the text area's content
    directly and offers it as ``translation.txt``.
    """
    st.title("Document Translation Toolkit")

    # Initialize models with error handling
    models = initialize_models()
    if models is None:
        st.error("Failed to initialize translation models. Please try again.")
        return

    # Create tabs for different translation modes
    tab1, tab2 = st.tabs(["Document Translation", "Text Translation"])

    # Document Translation Tab
    with tab1:
        st.subheader("Document Translation")
        st.write("Upload a document (PDF, DOCX, TXT, or Image) and select languages.")

        uploaded_file = st.file_uploader(
            "Upload Document",
            type=["pdf", "docx", "txt", "jpg", "jpeg", "png"],
            key="doc_uploader"
        )

        col1, col2 = st.columns(2)
        with col1:
            source_lang = st.selectbox(
                "Source Language",
                ["English", "Hindi", "Marathi"],
                index=0,
                key="doc_src"
            )
        with col2:
            target_lang = st.selectbox(
                "Target Language",
                ["English", "Hindi", "Marathi"],
                index=1,
                key="doc_tgt"
            )

        if uploaded_file is not None and st.button("Translate Document"):
            try:
                with st.spinner("Translating..."):
                    # Extract and show input text
                    input_text = extract_text(uploaded_file)
                    st.subheader("Input Text")
                    st.text_area("Original Text", input_text, height=200)

                    # The preview extraction above may have consumed the
                    # upload's stream; rewind it so process_document(), which
                    # extracts again, reads the document from the start.
                    uploaded_file.seek(0)

                    # Translate and show output text
                    output_file, result_text = process_document(
                        uploaded_file,
                        source_lang.lower(),
                        target_lang.lower(),
                        models
                    )

                    st.subheader("Translated Text")
                    st.text_area("Translation", result_text, height=200)

                    # Provide download button with correct MIME type
                    if output_file and os.path.exists(output_file):
                        with open(output_file, "rb") as file:
                            # Set appropriate MIME type based on file extension
                            ext = os.path.splitext(output_file)[1].lower()
                            mime_types = {
                                '.pdf': 'application/pdf',
                                '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                                '.txt': 'text/plain',
                                '.jpg': 'image/jpeg',
                                '.jpeg': 'image/jpeg',
                                '.png': 'image/png'
                            }
                            # Unknown extensions fall back to plain text
                            mime_type = mime_types.get(ext, 'text/plain')

                            st.download_button(
                                label="Download Translated Document",
                                data=file,
                                file_name=os.path.basename(output_file),
                                mime=mime_type
                            )
                    else:
                        st.error("Failed to generate output file")
            except Exception as e:
                st.error(f"An error occurred during translation: {str(e)}")

    # Text Translation Tab
    with tab2:
        st.subheader("Text Translation")
        st.write("Enter text directly for translation.")

        col1, col2 = st.columns(2)
        with col1:
            text_source_lang = st.selectbox(
                "Source Language",
                ["English", "Hindi", "Marathi"],
                index=0,
                key="text_src"
            )
        with col2:
            text_target_lang = st.selectbox(
                "Target Language",
                ["English", "Hindi", "Marathi"],
                index=1,
                key="text_tgt"
            )

        input_text = st.text_area("Enter text to translate", height=150)

        if input_text and st.button("Translate Text"):
            try:
                with st.spinner("Translating..."):
                    # Translate the input text
                    translated_text = translate_text(
                        input_text,
                        text_source_lang.lower(),
                        text_target_lang.lower(),
                        models
                    )

                    # Show translation result
                    st.text_area("Translation", translated_text, height=150)

                    # Add download button for translated text
                    st.download_button(
                        label="Download Translation",
                        data=translated_text,
                        file_name="translation.txt",
                        mime="text/plain"
                    )
            except Exception as e:
                st.error(f"An error occurred during translation: {str(e)}")

# Script entry point: run the app and show any exception that escapes main()
# on the page via st.error instead of letting it propagate.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        st.error(f"Application error: {str(e)}")