Spaces:

gauravchand11
/

try

Build error

App Files Files Community

gauravchand11 committed 16 days ago

Commit

829aed6

verified ·

1 Parent(s): f72c1a5

Update app.py

Browse files

Files changed (1) hide show

app.py +377 -124

app.py CHANGED Viewed

@@ -4,27 +4,133 @@ from PyPDF2 import PdfReader
 import docx
 import os
 import re
-# Load NLLB model and tokenizer
 @st.cache_resource
 def load_translation_model():
-    model_name = "facebook/nllb-200-distilled-600M"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-    return tokenizer, model
 # Initialize model
 @st.cache_resource
 def initialize_models():
     tokenizer, model = load_translation_model()
     return {"nllb": (tokenizer, model)}
 # Enhanced idiom mapping with more comprehensive translations
 def preprocess_idioms(text, src_lang, tgt_lang):
     if src_lang == "en" and tgt_lang == "hi":
         idiom_map = {
-            # Basic phrases
-            "no piece of cake": "कोई आसान काम नहीं",
             "piece of cake": "बहुत आसान काम",
             "bite the bullet": "दांतों तले उंगली दबाना",
             "tackle it head-on": "सीधे मुकाबला करना",
@@ -75,162 +181,309 @@ def preprocess_idioms(text, src_lang, tgt_lang):
             "proof of concept": "व्यवहार्यता का प्रमाण",
             "game changer": "खेल बदलने वाला"
         }
-        # Sort idioms by length (longest first) to handle overlapping phrases
         sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
-        # Create a single regex pattern for all idioms
         pattern = '|'.join(map(re.escape, sorted_idioms))
         def replace_idiom(match):
             return idiom_map[match.group(0).lower()]
-        # Replace all idioms in one pass, case-insensitive
         text = re.sub(pattern, replace_idiom, text, flags=re.IGNORECASE)
     return text
-# Function to extract text from different file types
-def extract_text(file):
-    ext = os.path.splitext(file.name)[1].lower()
-    if ext == ".pdf":
-        reader = PdfReader(file)
-        text = ""
-        for page in reader.pages:
-            text += page.extract_text() + "\n"
-        return text
-    elif ext == ".docx":
-        doc = docx.Document(file)
-        text = ""
-        for para in doc.paragraphs:
-            text += para.text + "\n"
-        return text
-    elif ext == ".txt":
-        return file.read().decode("utf-8")
-    else:
-        raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
-# Translation function with improved chunking and fixed tokenizer issue
-def translate_text(text, src_lang, tgt_lang, models):
     if src_lang == tgt_lang:
         return text
-    # Language codes for NLLB
-    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}
-    if src_lang not in lang_map or tgt_lang not in lang_map:
         return "Error: Unsupported language combination"
-    tgt_lang_code = lang_map[tgt_lang]
-    tokenizer, model = models["nllb"]
-    # Preprocess for idioms
-    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
-    # Improved chunking: Split by sentences while preserving context
-    chunks = []
-    current_chunk = ""
-    for sentence in re.split('([.!?।]+)', preprocessed_text):
-        if sentence.strip():
-            if len(current_chunk) + len(sentence) < 450:  # Leave room for tokenization
-                current_chunk += sentence
-            else:
-                if current_chunk:
-                    chunks.append(current_chunk)
-                current_chunk = sentence
-    if current_chunk:
-        chunks.append(current_chunk)
-    translated_text = ""
-    for chunk in chunks:
-        if chunk.strip():
-            # Add target language token to the beginning of the input
-            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
-            # Get the token ID for the target language
-            tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
-            translated = model.generate(
-                **inputs,
-                forced_bos_token_id=tgt_lang_id,  # Fixed: Using convert_tokens_to_ids instead of lang_code_to_id
-                max_length=512,
-                num_beams=5,
-                length_penalty=1.0,
-                no_repeat_ngram_size=3
-            )
-            translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
-            translated_text += translated_chunk + " "
-    return translated_text.strip()
-# Function to save text as a file
 def save_text_to_file(text, original_filename, prefix="translated"):
-    output_filename = f"{prefix}_{os.path.basename(original_filename)}.txt"
-    with open(output_filename, "w", encoding="utf-8") as f:
-        f.write(text)
-    return output_filename
-# Main processing function
 def process_document(file, source_lang, target_lang, models):
     try:
-        # Extract text from uploaded file
         text = extract_text(file)
-        # Translate the text
         translated_text = translate_text(text, source_lang, target_lang, models)
-        # Save the result
         if translated_text.startswith("Error:"):
             output_file = save_text_to_file(translated_text, file.name, prefix="error")
         else:
             output_file = save_text_to_file(translated_text, file.name)
         return output_file, translated_text
     except Exception as e:
         error_message = f"Error: {str(e)}"
         output_file = save_text_to_file(error_message, file.name, prefix="error")
         return output_file, error_message
-# Streamlit interface
 def main():
-    st.title("Document Translator (NLLB-200)")
-    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")
-    # Initialize models
     models = initialize_models()
-    # File uploader
-    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
-    # Language selection
-    col1, col2 = st.columns(2)
-    with col1:
-        source_lang = st.selectbox("Source Language", ["en", "hi", "mr"], index=0)
-    with col2:
-        target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)
-    if uploaded_file is not None and st.button("Translate"):
-        with st.spinner("Translating..."):
-            output_file, result_text = process_document(uploaded_file, source_lang, target_lang, models)
-            # Display result
-            st.text_area("Translated Text", result_text, height=300)
-            # Provide download button
-            with open(output_file, "rb") as file:
-                st.download_button(
-                    label="Download Translated Document",
-                    data=file,
-                    file_name=os.path.basename(output_file),
-                    mime="text/plain"
-                )
 if __name__ == "__main__":
-    main()

 import docx
 import os
 import re
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import torch
+# Replace pytesseract with easyocr
+import easyocr
+from PIL import Image
+import numpy as np
# Switch torch's multiprocessing to the 'spawn' start method when a CUDA
# device is available. force=True replaces a start method set earlier.
if torch.cuda.is_available():
    torch.multiprocessing.set_start_method('spawn', force=True)


def _current_or_new_event_loop():
    """Return the current asyncio event loop, creating and registering one if needed."""
    try:
        return asyncio.get_event_loop()
    except RuntimeError:
        # asyncio reported no usable loop here, so build one and install it.
        fresh_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(fresh_loop)
        return fresh_loop


# Module-level event loop handle.
loop = _current_or_new_event_loop()
@st.cache_resource
def _build_ocr_reader():
    """Create the EasyOCR reader (English only).

    Exceptions are allowed to propagate so that a failed construction is
    not stored in the Streamlit resource cache.
    """
    return easyocr.Reader(['en'])  # Initialize for English


def load_ocr_reader():
    """Return the shared EasyOCR reader, or ``None`` if it cannot be created.

    The successful reader is cached by ``_build_ocr_reader``. A failure is
    reported with ``st.error`` and returned as ``None`` without being cached,
    so the next call tries again instead of replaying the old failure.

    Returns:
        The ``easyocr.Reader`` instance, or ``None`` on error.
    """
    # NOTE(review): only 'en' is loaded although the app also offers Hindi and
    # Marathi as source languages -- confirm whether OCR of Devanagari is needed.
    try:
        return _build_ocr_reader()
    except Exception as e:
        st.error(f"Error loading OCR reader: {str(e)}")
        return None
def extract_text_from_image(image_file):
    """Run OCR on an uploaded image and return the recognised text.

    Args:
        image_file: A path or binary file-like object accepted by
            ``PIL.Image.open``.

    Returns:
        The recognised text fragments joined by newlines and stripped, or
        the sentence ``"No text was detected in the image."`` when OCR
        yields no results.

    Raises:
        Exception: If the OCR reader is unavailable, the image cannot be
            opened, or OCR fails. The original error is chained as the cause.
    """
    try:
        reader = load_ocr_reader()
        if reader is None:
            raise Exception("Failed to initialize OCR reader")

        # The context manager releases the image handle as soon as the pixel
        # data has been copied into the array.
        with Image.open(image_file) as image:
            # Normalise palette, alpha, CMYK and 1-bit images to plain RGB so
            # the array handed to OCR always has three 8-bit channels.
            image_np = np.array(image.convert("RGB"))

        results = reader.readtext(image_np)
        if not results:
            return "No text was detected in the image."

        # The recognised string is read from index 1 of each result entry.
        return "\n".join(result[1] for result in results).strip()
    except Exception as e:
        raise Exception(f"Error extracting text from image: {str(e)}") from e
def _read_pdf(file):
    """Return the text of all pages of a PDF, each page followed by a newline."""
    reader = PdfReader(file)
    # `or ""` keeps a page that yields no text from raising a TypeError.
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages).strip()


def _read_docx(file):
    """Return the paragraphs of a DOCX document joined by newlines."""
    document = docx.Document(file)
    return "\n".join(para.text for para in document.paragraphs).strip()


def _read_txt(file):
    """Return the decoded contents of a UTF-8 text file."""
    # utf-8-sig decodes plain UTF-8 unchanged and drops a leading BOM, which
    # str.strip() would otherwise leave in the text.
    return file.read().decode("utf-8-sig").strip()


def _read_image(file):
    """Return the OCR text of an image file."""
    # Looked up at call time so this table does not need the OCR helper at import.
    return extract_text_from_image(file)


# Extension -> (reader function, message prefix used when that reader fails).
_EXTRACTORS = {
    ".pdf": (_read_pdf, "Error reading PDF file"),
    ".docx": (_read_docx, "Error reading DOCX file"),
    ".txt": (_read_txt, "Error reading TXT file"),
    ".jpg": (_read_image, "Error processing image file"),
    ".jpeg": (_read_image, "Error processing image file"),
    ".png": (_read_image, "Error processing image file"),
}


def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, TXT or image file.

    The file type is chosen from the extension of ``file.name``
    (case-insensitive). The stream is rewound before reading, so calling
    this twice on the same upload returns the same text both times.

    Args:
        file: Binary file-like object with a ``name`` attribute.

    Returns:
        The extracted text with surrounding whitespace stripped.

    Raises:
        Exception: For unsupported extensions or any read/parse failure.
            The message starts with ``"Error extracting text from file: "``
            and the original error is chained as the cause.
    """
    try:
        ext = os.path.splitext(file.name)[1].lower()
        handler = _EXTRACTORS.get(ext)
        if handler is None:
            raise ValueError("Unsupported file format. Please upload PDF, DOCX, TXT, or image files (JPG, JPEG, PNG).")

        read, failure_label = handler
        try:
            # Rewind first: a previous read leaves the stream at EOF, which
            # would make the TXT branch return an empty string.
            if hasattr(file, "seek"):
                file.seek(0)
            return read(file)
        except Exception as e:
            raise Exception(f"{failure_label}: {str(e)}") from e
    except Exception as e:
        raise Exception(f"Error extracting text from file: {str(e)}") from e
@st.cache_resource
def _load_nllb():
    """Download/load the NLLB tokenizer and model; exceptions propagate uncached."""
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model


def load_translation_model():
    """Return the shared ``(tokenizer, model)`` pair for NLLB-200.

    The successfully loaded pair is cached by ``_load_nllb``. A failure is
    reported with ``st.error`` and returned as ``(None, None)`` without being
    cached, so a transient load error is retried on the next call.

    Returns:
        ``(tokenizer, model)`` on success, ``(None, None)`` on failure.
    """
    try:
        return _load_nllb()
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return None, None
 # Initialize model
 @st.cache_resource
 def initialize_models():
     tokenizer, model = load_translation_model()
+    if tokenizer is None or model is None:
+        st.error("Failed to initialize models")
+        return None
     return {"nllb": (tokenizer, model)}
 # Enhanced idiom mapping with more comprehensive translations
 def preprocess_idioms(text, src_lang, tgt_lang):
+    idiom_map = {}
     if src_lang == "en" and tgt_lang == "hi":
         idiom_map = {
+           "no piece of cake": "कोई आसान काम नहीं",
             "piece of cake": "बहुत आसान काम",
             "bite the bullet": "दांतों तले उंगली दबाना",
             "tackle it head-on": "सीधे मुकाबला करना",
             "proof of concept": "व्यवहार्यता का प्रमाण",
             "game changer": "खेल बदलने वाला"
         }
+    elif src_lang == "en" and tgt_lang == "mr":
+        idiom_map = {
+            "no piece of cake": "सोपं काम नाही",
+            "piece of cake": "अतिशय सोपं काम",
+            "bite the bullet": "कठीण निर्णय घेणे",
+            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
+            "fell into place": "सगळं व्यवस्थित झालं",
+            "see the light at the end of the tunnel": "अंधारातून उजेडाची किरण दिसणे",
+            "with a little perseverance": "थोड्या धीराने",
+            "break a leg": "खूप शुभेच्छा",
+            "hit the nail on the head": "अगदी बरोबर बोललात",
+            "once in a blue moon": "क्वचितच, कधीतरी",
+            "under the weather": "तब्येत ठीक नसणे",
+            "cost an arm and a leg": "खूप महाग",
+            "beating around the bush": "गोल गोल फिरवणे",
+            "call it a day": "दिवसाचं काम संपवणे",
+            "burn the midnight oil": "रात्रंदिवस मेहनत करणे",
+            "get the ball rolling": "सुरुवात करणे",
+            "pull yourself together": "स्वतःला सावरा",
+            "shoot yourself in the foot": "स्वतःचेच पाय स्वतः कापणे",
+            "take it with a grain of salt": "साशंक दृष्टीने पाहणे",
+            "the last straw": "सहनशक्तीची शेवटची मर्यादा",
+            "time flies": "वेळ पंख लावून उडतो",
+            "wrap your head around": "समजून घेण्याचा प्रयत्न करणे",
+            "cut corners": "कमी वेळात काम उरकणे",
+            "back to square one": "पुन्हा सुरुवातीला",
+            "blessing in disguise": "आशीर्वाद लपलेला",
+            "cry over spilled milk": "झालेल्या गोष्टीसाठी रडत बसणे",
+            "keep your chin up": "धीर धरा",
+            # Work-related idioms
+            "think outside the box": "वेगळ्या पद्धतीने विचार करणे",
+            "raise the bar": "पातळी उंचावणे",
+            "learning curve": "शिकण्याची प्रक्रिया",
+            "up and running": "सुरू आणि कार्यरत",
+            "back to the drawing board": "पुन्हा नव्याने योजना आखणे",
+            # Project-related phrases
+            "running into issues": "अडचणींना सामोरे जाणे",
+            "iron out the bugs": "त्रुटी दूर करणे",
+            "in the pipeline": "विचाराधीन",
+            "moving forward": "पुढे जाताना",
+            "touch base": "संपर्कात राहणे",
+            # Technical phrases
+            "user-friendly": "वापरकर्त्यास सोयीस्कर",
+            "cutting-edge": "अत्याधुनिक",
+            "state of the art": "सर्वोत्कृष्ट तंत्रज्ञान",
+            "proof of concept": "संकल्पनेची सिद्धता",
+            "game changer": "खेळ बदलणारी गोष्ट"
+        }
+    if idiom_map:
         sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
         pattern = '|'.join(map(re.escape, sorted_idioms))
         def replace_idiom(match):
             return idiom_map[match.group(0).lower()]
         text = re.sub(pattern, replace_idiom, text, flags=re.IGNORECASE)
     return text
# Lowercase language name -> (short code passed to preprocess_idioms, NLLB language token).
_LANGUAGE_CODES = {
    "english": ("en", "eng_Latn"),
    "hindi": ("hi", "hin_Deva"),
    "marathi": ("mr", "mar_Deva"),
}


def _chunk_sentences(text, max_chars=450):
    """Group sentence fragments of *text* into chunks shorter than *max_chars*."""
    chunks = []
    current_chunk = ""
    # The capturing group keeps the terminators (. ! ? and the danda) as pieces.
    for piece in re.split('([.!?।]+)', text):
        if not piece.strip():
            continue
        if len(current_chunk) + len(piece) < max_chars:
            current_chunk += piece
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


async def translate_text_async(text, src_lang, tgt_lang, models):
    """Translate *text* between English, Hindi and Marathi with NLLB.

    Args:
        text: Text to translate.
        src_lang: Source language name ("english", "hindi" or "marathi",
            any letter case).
        tgt_lang: Target language name, same vocabulary as ``src_lang``.
        models: Mapping whose ``"nllb"`` entry is a ``(tokenizer, model)`` pair.

    Returns:
        The translated text. ``text`` is returned unchanged when both
        language arguments are equal. Failures are returned as a string
        starting with ``"Error:"`` rather than raised.
    """
    if src_lang == tgt_lang:
        return text

    src_key = src_lang.lower()
    tgt_key = tgt_lang.lower()
    if src_key not in _LANGUAGE_CODES or tgt_key not in _LANGUAGE_CODES:
        return "Error: Unsupported language combination"

    try:
        # Explicit table instead of name[:2]: "marathi"[:2] is "ma", which never
        # matched the "mr" branch of the idiom preprocessor.
        src_short, src_nllb = _LANGUAGE_CODES[src_key]
        tgt_short, tgt_nllb = _LANGUAGE_CODES[tgt_key]

        preprocessed_text = preprocess_idioms(text, src_short, tgt_short)

        tokenizer, model = models["nllb"]
        # Tell the tokenizer which source-language token to emit. Without this
        # the input is tagged with the tokenizer's default language.
        # NOTE(review): the tokenizer object may be shared between sessions;
        # confirm concurrent translations with different sources are acceptable.
        tokenizer.src_lang = src_nllb
        # Loop-invariant, so resolved once rather than per chunk.
        tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_nllb)

        translated_chunks = []
        for chunk in _chunk_sentences(preprocessed_text):
            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
            translated = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3
            )
            translated_chunks.append(tokenizer.decode(translated[0], skip_special_tokens=True))

        return " ".join(translated_chunks).strip()
    except Exception as e:
        # Starts with "Error:" like the unsupported-language message, so a
        # single prefix check identifies every failure result.
        return f"Error: translation failed: {str(e)}"
def translate_text(text, src_lang, tgt_lang, models):
    """Synchronous wrapper around ``translate_text_async``.

    ``asyncio.run`` creates a fresh event loop, runs the coroutine to
    completion and closes the loop again. Unlike a manual
    ``new_event_loop`` / ``set_event_loop`` / ``close`` sequence, it does not
    leave a closed loop installed as the thread's current loop.

    Args:
        text: Text to translate.
        src_lang: Source language name.
        tgt_lang: Target language name.
        models: Model registry forwarded to ``translate_text_async``.

    Returns:
        Whatever ``translate_text_async`` returns (translated text or an
        error string).
    """
    return asyncio.run(translate_text_async(text, src_lang, tgt_lang, models))
 def save_text_to_file(text, original_filename, prefix="translated"):
+    try:
+        # Get the original file extension and base name
+        base_name = os.path.splitext(os.path.basename(original_filename))[0]
+        output_filename = f"{prefix}_{base_name}.txt"
+        # Save all translations as text files for simplicity and build speed
+        with open(output_filename, "w", encoding="utf-8") as f:
+            f.write(text)
+        return output_filename
+    except Exception as e:
+        st.error(f"Error saving file: {str(e)}")
+        return None
def process_document(file, source_lang, target_lang, models):
    """Extract, translate and save the text of an uploaded document.

    Args:
        file: Uploaded file-like object with a ``name`` attribute.
        source_lang: Source language name passed to ``translate_text``.
        target_lang: Target language name passed to ``translate_text``.
        models: Model registry passed to ``translate_text``.

    Returns:
        ``(output_file, text)``. On success ``text`` is the translation saved
        to ``output_file``. On failure ``text`` is an ``"Error..."`` message
        saved with the ``error`` prefix, and ``output_file`` may be ``None``
        if even that file could not be written.
    """
    try:
        text = extract_text(file)
        translated_text = translate_text(text, source_lang, target_lang, models)

        # The translator reports failures as strings. Both message shapes it
        # uses must be recognised, otherwise an error text is saved as if it
        # were a successful translation.
        failed = translated_text.startswith(("Error:", "Error during translation:"))
        prefix = "error" if failed else "translated"
        output_file = save_text_to_file(translated_text, file.name, prefix=prefix)

        if output_file is None:
            raise Exception("Failed to save output file")
        return output_file, translated_text
    except Exception as e:
        error_message = f"Error: {str(e)}"
        output_file = save_text_to_file(error_message, file.name, prefix="error")
        return output_file, error_message
# Languages offered in both tabs; the lowercase form is what the translator expects.
_LANGUAGE_CHOICES = ["English", "Hindi", "Marathi"]

# Download MIME type by output file extension.
_MIME_TYPES = {
    '.pdf': 'application/pdf',
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    '.txt': 'text/plain',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png'
}


def _language_pickers(key_prefix):
    """Render the source/target selectboxes side by side and return both choices."""
    col1, col2 = st.columns(2)
    with col1:
        source = st.selectbox(
            "Source Language",
            _LANGUAGE_CHOICES,
            index=0,
            key=f"{key_prefix}_src"
        )
    with col2:
        target = st.selectbox(
            "Target Language",
            _LANGUAGE_CHOICES,
            index=1,
            key=f"{key_prefix}_tgt"
        )
    return source, target


def _render_document_tab(models):
    """Render the upload form and, on request, translate the uploaded document."""
    st.subheader("Document Translation")
    st.write("Upload a document (PDF, DOCX, TXT, or Image) and select languages.")
    uploaded_file = st.file_uploader(
        "Upload Document",
        type=["pdf", "docx", "txt", "jpg", "jpeg", "png"],
        key="doc_uploader"
    )
    source_lang, target_lang = _language_pickers("doc")

    # The button is only rendered once a file has been uploaded.
    if uploaded_file is None or not st.button("Translate Document"):
        return

    try:
        with st.spinner("Translating..."):
            # Extract and show the input text.
            input_text = extract_text(uploaded_file)
            st.subheader("Input Text")
            st.text_area("Original Text", input_text, height=200)

            # process_document extracts the text again; rewind so that second
            # read does not start at the end of the upload.
            uploaded_file.seek(0)

            output_file, result_text = process_document(
                uploaded_file,
                source_lang.lower(),
                target_lang.lower(),
                models
            )
            st.subheader("Translated Text")
            st.text_area("Translation", result_text, height=200)

            if output_file and os.path.exists(output_file):
                ext = os.path.splitext(output_file)[1].lower()
                mime_type = _MIME_TYPES.get(ext, 'text/plain')
                with open(output_file, "rb") as file:
                    st.download_button(
                        label="Download Translated Document",
                        data=file,
                        file_name=os.path.basename(output_file),
                        mime=mime_type
                    )
            else:
                st.error("Failed to generate output file")
    except Exception as e:
        st.error(f"An error occurred during translation: {str(e)}")


def _render_text_tab(models):
    """Render the free-text form and, on request, translate the entered text."""
    st.subheader("Text Translation")
    st.write("Enter text directly for translation.")
    text_source_lang, text_target_lang = _language_pickers("text")

    input_text = st.text_area("Enter text to translate", height=150)
    # The button is only rendered once some text has been entered.
    if not input_text or not st.button("Translate Text"):
        return

    try:
        with st.spinner("Translating..."):
            translated_text = translate_text(
                input_text,
                text_source_lang.lower(),
                text_target_lang.lower(),
                models
            )
            st.text_area("Translation", translated_text, height=150)
            st.download_button(
                label="Download Translation",
                data=translated_text,
                file_name="translation.txt",
                mime="text/plain"
            )
    except Exception as e:
        st.error(f"An error occurred during translation: {str(e)}")


def main():
    """Build the Streamlit page: title, model loading, and the two translation tabs."""
    st.title("Document Translation Toolkit")

    models = initialize_models()
    if models is None:
        st.error("Failed to initialize translation models. Please try again.")
        return

    tab1, tab2 = st.tabs(["Document Translation", "Text Translation"])
    with tab1:
        _render_document_tab(models)
    with tab2:
        _render_text_tab(models)
 if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        st.error(f"Application error: {str(e)}")