Spaces:
Build error
Update app.py
app.py CHANGED
@@ -1,96 +1,96 @@
-from transformers import (
-    AutoTokenizer,
-    AutoModelForSeq2SeqLM,
-    BertTokenizer,
-    BertModel,
-    AutoModelForTokenClassification
-)
 import streamlit as st
 from PyPDF2 import PdfReader
 import docx
 import os
 import re
-import torch
-import numpy as np
-from datetime import datetime, timezone

-# Load …
 @st.cache_resource
-def …
-…

-        # …
-        …
-        nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

-        # …
-        …
-        grammar_model = AutoModelForTokenClassification.from_pretrained(
-            'bert-base-cased',
-            num_labels=3  # Assuming 3 labels: keep, delete, replace
-        )

-        …
-    except Exception as e:
-        st.error(f"Error loading models: {str(e)}")
-        raise e
-
-def get_bert_embeddings(text, models):
-    """Get contextual embeddings from BERT"""
-    tokenizer, model = models["context"]
-
-    # Split text into smaller chunks
-    max_length = 512
-    chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
-    contextual_embeddings = []
-
-    for chunk in chunks:
-        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
-        with torch.no_grad():
-            outputs = model(**inputs)
-            embeddings = outputs.last_hidden_state.mean(dim=1)
-            contextual_embeddings.append(embeddings)
-
-    # Combine embeddings from all chunks
-    combined_embedding = torch.cat(contextual_embeddings, dim=0).mean(dim=0)
-    return combined_embedding
-
-def apply_grammar_correction(text, models):
-    """Basic grammar correction using BERT"""
-    tokenizer, model = models["grammar"]
-
-    sentences = re.split('([.!?।]+)', text)
-    corrected_sentences = []
-
-    for sentence in sentences:
-        if sentence.strip():
-            # Basic tokenization and prediction
-            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
-            with torch.no_grad():
-                outputs = model(**inputs)
-                predictions = torch.argmax(outputs.logits, dim=2)
-
-            tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
-            corrected_tokens = []
-
-            for token, pred in zip(tokens, predictions[0]):
-                if pred == 0 or token in ['[CLS]', '[SEP]', '[PAD]']:
-                    if token not in ['[CLS]', '[SEP]', '[PAD]']:
-                        corrected_tokens.append(token)
-
-            corrected_text = tokenizer.convert_tokens_to_string(corrected_tokens)
-            if corrected_text.strip():
-                corrected_sentences.append(corrected_text)

-    return …

 def extract_text(file):
     ext = os.path.splitext(file.name)[1].lower()

@@ -114,10 +114,12 @@ def extract_text(file):
     else:
         raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")

 def translate_text(text, src_lang, tgt_lang, models):
     if src_lang == tgt_lang:
         return text

     lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}

     if src_lang not in lang_map or tgt_lang not in lang_map:

@@ -126,81 +128,61 @@ def translate_text(text, src_lang, tgt_lang, models):
     tgt_lang_code = lang_map[tgt_lang]
     tokenizer, model = models["nllb"]

-    …
-    context_embedding = get_bert_embeddings(text, models)
-
-    # Split into chunks for translation
-    chunks = []
-    current_chunk = ""
-
-    for sentence in re.split('([.!?।]+)', text):
-        if sentence.strip():
-            if len(current_chunk) + len(sentence) < 450:
-                current_chunk += sentence
-            else:
-                if current_chunk:
-                    chunks.append(current_chunk)
-                current_chunk = sentence
-
-    if current_chunk:
-        chunks.append(current_chunk)
-
-    translated_text = ""
-
-    for chunk in chunks:
-        if chunk.strip():
-            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
-
-            # Use context embedding to modify attention
-            attention_mask = inputs['attention_mask'].float()
-            context_weight = 0.1 * torch.sigmoid(context_embedding.mean())
-            attention_mask = attention_mask * (1 + context_weight)
-
-            # Get target language token ID
-            tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
-
-            with torch.no_grad():
-                translated = model.generate(
-                    input_ids=inputs['input_ids'],
-                    attention_mask=attention_mask,
-                    forced_bos_token_id=tgt_lang_id,
-                    max_length=512,
-                    num_beams=5,
-                    length_penalty=1.0,
-                    no_repeat_ngram_size=3,
-                    do_sample=True,
-                    temperature=0.7
-                )
-            translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
-            translated_text += translated_chunk + " "
-
-    # Apply basic grammar correction
-    corrected_text = apply_grammar_correction(translated_text.strip(), models)
-
-    return corrected_text

-    …

 def save_text_to_file(text, original_filename, prefix="translated"):
-    …
-    output_filename = f"{prefix}_{timestamp}_{os.path.basename(original_filename)}.txt"
     with open(output_filename, "w", encoding="utf-8") as f:
         f.write(text)
     return output_filename

 def process_document(file, source_lang, target_lang, models):
     try:
         # Extract text from uploaded file
         text = extract_text(file)

-        # Add debugging information
-        st.sidebar.write("Processing document...")
-        st.sidebar.write(f"Source language: {source_lang}")
-        st.sidebar.write(f"Target language: {target_lang}")
-
         # Translate the text
         translated_text = translate_text(text, source_lang, target_lang, models)

@@ -211,56 +193,44 @@ def process_document(file, source_lang, target_lang, models):
         output_file = save_text_to_file(translated_text, file.name)

         return output_file, translated_text
-
     except Exception as e:
         error_message = f"Error: {str(e)}"
-        st.error(error_message)
         output_file = save_text_to_file(error_message, file.name, prefix="error")
         return output_file, error_message

 def main():
-    st.title("…
-    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages.")
-
-    # …
-    …
-    with …
-    …
-            with open(output_file, "rb") as file:
-                st.download_button(
-                    label="Download Translated Document",
-                    data=file,
-                    file_name=os.path.basename(output_file),
-                    mime="text/plain"
-                )
-
-    except Exception as e:
-        st.error(f"Application error: {str(e)}")
-        st.warning("Please try refreshing the page or contact support.")

-if …
     main()
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import streamlit as st
 from PyPDF2 import PdfReader
 import docx
 import os
 import re

+# Load NLLB model and tokenizer
 @st.cache_resource
+def load_translation_model():
+    model_name = "facebook/nllb-200-distilled-600M"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+    return tokenizer, model
+
+# Initialize model
+@st.cache_resource
+def initialize_models():
+    tokenizer, model = load_translation_model()
+    return {"nllb": (tokenizer, model)}
+
+# Enhanced idiom mapping with more comprehensive translations
+def preprocess_idioms(text, src_lang, tgt_lang):
+    if src_lang == "en" and tgt_lang == "hi":
+        idiom_map = {
+            # Basic phrases
+            "no piece of cake": "कोई आसान काम नहीं",
+            "piece of cake": "बहुत आसान काम",
+            "bite the bullet": "दांतों तले उंगली दबाना",
+            "tackle it head-on": "सीधे मुकाबला करना",
+            "fell into place": "सब कुछ ठीक हो गया",
+            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
+            "with a little perseverance": "थोड़े से धैर्य से",
+
+            # Additional common idioms
+            "break a leg": "बहुत बहुत शुभकामनाएं",
+            "hit the nail on the head": "बिल्कुल सही बात कहना",
+            "once in a blue moon": "बहुत कम, कभी-कभार",
+            "under the weather": "तबीयत ठीक नहीं",
+            "cost an arm and a leg": "बहुत महंगा",
+            "beating around the bush": "इधर-उधर की बात करना",
+            "call it a day": "काम समाप्त करना",
+            "burn the midnight oil": "रात-रात भर जागकर काम करना",
+            "get the ball rolling": "शुरुआत करना",
+            "pull yourself together": "खुद को संभालो",
+            "shoot yourself in the foot": "अपना ही नुकसान करना",
+            "take it with a grain of salt": "संदेह से लेना",
+            "the last straw": "सहनशीलता की आखिरी सीमा",
+            "time flies": "समय पंख लगाकर उड़ता है",
+            "wrap your head around": "समझने की कोशिश करना",
+            "cut corners": "काम में छोटा रास्ता अपनाना",
+            "back to square one": "फिर से शुरू से",
+            "blessing in disguise": "छिपा हुआ वरदान",
+            "cry over spilled milk": "बीती बात पर पछताना",
+            "keep your chin up": "हिम्मत रखना",
+
+            # Work-related idioms
+            "think outside the box": "नए तरीके से सोचना",
+            "raise the bar": "मानक ऊंचा करना",
+            "learning curve": "सीखने की प्रक्रिया",
+            "up and running": "चालू और कार्यरत",
+            "back to the drawing board": "फिर से योजना बनाना",
+
+            # Project-related phrases
+            "running into issues": "समस्याओं का सामना करना",
+            "iron out the bugs": "खामियां दूर करना",
+            "in the pipeline": "विचाराधीन",
+            "moving forward": "आगे बढ़ते हुए",
+            "touch base": "संपर्क में रहना",
+
+            # Technical phrases
+            "user-friendly": "उपयोगकर्ता के अनुकूल",
+            "cutting-edge": "अत्याधुनिक",
+            "state of the art": "अत्याधुनिक तकनीक",
+            "proof of concept": "व्यवहार्यता का प्रमाण",
+            "game changer": "खेल बदलने वाला"
+        }

+        # Sort idioms by length (longest first) to handle overlapping phrases
+        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)

+        # Create a single regex pattern for all idioms
+        pattern = '|'.join(map(re.escape, sorted_idioms))

+        def replace_idiom(match):
+            return idiom_map[match.group(0).lower()]
+
+        # Replace all idioms in one pass, case-insensitive
+        text = re.sub(pattern, replace_idiom, text, flags=re.IGNORECASE)

+    return text

+# Function to extract text from different file types
 def extract_text(file):
     ext = os.path.splitext(file.name)[1].lower()

…
     else:
         raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")

+# Translation function with improved chunking and fixed tokenizer issue
 def translate_text(text, src_lang, tgt_lang, models):
     if src_lang == tgt_lang:
         return text

+    # Language codes for NLLB
     lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}

     if src_lang not in lang_map or tgt_lang not in lang_map:
…
     tgt_lang_code = lang_map[tgt_lang]
     tokenizer, model = models["nllb"]

+    # Preprocess for idioms
+    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)

+    # Improved chunking: Split by sentences while preserving context
+    chunks = []
+    current_chunk = ""
+
+    for sentence in re.split('([.!?।]+)', preprocessed_text):
+        if sentence.strip():
+            if len(current_chunk) + len(sentence) < 450:  # Leave room for tokenization
+                current_chunk += sentence
+            else:
+                if current_chunk:
+                    chunks.append(current_chunk)
+                current_chunk = sentence
+
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    translated_text = ""
+
+    for chunk in chunks:
+        if chunk.strip():
+            # Add target language token to the beginning of the input
+            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
+
+            # Get the token ID for the target language
+            tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
+
+            translated = model.generate(
+                **inputs,
+                forced_bos_token_id=tgt_lang_id,  # Fixed: Using convert_tokens_to_ids instead of lang_code_to_id
+                max_length=512,
+                num_beams=5,
+                length_penalty=1.0,
+                no_repeat_ngram_size=3
+            )
+            translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
+            translated_text += translated_chunk + " "
+
+    return translated_text.strip()

+# Function to save text as a file
 def save_text_to_file(text, original_filename, prefix="translated"):
+    output_filename = f"{prefix}_{os.path.basename(original_filename)}.txt"
     with open(output_filename, "w", encoding="utf-8") as f:
         f.write(text)
     return output_filename

+# Main processing function
 def process_document(file, source_lang, target_lang, models):
     try:
         # Extract text from uploaded file
         text = extract_text(file)

         # Translate the text
         translated_text = translate_text(text, source_lang, target_lang, models)

…
         output_file = save_text_to_file(translated_text, file.name)

         return output_file, translated_text
     except Exception as e:
         error_message = f"Error: {str(e)}"
         output_file = save_text_to_file(error_message, file.name, prefix="error")
         return output_file, error_message

+# Streamlit interface
 def main():
+    st.title("Document Translator (NLLB-200)")
+    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")
+
+    # Initialize models
+    models = initialize_models()
+
+    # File uploader
+    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
+
+    # Language selection
+    col1, col2 = st.columns(2)
+    with col1:
+        source_lang = st.selectbox("Source Language", ["en", "hi", "mr"], index=0)
+    with col2:
+        target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)
+
+    if uploaded_file is not None and st.button("Translate"):
+        with st.spinner("Translating..."):
+            output_file, result_text = process_document(uploaded_file, source_lang, target_lang, models)
+
+            # Display result
+            st.text_area("Translated Text", result_text, height=300)
+
+            # Provide download button
+            with open(output_file, "rb") as file:
+                st.download_button(
+                    label="Download Translated Document",
+                    data=file,
+                    file_name=os.path.basename(output_file),
+                    mime="text/plain"
+                )

+if __name__ == "__main__":
     main()
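
For context on the "Fixed: Using convert_tokens_to_ids instead of lang_code_to_id" comment in the diff, here is a minimal standalone sketch of the same NLLB call pattern. Only the model name and the convert_tokens_to_ids / forced_bos_token_id usage come from the commit; the sample sentence and variable names are illustrative assumptions.

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# NLLB language codes ("eng_Latn", "hin_Deva", "mar_Deva") are ordinary tokens in the
# tokenizer vocabulary, so their IDs can be looked up directly with convert_tokens_to_ids,
# which is what the commit uses in place of the older lang_code_to_id lookup.
text = "Getting the project up and running was no piece of cake."  # sample input (illustrative)
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
tgt_lang_id = tokenizer.convert_tokens_to_ids("hin_Deva")

translated = model.generate(
    **inputs,
    forced_bos_token_id=tgt_lang_id,  # force the decoder to start generating in Hindi (Devanagari)
    max_length=512,
    num_beams=5,
)
print(tokenizer.decode(translated[0], skip_special_tokens=True))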