Spaces:

gauravchand11
/

H

Sleeping

App Files Files Community

gauravchand11 committed on Apr 24

Commit

35445ba

verified ·

1 Parent(s): 8981936

Create app.py

Browse files

Files changed (1) hide show

app.py +572 -0

app.py ADDED Viewed

	@@ -0,0 +1,572 @@

+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import gradio as gr
+from PyPDF2 import PdfReader
+import docx
+import os
+import re
+import torch
+from datetime import datetime
+import pytz
+from io import BytesIO
+from docx import Document
+import tempfile
+# Load translation model
def load_translation_model():
    """Load the NLLB-200 distilled checkpoint and its tokenizer.

    Returns:
        A ``(tokenizer, model)`` tuple, or ``(None, None)`` when loading
        fails for any reason (the error is printed, not raised).
    """
    checkpoint = "facebook/nllb-200-distilled-600M"
    try:
        nllb_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        nllb_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    except Exception as exc:
        # Best-effort startup: report the problem and let callers see None.
        print(f"Error loading model: {exc}")
        return None, None
    return nllb_tokenizer, nllb_model
+# Initialize models
# Load once at import time so every request reuses the same objects.
tokenizer, model = load_translation_model()
# The loader signals failure with (None, None); test for that explicitly
# instead of relying on the truthiness of the loaded objects.
MODELS = (
    {"nllb": (tokenizer, model)}
    if tokenizer is not None and model is not None
    else None
)
+# Extract text from documents
def extract_text(file):
    """Return the plain text of a PDF, DOCX or TXT document.

    Args:
        file: Either a filesystem path (``str``) or a file-like object that
            exposes ``name`` and ``read()``.

    Returns:
        The extracted text with leading/trailing whitespace stripped.

    Raises:
        Exception: If the extension is not ``.pdf``/``.docx``/``.txt`` or the
            document cannot be read. The message always starts with
            ``"Error extracting text: "``.
    """
    try:
        is_path = isinstance(file, str)
        ext = os.path.splitext(file if is_path else file.name)[1].lower()

        if ext == ".pdf":
            try:
                if is_path:
                    with open(file, "rb") as f:
                        raw = f.read()
                else:
                    raw = file.read()
                # Keep the buffer open while pages are read; ``with`` closes it
                # on every exit path.
                with BytesIO(raw) as buffer:
                    reader = PdfReader(buffer)
                    # Defensive: treat a None result as an empty page.
                    pages = [
                        (page.extract_text() or "") + "\n"
                        for page in reader.pages
                    ]
                return "".join(pages).strip()
            except Exception as e:
                raise Exception(f"PDF extraction error: {str(e)}") from e

        if ext == ".docx":
            # docx.Document is handed the path or the file object as-is.
            doc = docx.Document(file)
            return "".join(para.text + "\n" for para in doc.paragraphs).strip()

        if ext == ".txt":
            # "utf-8-sig" decodes plain UTF-8 identically but drops a leading
            # BOM, which str.strip() would otherwise leave in the text.
            if is_path:
                with open(file, "r", encoding="utf-8-sig") as f:
                    return f.read().strip()
            content = file.read()
            if isinstance(content, bytes):
                content = content.decode("utf-8-sig")
            return content.strip()

        raise ValueError("Unsupported file format")
    except Exception as e:
        raise Exception(f"Error extracting text: {str(e)}") from e
+# Preprocess idioms
# Idiom tables keyed by (source, target) language code. They are built once
# at import time instead of on every call.
_IDIOM_MAPS = {
    ("en", "hi"): {
        "no piece of cake": "कोई आसान काम नहीं",
        "piece of cake": "बहुत आसान काम",
        "bite the bullet": "हिम्मत से कठिन परिस्थिति का सामना करना",
        "tackle it head-on": "सीधे मुकाबला करना",
        "fell into place": "सब कुछ ठीक हो गया",
        "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
        "with a little perseverance": "थोड़े से धैर्य से",
        # Additional common idioms
        "break a leg": "बहुत बहुत शुभकामनाएं",
        "hit the nail on the head": "बिल्कुल सही बात कहना",
        "once in a blue moon": "बहुत कम, कभी-कभार",
        "under the weather": "तबीयत ठीक नहीं",
        "cost an arm and a leg": "बहुत महंगा",
        "beating around the bush": "इधर-उधर की बात करना",
        "call it a day": "काम समाप्त करना",
        "burn the midnight oil": "रात-रात भर जागकर काम करना",
        "get the ball rolling": "शुरुआत करना",
        "pull yourself together": "खुद को संभालो",
        "shoot yourself in the foot": "अपना ही नुकसान करना",
        "take it with a grain of salt": "संदेह से लेना",
        "the last straw": "सहनशीलता की आखिरी सीमा",
        "time flies": "समय पंख लगाकर उड़ता है",
        "wrap your head around": "समझने की कोशिश करना",
        "cut corners": "काम में छोटा रास्ता अपनाना",
        "back to square one": "फिर से शुरू से",
        "blessing in disguise": "छिपा हुआ वरदान",
        "cry over spilled milk": "बीती बात पर पछताना",
        "keep your chin up": "हिम्मत रखना",
        # Work-related idioms
        "think outside the box": "नए तरीके से सोचना",
        "raise the bar": "मानक ऊंचा करना",
        "learning curve": "सीखने की प्रक्रिया",
        "up and running": "चालू और कार्यरत",
        "back to the drawing board": "फिर से योजना बनाना",
        # Project-related phrases
        "running into issues": "समस्याओं का सामना करना",
        "iron out the bugs": "खामियां दूर करना",
        "in the pipeline": "विचाराधीन",
        "moving forward": "आगे बढ़ते हुए",
        "touch base": "संपर्क में रहना",
        # Technical phrases
        "user-friendly": "उपयोगकर्ता के अनुकूल",
        "cutting-edge": "अत्याधुनिक",
        "state of the art": "अत्याधुनिक तकनीक",
        "proof of concept": "व्यवहार्यता का प्रमाण",
        "game changer": "खेल बदलने वाला",
        "a blessing in disguise": "छुपा हुआ वरदान",
        "actions speak louder than words": "कर्म शब्दों से अधिक प्रभावी होते हैं",
        "add fuel to the fire": "आग में घी डालना",
        "barking up the wrong tree": "गलत दिशा में प्रयास करना",
        "best of both worlds": "दोनों चीजों का लाभ",
        "cut to the chase": "मुद्दे पर आना",
        "don't judge a book by its cover": "किसी को उसके रूप से मत आंकिए",
        "easy does it": "धीरे-धीरे करो",
        "every cloud has a silver lining": "हर मुश्किल में आशा की किरण होती है",
        "get a taste of your own medicine": "जैसा किया वैसा भुगतो",
        "hit the sack": "सोने जाना",
        "let the cat out of the bag": "राज़ खोल देना",
        "miss the boat": "मौका चूक जाना",
        "no pain no gain": "बिना मेहनत के कुछ नहीं मिलता",
        "on the ball": "सचेत और सतर्क",
        "pull the plug": "काम रोक देना",
        "spill the beans": "राज़ खोलना",
        "the ball is in your court": "अब निर्णय तुम्हारे हाथ में है",
        "through thick and thin": "हर परिस्थिति में",
        "you can't have your cake and eat it too": "दोनों फायदे एक साथ नहीं हो सकते",
    },
    ("en", "mr"): {
        "no piece of cake": "सोपं काम नाही",
        "piece of cake": "अतिशय सोपं काम",
        "bite the bullet": "कठीण निर्णय घेणे",
        "tackle it head-on": "समस्येला थेट सामोरे जाणे",
        "fell into place": "सगळं व्यवस्थित झालं",
        "see the light at the end of the tunnel": "अंधारातून उजेडाची किरण दिसणे",
        "with a little perseverance": "थोड्या धीराने",
        "break a leg": "खूप शुभेच्छा",
        "hit the nail on the head": "अगदी बरोबर बोललात",
        "once in a blue moon": "क्वचितच, कधीतरी",
        "under the weather": "तब्येत ठीक नसणे",
        "cost an arm and a leg": "खूप महाग",
        "beating around the bush": "गोल गोल फिरवणे",
        "call it a day": "दिवसाचं काम संपवणे",
        "burn the midnight oil": "रात्रंदिवस मेहनत करणे",
        "get the ball rolling": "सुरुवात करणे",
        "pull yourself together": "स्वतःला सावरा",
        "shoot yourself in the foot": "स्वतःचेच पाय स्वतः कापणे",
        "take it with a grain of salt": "साशंक दृष्टीने पाहणे",
        "the last straw": "सहनशक्तीची शेवटची मर्यादा",
        "time flies": "वेळ पंख लावून उडतो",
        "wrap your head around": "समजून घेण्याचा प्रयत्न करणे",
        "cut corners": "कमी वेळात काम उरकणे",
        "back to square one": "पुन्हा सुरुवातीला",
        "blessing in disguise": "आशीर्वाद लपलेला",
        "cry over spilled milk": "झालेल्या गोष्टीसाठी रडत बसणे",
        "keep your chin up": "धीर धरा",
        # Work-related idioms
        "think outside the box": "वेगळ्या पद्धतीने विचार करणे",
        "raise the bar": "पातळी उंचावणे",
        "learning curve": "शिकण्याची प्रक्रिया",
        "up and running": "सुरू आणि कार्यरत",
        "back to the drawing board": "पुन्हा नव्याने योजना आखणे",
        # Project-related phrases
        "running into issues": "अडचणींना सामोरे जाणे",
        "iron out the bugs": "त्रुटी दूर करणे",
        "in the pipeline": "विचाराधीन",
        "moving forward": "पुढे जाताना",
        "touch base": "संपर्कात राहणे",
        # Technical phrases
        "user-friendly": "वापरकर्त्यास सोयीस्कर",
        "cutting-edge": "अत्याधुनिक",
        "state of the art": "सर्वोत्कृष्ट तंत्रज्ञान",
        "proof of concept": "संकल्पनेची सिद्धता",
        "game changer": "खेळ बदलणारी गोष्ट",
        "a blessing in disguise": "छुपलेले वरदान",
        "actions speak louder than words": "कृती शब्दांपेक्षा प्रभावी असतात",
        "add fuel to the fire": "आग ला फुंकर घालणे",
        "barking up the wrong tree": "चुकीच्या गोष्टीकडे लक्ष देणे",
        "best of both worlds": "दोनही गोष्टींचा लाभ",
        "cut to the chase": "थेट मुद्द्यावर येणे",
        "don't judge a book by its cover": "फक्त बाह्यरूप पाहून अंदाज लावू नका",
        "easy does it": "हळूहळू करा",
        "every cloud has a silver lining": "प्रत्येक संकटात संधी असते",
        "get a taste of your own medicine": "जसे कराल तसे भराल",
        "hit the sack": "झोपायला जाणे",
        "let the cat out of the bag": "गुपित उघड करणे",
        "miss the boat": "संधी गमावणे",
        "no pain no gain": "कष्टाशिवाय यश नाही",
        "on the ball": "सतर्क असणे",
        "pull the plug": "काम बंद करणे",
        "spill the beans": "गुपित सांगणे",
        "the ball is in your court": "निर्णय तुमच्या हाती आहे",
        "through thick and thin": "संकटसमयीही साथ देणे",
        "you can't have your cake and eat it too": "सगळं काही मिळवता येत नाही",
    },
}

# Compiled idiom regexes, filled lazily per language pair.
_IDIOM_PATTERNS = {}


def _idiom_regex(pair):
    """Return the cached regex that matches any idiom of language *pair*."""
    regex = _IDIOM_PATTERNS.get(pair)
    if regex is None:
        # Longest idioms first so "no piece of cake" wins over "piece of cake".
        idioms = sorted(_IDIOM_MAPS[pair], key=len, reverse=True)
        # Accept a typographic apostrophe wherever a table key has a
        # straight one ("don't" vs "don\u2019t").
        alternatives = "|".join(
            re.escape(idiom).replace("\\'", "'").replace("'", "['\u2019]")
            for idiom in idioms
        )
        # The look-arounds require the idiom to stand on word boundaries, so
        # "on the ball" is not replaced inside "on the ballot".
        regex = re.compile(
            r"(?<!\w)(?:" + alternatives + r")(?!\w)",
            flags=re.IGNORECASE,
        )
        _IDIOM_PATTERNS[pair] = regex
    return regex


def preprocess_idioms(text, src_lang, tgt_lang):
    """Replace known English idioms with a fixed target-language rendering.

    Args:
        text: The text to scan.
        src_lang: Two-letter source language code; only ``"en"`` has tables.
        tgt_lang: Two-letter target language code (``"hi"`` or ``"mr"``).

    Returns:
        ``text`` with every whole-word, case-insensitive idiom match replaced.
        The input is returned unchanged when it is empty or when no idiom
        table exists for the language pair.
    """
    pair = (src_lang, tgt_lang)
    idiom_map = _IDIOM_MAPS.get(pair)
    if not idiom_map or not text:
        return text

    def _replacement(match):
        # Normalise the match back to the form used for the table keys.
        key = match.group(0).lower().replace("\u2019", "'")
        # Fall back to the matched text if case-insensitive matching accepted
        # something that lower() does not map onto a key.
        return idiom_map.get(key, match.group(0))

    return _idiom_regex(pair).sub(_replacement, text)
+# Translation function
# UI language name -> NLLB-200 language code.
_NLLB_LANG_CODES = {"English": "eng_Latn", "Hindi": "hin_Deva", "Marathi": "mar_Deva"}
# UI language name -> two-letter code expected by preprocess_idioms. An
# explicit table is needed because "Marathi"[:2] is "ma", not "mr".
_IDIOM_LANG_CODES = {"English": "en", "Hindi": "hi", "Marathi": "mr"}
# Chunks are kept below this many characters before being tokenized.
_MAX_CHUNK_CHARS = 450


def _split_into_chunks(text, max_chars=_MAX_CHUNK_CHARS):
    """Group the sentences of *text* into chunks shorter than *max_chars*."""
    # The capturing group makes re.split return
    # [sentence, terminator, sentence, terminator, ..., sentence].
    pieces = re.split(r"([.!?।]+)", text)
    chunks = []
    current = ""
    for body, terminator in zip(pieces[::2], pieces[1::2] + [""]):
        # Keep each sentence attached to its terminator so a chunk never
        # starts with stray punctuation.
        sentence = body + terminator
        if not sentence.strip():
            continue
        if current and len(current) + len(sentence) >= max_chars:
            chunks.append(current)
            current = sentence
        else:
            # A single over-long sentence still becomes one chunk; the
            # tokenizer call below truncates it at its max_length.
            current += sentence
    if current:
        chunks.append(current)
    return chunks


def translate_text(text, src_lang, tgt_lang):
    """Translate *text* between English, Hindi and Marathi with NLLB-200.

    Args:
        text: The text to translate.
        src_lang: Source language name ("English", "Hindi" or "Marathi").
        tgt_lang: Target language name, from the same set.

    Returns:
        The translated text. ``text`` is returned unchanged when both
        languages are equal. Problems are reported as a string starting with
        ``"Error"`` rather than raised.
    """
    if src_lang == tgt_lang:
        return text
    src_lang_code = _NLLB_LANG_CODES.get(src_lang)
    tgt_lang_code = _NLLB_LANG_CODES.get(tgt_lang)
    if not src_lang_code or not tgt_lang_code:
        return "Error: Unsupported language combination"
    try:
        if not MODELS:
            return "Error during translation: translation model is not loaded"
        # First apply idiom preprocessing
        preprocessed_text = preprocess_idioms(
            text,
            _IDIOM_LANG_CODES.get(src_lang, ""),
            _IDIOM_LANG_CODES.get(tgt_lang, ""),
        )
        tokenizer, model = MODELS["nllb"]
        # Tell the tokenizer which language the input is in; otherwise it
        # keeps whatever source language it was last configured with.
        # NOTE(review): this mutates the shared tokenizer — confirm requests
        # are not translated concurrently.
        tokenizer.src_lang = src_lang_code
        tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
        translated_chunks = []
        for chunk in _split_into_chunks(preprocessed_text):
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            translated = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3
            )
            translated_chunks.append(
                tokenizer.decode(translated[0], skip_special_tokens=True)
            )
        return " ".join(translated_chunks).strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"
+# Document translation function
_SUPPORTED_DOCUMENT_EXTENSIONS = ('.pdf', '.docx', '.txt')


def _new_output_path(filename):
    """Return a path for *filename* inside a fresh, private temp directory."""
    # A per-call directory keeps two uploads with the same file name from
    # overwriting each other's result.
    return os.path.join(tempfile.mkdtemp(prefix="translation_"), filename)


def _translate_preserving_lines(text, source_lang, target_lang):
    """Translate *text* line by line, keeping blank lines and paragraph gaps."""
    translated_paragraphs = []
    for paragraph in text.split('\n\n'):
        translated_lines = [
            translate_text(line, source_lang, target_lang) if line.strip() else ''
            for line in paragraph.split('\n')
        ]
        translated_paragraphs.append('\n'.join(translated_lines))
    return '\n\n'.join(translated_paragraphs)


def _read_pdf_pages(file):
    """Return the extracted text of each page of *file* (path or readable)."""
    if isinstance(file, str):
        with open(file, 'rb') as f:
            raw = f.read()
    else:
        raw = file.read()
    with BytesIO(raw) as buffer:
        # Defensive: treat a None extraction result as an empty page.
        return [page.extract_text() or "" for page in PdfReader(buffer).pages]


def _copy_run_formatting(source_run, target_run):
    """Copy bold/italic/underline, font size, font name and RGB colour."""
    target_run.bold = source_run.bold
    target_run.italic = source_run.italic
    target_run.underline = source_run.underline
    if source_run.font.size:
        target_run.font.size = source_run.font.size
    if source_run.font.name:
        target_run.font.name = source_run.font.name
    color = source_run.font.color
    if color and color.rgb:
        target_run.font.color.rgb = color.rgb


def _translate_docx(file, source_lang, target_lang, output_path):
    """Write a translated copy of a DOCX to *output_path*; return its text."""
    doc = Document(file)
    new_doc = Document()
    # Make the source document's style names available in the new document.
    for style in doc.styles:
        if style.name not in new_doc.styles:
            # The third argument of add_style is the "builtin" flag.
            new_doc.styles.add_style(style.name, style.type, style.builtin)
    for para in doc.paragraphs:
        if not para.text.strip():
            # Preserve empty paragraphs
            new_doc.add_paragraph()
            continue
        new_para = new_doc.add_paragraph(style=para.style.name if para.style else None)
        text_runs = [run for run in para.runs if run.text.strip()]
        if not text_runs:
            # The paragraph has text but no run carries it (presumably content
            # such as hyperlinks); keep it as a single unformatted run.
            new_para.add_run(translate_text(para.text, source_lang, target_lang))
            continue
        # Runs are adjacent fragments of one text and carry their own
        # whitespace, so concatenate them directly; inserting separators
        # would break words that are split across runs.
        combined_text = "".join(run.text for run in para.runs)
        translated_text = translate_text(combined_text, source_lang, target_lang)
        # Spread the translated words over the original runs so that each
        # run's formatting is applied to roughly the same share of the text.
        words = translated_text.split()
        # Ceiling division: with fewer words than runs the leading runs
        # still receive text instead of staying empty.
        per_run = -(-len(words) // len(text_runs))
        start = 0
        last_index = len(text_runs) - 1
        for index, source_run in enumerate(text_runs):
            # Last run gets all remaining text
            end = len(words) if index == last_index else min(start + per_run, len(words))
            chunk_text = " ".join(words[start:end])
            start = end
            if not chunk_text:
                continue
            # Separate consecutive runs with a space, but add none at the end.
            separator = " " if start < len(words) else ""
            _copy_run_formatting(source_run, new_para.add_run(chunk_text + separator))
    new_doc.save(output_path)
    return "\n".join(para.text for para in new_doc.paragraphs if para.text.strip())


def translate_document(file, source_lang, target_lang):
    """Translate an uploaded PDF, DOCX or TXT file.

    Args:
        file: A filesystem path (``str``) or a file object exposing ``name``;
            ``None`` means nothing was uploaded.
        source_lang: Source language name as understood by ``translate_text``.
        target_lang: Target language name as understood by ``translate_text``.

    Returns:
        ``(text, output_path)``: the translated text and the path of the file
        written for download. PDF input produces a ``.txt`` file; DOCX and
        TXT keep their extension. On any problem the first element is a
        message and the second is ``None``; nothing is raised.
    """
    try:
        if file is None:
            return "Please upload a file", None
        # Accept both a path string and an object exposing .name, as the
        # PDF handling and extract_text already do.
        file_path = file if isinstance(file, str) else file.name
        base_name, input_ext = os.path.splitext(os.path.basename(file_path))
        input_ext = input_ext.lower()
        if input_ext not in _SUPPORTED_DOCUMENT_EXTENSIONS:
            return "Error: Error extracting text: Unsupported file format", None
        # PDF layout cannot be rebuilt here, so PDF input is saved as .txt.
        output_ext = '.txt' if input_ext == '.pdf' else input_ext
        output_filename = f"translated_{base_name}{output_ext}"

        if input_ext == '.pdf':
            try:
                translated_pages = [
                    _translate_preserving_lines(page_text, source_lang, target_lang)
                    for page_text in _read_pdf_pages(file)
                    if page_text.strip()
                ]
                if not translated_pages:
                    return "Could not extract text from the document", None
                final_text = '\n\n'.join(translated_pages)
                output_path = _new_output_path(output_filename)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(final_text)
                return final_text, output_path
            except Exception as e:
                raise Exception(f"PDF processing error: {str(e)}") from e

        if input_ext == '.docx':
            output_path = _new_output_path(output_filename)
            text_content = _translate_docx(file, source_lang, target_lang, output_path)
            return text_content, output_path

        # Remaining case: '.txt'
        input_text = extract_text(file)
        if not input_text:
            return "Could not extract text from the document", None
        final_text = _translate_preserving_lines(input_text, source_lang, target_lang)
        output_path = _new_output_path(output_filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)
        return final_text, output_path
    except Exception as e:
        return f"Error: {str(e)}", None
+# Direct text translation function
def translate_text_direct(text, source_lang, target_lang):
    """Translate text typed into the UI.

    Args:
        text: The text entered by the user; may be ``None`` or empty.
        source_lang: Source language name passed on to ``translate_text``.
        target_lang: Target language name passed on to ``translate_text``.

    Returns:
        The result of ``translate_text``, or a prompt to enter text when the
        input is missing or contains only whitespace.
    """
    # Whitespace-only input has nothing to translate either.
    if not text or not text.strip():
        return "Please enter some text"
    return translate_text(text, source_lang, target_lang)
+# Get current time in UTC
def get_current_time():
    """Return the current UTC time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    # The standard library already provides a UTC tzinfo, so no third-party
    # timezone lookup is needed for this fixed offset.
    from datetime import timezone

    return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
+# Create Gradio interface
def create_interface():
    """Build the Gradio app: a header plus document and text translation tabs.

    Returns:
        The ``gr.Blocks`` object holding the header and the tabbed interface.
    """
    language_names = ("English", "Hindi", "Marathi")

    def language_dropdown(label, default):
        # Every dropdown receives its own list of choices.
        return gr.Dropdown(choices=list(language_names), label=label, value=default)

    # The timestamp is evaluated once, when the interface is built.
    banner_text = f"""
        # Document Translation Toolkit
        *Current Date and Time (UTC):* {get_current_time()}
        *Current User's Login:* gauravchand
        """
    header = gr.Markdown(banner_text)

    # Tab 1: upload a document, get the translation as text and as a file.
    document_inputs = [
        gr.File(label="Upload Document (PDF, DOCX, or TXT)"),
        language_dropdown("Source Language", "English"),
        language_dropdown("Target Language", "Hindi"),
    ]
    document_outputs = [
        gr.Textbox(label="Translation", lines=10),
        gr.File(label="Download Translation"),
    ]
    doc_interface = gr.Interface(
        fn=translate_document,
        inputs=document_inputs,
        outputs=document_outputs,
        title="Document Translation",
        description="Upload a document to translate",
    )

    # Tab 2: translate text typed directly into a box.
    text_inputs = [
        gr.Textbox(lines=5, label="Enter text to translate"),
        language_dropdown("Source Language", "English"),
        language_dropdown("Target Language", "Hindi"),
    ]
    text_interface = gr.Interface(
        fn=translate_text_direct,
        inputs=text_inputs,
        outputs=gr.Textbox(label="Translation", lines=5),
        title="Text Translation",
        description="Enter text directly to translate",
    )

    # Place the header above the two tabs.
    tabs = [doc_interface, text_interface]
    tab_titles = ["Document Translation", "Text Translation"]
    with gr.Blocks() as demo:
        header.render()
        gr.TabbedInterface(tabs, tab_names=tab_titles)
    return demo
+# Launch the app
if __name__ == "__main__":
    # Build the UI only when executed as a script, then start the server.
    demo = create_interface()
    demo.launch()