try / app.py
gauravchand11's picture
Update app.py
77a6efe verified
raw
history blame
7.64 kB
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
from PyPDF2 import PdfReader
import docx
import os
import re
from datetime import datetime
# Streamlit page configuration — must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Document Translator (NLLB-200)",  # browser tab title
    page_icon="📄",
    layout="wide"  # use the full browser width for side-by-side content
)
# Load NLLB model and tokenizer
@st.cache_resource
def load_translation_model():
    """Download (once) and return the NLLB-200 tokenizer/model pair.

    Cached with st.cache_resource so the ~600M-parameter checkpoint is
    loaded a single time per server process, not on every rerun.
    """
    checkpoint = "facebook/nllb-200-distilled-600M"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )
# Initialize model
@st.cache_resource
def initialize_models():
    """Build the model registry: maps a model key to its (tokenizer, model) pair."""
    return {"nllb": load_translation_model()}
def split_long_sentence(sentence, max_length=200):
    """Split a long sentence into chunks of at most ``max_length`` characters.

    Chunks break only at whitespace; a single word longer than
    ``max_length`` is kept intact as its own chunk rather than cut mid-word.

    Args:
        sentence: The sentence to split.
        max_length: Maximum chunk length in characters.

    Returns:
        A list of non-empty chunks (``[sentence]`` if it already fits).
    """
    if len(sentence) <= max_length:
        return [sentence]
    chunks = []
    current_chunk = ""
    for word in sentence.split():
        # +1 accounts for the joining space, but only when the chunk is non-empty.
        needed = len(word) + (1 if current_chunk else 0)
        if len(current_chunk) + needed <= max_length:
            current_chunk = f"{current_chunk} {word}" if current_chunk else word
        else:
            # Bug fix: the original appended current_chunk unconditionally,
            # emitting an empty-string chunk when the very first word alone
            # exceeded max_length.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
def preprocess_idioms(text, src_lang, tgt_lang):
    """Replace known English idioms with natural target-language equivalents.

    Literal machine translation mangles idioms, so known phrases are
    substituted before the text reaches the NLLB model. Only en->hi and
    en->mr have idiom tables; any other pair returns the text unchanged.

    Args:
        text: Source text to preprocess.
        src_lang: Source language code ("en", "hi", "mr").
        tgt_lang: Target language code ("en", "hi", "mr").

    Returns:
        The text with matched idioms replaced (case-insensitively, on word
        boundaries).
    """
    idiom_map = {}
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            # Common English-Hindi idiom mappings
            "no piece of cake": "कोई आसान काम नहीं",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "इसे पूरे मन से हाथ में लेना",
            "fell into place": "ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखाई देना",
            "with a little perseverance": "थोड़े से धैर्य से",
            "break the ice": "बातचीत की शुरुआत करना",
            "on cloud nine": "सातवें आसमान पर होना",
            "once in a blue moon": "कभी-कभार",
            "beating around the bush": "इधर-उधर की बात करना",
            "burning the midnight oil": "रात-रात भर जागकर काम करना",
            "calm before the storm": "तूफान से पहले की शांति",
            "cost an arm and a leg": "बहुत महंगा होना",
            "blessing in disguise": "छुपा हुआ वरदान",
            "kill two birds with one stone": "एक पंथ दो काज",
            "a piece of cake": "बहुत आसान काम",
            "under the weather": "तबीयत ठीक न होना",
            "pull yourself together": "खुद को संभालो",
            "rise and shine": "जल्दी उठो और तैयार हो जाओ",
            "time flies": "समय पंख लगाकर उड़ता है",
            "actions speak louder than words": "कथनी से करनी बड़ी",
            "all ears": "पूरा ध्यान से सुन रहा हूं",
            "back to square one": "वापस शुरुआत में",
            "better late than never": "देर आये दुरुस्त आये",
            "cry over spilled milk": "बीती बात पर पछताना",
            "down to earth": "सरल स्वभाव का",
            "every cloud has a silver lining": "हर मुसीबत में कोई न कोई अच्छाई छिपी होती है",
            "food for thought": "सोचने वाली बात",
            "give someone the benefit of the doubt": "शक का फायदा देना",
            "hit the nail on the head": "सटीक बात कहना",
            "in hot water": "मुसीबत में होना"
        }
    elif src_lang == "en" and tgt_lang == "mr":
        idiom_map = {
            "no piece of cake": "सोपं काम नाही",
            "bite the bullet": "कठीण निर्णय घेणे",
            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
            "fell into place": "सगळं व्यवस्थित झालं",
            "see the light at the end of the tunnel": "अंधारातून प्रकाशाकडे जाणे",
            "with a little perseverance": "थोड्या धीराने",
            "break the ice": "संभाषणाची सुरुवात करणे",
            "on cloud nine": "आनंदात असणे",
            "once in a blue moon": "क्वचितच",
            "burning the midnight oil": "रात्रंदिवस मेहनत करणे",
            "better late than never": "उशीर का होईना पण योग्य वेळी"
        }
    # Consistency fix: the original sorted longest-first only for en->hi;
    # both branches now replace longer idioms before their shorter overlaps
    # (e.g. "no piece of cake" before "a piece of cake").
    for idiom in sorted(idiom_map, key=len, reverse=True):
        pattern = r'\b' + re.escape(idiom) + r'\b'
        text = re.sub(pattern, idiom_map[idiom], text, flags=re.IGNORECASE)
    return text
def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, or TXT file.

    Args:
        file: A file-like object exposing ``.name`` (e.g. a Streamlit
            ``UploadedFile``); for .txt it must also expose ``.read()``
            returning UTF-8 bytes.

    Returns:
        The extracted text. PDF pages and DOCX paragraphs are each
        followed by a newline.

    Raises:
        ValueError: If the file extension is not .pdf, .docx, or .txt.
    """
    ext = os.path.splitext(file.name)[1].lower()
    if ext == ".pdf":
        reader = PdfReader(file)
        # Bug fix: extract_text() can return None for image-only pages,
        # which crashed the original `None + "\n"` concatenation.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
    elif ext == ".docx":
        doc = docx.Document(file)
        return "".join(para.text + "\n" for para in doc.paragraphs)
    elif ext == ".txt":
        return file.read().decode("utf-8")
    else:
        raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
def translate_text(text, src_lang, tgt_lang, models):
if src_lang == tgt_lang:
return text
# Language codes for NLLB
lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}
if src_lang not in lang_map or tgt_lang not in lang_map:
return "Error: Unsupported language combination"
tgt_lang_code = lang_map[tgt_lang]
tokenizer, model = models["nllb"]
# Preprocess for idioms
preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
# Split text into smaller chunks (sentences)
sentences = re.split(r'(?<=[.!?])\s+', preprocessed_text)
translated_text = []
for sentence in sentences:
if sentence.strip():
chunks = split_long_sentence(sentence, max_length=200)
for chunk in chunks:
try:
inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
translated = model.generate(
**inputs