Spaces: Build error
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
from PyPDF2 import PdfReader
import docx
import os
import re
from datetime import datetime

# Page config
st.set_page_config(
    page_title="Document Translator (NLLB-200)",
    page_icon="📄",
    layout="wide"
)

# Load NLLB model and tokenizer
def load_translation_model():
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model

# Initialize model
def initialize_models():
    tokenizer, model = load_translation_model()
    return {"nllb": (tokenizer, model)}

def split_long_sentence(sentence, max_length=200):
    """Split long sentences into smaller chunks at appropriate break points."""
    if len(sentence) <= max_length:
        return [sentence]
    chunks = []
    current_chunk = ""
    words = sentence.split()
    for word in words:
        if len(current_chunk) + len(word) + 1 <= max_length:
            current_chunk += (" " + word if current_chunk else word)
        else:
            chunks.append(current_chunk)
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
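# Illustrative example (kept as comments so nothing runs at import time):
# with max_length=20,
#     split_long_sentence("the quick brown fox jumps over the lazy dog", 20)
# returns
#     ["the quick brown fox", "jumps over the lazy", "dog"]
# i.e. the split is purely length-based on whitespace; it does not consider
# punctuation or other break points.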

def preprocess_idioms(text, src_lang, tgt_lang):
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            # Common English-Hindi idiom mappings
            "no piece of cake": "कोई आसान काम नहीं",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "इसे पूरे मन से हाथ में लेना",
            "fell into place": "ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखाई देना",
            "with a little perseverance": "थोड़े से धैर्य से",
            "break the ice": "बातचीत की शुरुआत करना",
            "on cloud nine": "सातवें आसमान पर होना",
            "once in a blue moon": "कभी-कभार",
            "beating around the bush": "इधर-उधर की बात करना",
            "burning the midnight oil": "रात-रात भर जागकर काम करना",
            "calm before the storm": "तूफान से पहले की शांति",
            "cost an arm and a leg": "बहुत महंगा होना",
            "blessing in disguise": "छुपा हुआ वरदान",
            "kill two birds with one stone": "एक पंथ दो काज",
            "a piece of cake": "बहुत आसान काम",
            "under the weather": "तबीयत ठीक न होना",
            "pull yourself together": "खुद को संभालो",
            "rise and shine": "जल्दी उठो और तैयार हो जाओ",
            "time flies": "समय पंख लगाकर उड़ता है",
            "actions speak louder than words": "कथनी से करनी बड़ी",
            "all ears": "पूरा ध्यान से सुन रहा हूं",
            "back to square one": "वापस शुरुआत में",
            "better late than never": "देर आये दुरुस्त आये",
            "cry over spilled milk": "बीती बात पर पछताना",
            "down to earth": "सरल स्वभाव का",
            "every cloud has a silver lining": "हर मुसीबत में कोई न कोई अच्छाई छिपी होती है",
            "food for thought": "सोचने वाली बात",
            "give someone the benefit of the doubt": "शक का फायदा देना",
            "hit the nail on the head": "सटीक बात कहना",
            "in hot water": "मुसीबत में होना"
        }
        # Sort idioms by length (longest first) to handle overlapping phrases
        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
        # Replace idioms with their translations
        for idiom in sorted_idioms:
            pattern = r'\b' + re.escape(idiom) + r'\b'
            text = re.sub(pattern, idiom_map[idiom], text, flags=re.IGNORECASE)
    elif src_lang == "en" and tgt_lang == "mr":
        idiom_map = {
            "no piece of cake": "सोपं काम नाही",
            "bite the bullet": "कठीण निर्णय घेणे",
            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
            "fell into place": "सगळं व्यवस्थित झालं",
            "see the light at the end of the tunnel": "अंधारातून प्रकाशाकडे जाणे",
            "with a little perseverance": "थोड्या धीराने",
            "break the ice": "संभाषणाची सुरुवात करणे",
            "on cloud nine": "आनंदात असणे",
            "once in a blue moon": "क्वचितच",
            "burning the midnight oil": "रात्रंदिवस मेहनत करणे",
            "better late than never": "उशीर का होईना पण योग्य वेळी"
        }
        for idiom, translation in idiom_map.items():
            pattern = r'\b' + re.escape(idiom) + r'\b'
            text = re.sub(pattern, translation, text, flags=re.IGNORECASE)
    return text
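# Illustrative example (comments only): for an English-to-Hindi request, an
# input such as
#     "The project was no piece of cake, but we decided to bite the bullet."
# has "no piece of cake" and "bite the bullet" replaced with their Hindi
# entries from idiom_map before the text reaches the NLLB model. Sorting the
# keys longest-first means a longer phrase is substituted before any shorter
# phrase it contains.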

def extract_text(file):
    ext = os.path.splitext(file.name)[1].lower()
    if ext == ".pdf":
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n"
        return text
    elif ext == ".docx":
        doc = docx.Document(file)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    elif ext == ".txt":
        return file.read().decode("utf-8")
    else:
        raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")

def translate_text(text, src_lang, tgt_lang, models):
    if src_lang == tgt_lang:
        return text
    # Language codes for NLLB
    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}
    if src_lang not in lang_map or tgt_lang not in lang_map:
        return "Error: Unsupported language combination"
    tgt_lang_code = lang_map[tgt_lang]
    tokenizer, model = models["nllb"]
    # Preprocess for idioms
    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
    # Split text into smaller chunks (sentences)
    sentences = re.split(r'(?<=[.!?])\s+', preprocessed_text)
    translated_text = []
    for sentence in sentences:
        if sentence.strip():
            chunks = split_long_sentence(sentence, max_length=200)
            for chunk in chunks:
                try:
                    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
                    translated = model.generate(
                        **inputs