Spaces: Build error
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
from PyPDF2 import PdfReader
import docx
import os
import re
from datetime import datetime

# Page config
st.set_page_config(
    page_title="Document Translator (NLLB-200)",
    page_icon="📄",
    layout="wide"
)

# Load NLLB model and tokenizer
def load_translation_model():
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model

# Initialize model
def initialize_models():
    tokenizer, model = load_translation_model()
    return {"nllb": (tokenizer, model)}

def split_long_sentence(sentence, max_length=200):
    """Split long sentences into smaller chunks at appropriate break points."""
    if len(sentence) <= max_length:
        return [sentence]
    chunks = []
    current_chunk = ""
    words = sentence.split()
    for word in words:
        if len(current_chunk) + len(word) + 1 <= max_length:
            current_chunk += (" " + word if current_chunk else word)
        else:
            chunks.append(current_chunk)
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
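# Illustrative example (kept as comments so nothing runs at import time):
# with max_length=20,
#     split_long_sentence("the quick brown fox jumps over the lazy dog", 20)
# returns
#     ["the quick brown fox", "jumps over the lazy", "dog"]
# i.e. the split is purely length-based on whitespace; it does not consider
# punctuation or other break points.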

def preprocess_idioms(text, src_lang, tgt_lang):
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            # Common English-Hindi idiom mappings
            "no piece of cake": "कोई आसान काम नहीं",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "इसे पूरे मन से हाथ में लेना",
            "fell into place": "ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखाई देना",
            "with a little perseverance": "थोड़े से धैर्य से",
            "break the ice": "बातचीत की शुरुआत करना",
            "on cloud nine": "सातवें आसमान पर होना",
            "once in a blue moon": "कभी-कभार",
            "beating around the bush": "इधर-उधर की बात करना",
            "burning the midnight oil": "रात-रात भर जागकर काम करना",
            "calm before the storm": "तूफान से पहले की शांति",
            "cost an arm and a leg": "बहुत महंगा होना",
            "blessing in disguise": "छुपा हुआ वरदान",
            "kill two birds with one stone": "एक पंथ दो काज",
            "a piece of cake": "बहुत आसान काम",
            "under the weather": "तबीयत ठीक न होना",
            "pull yourself together": "खुद को संभालो",
            "rise and shine": "जल्दी उठो और तैयार हो जाओ",
            "time flies": "समय पंख लगाकर उड़ता है",
            "actions speak louder than words": "कथनी से करनी बड़ी",
            "all ears": "पूरा ध्यान से सुन रहा हूं",
            "back to square one": "वापस शुरुआत में",
            "better late than never": "देर आये दुरुस्त आये",
            "cry over spilled milk": "बीती बात पर पछताना",
            "down to earth": "सरल स्वभाव का",
            "every cloud has a silver lining": "हर मुसीबत में कोई न कोई अच्छाई छिपी होती है",
            "food for thought": "सोचने वाली बात",
            "give someone the benefit of the doubt": "शक का फायदा देना",
            "hit the nail on the head": "सटीक बात कहना",
            "in hot water": "मुसीबत में होना"
        }
        # Sort idioms by length (longest first) to handle overlapping phrases
        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
        # Replace idioms with their translations
        for idiom in sorted_idioms:
            pattern = r'\b' + re.escape(idiom) + r'\b'
            text = re.sub(pattern, idiom_map[idiom], text, flags=re.IGNORECASE)
    elif src_lang == "en" and tgt_lang == "mr":
        idiom_map = {
            "no piece of cake": "सोपं काम नाही",
            "bite the bullet": "कठीण निर्णय घेणे",
            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
            "fell into place": "सगळं व्यवस्थित झालं",
            "see the light at the end of the tunnel": "अंधारातून प्रकाशाकडे जाणे",
            "with a little perseverance": "थोड्या धीराने",
            "break the ice": "संभाषणाची सुरुवात करणे",
            "on cloud nine": "आनंदात असणे",
            "once in a blue moon": "क्वचितच",
            "burning the midnight oil": "रात्रंदिवस मेहनत करणे",
            "better late than never": "उशीर का होईना पण योग्य वेळी"
        }
        for idiom, translation in idiom_map.items():
            pattern = r'\b' + re.escape(idiom) + r'\b'
            text = re.sub(pattern, translation, text, flags=re.IGNORECASE)
    return text
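# Illustrative example (comments only): for an English-to-Hindi request, an
# input such as
#     "The project was no piece of cake, but we decided to bite the bullet."
# has "no piece of cake" and "bite the bullet" replaced with their Hindi
# entries from idiom_map before the text reaches the NLLB model. Sorting the
# keys longest-first means a longer phrase is substituted before any shorter
# phrase it contains.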

def extract_text(file):
    ext = os.path.splitext(file.name)[1].lower()
    if ext == ".pdf":
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n"
        return text
    elif ext == ".docx":
        doc = docx.Document(file)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    elif ext == ".txt":
        return file.read().decode("utf-8")
    else:
        raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")

def translate_text(text, src_lang, tgt_lang, models):
    if src_lang == tgt_lang:
        return text
    # Language codes for NLLB
    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}
    if src_lang not in lang_map or tgt_lang not in lang_map:
        return "Error: Unsupported language combination"
    tgt_lang_code = lang_map[tgt_lang]
    tokenizer, model = models["nllb"]
    # Preprocess for idioms
    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
    # Split text into smaller chunks (sentences)
    sentences = re.split(r'(?<=[.!?])\s+', preprocessed_text)
    translated_text = []
    for sentence in sentences:
        if sentence.strip():
            chunks = split_long_sentence(sentence, max_length=200)
            for chunk in chunks:
                try:
                    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
                    translated = model.generate(
                        **inputs