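# The imports below assume these packages are installed (on a Hugging Face
# Space they would typically be pinned in requirements.txt):
#   streamlit, transformers, torch, PyPDF2, python-docx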
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
from PyPDF2 import PdfReader
import docx
import os
import re
# Load NLLB model and tokenizer (cached so the 600M-parameter checkpoint is
# downloaded and loaded once, not on every Streamlit rerun)
@st.cache_resource
def load_translation_model():
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model

# Initialize the model registry
def initialize_models():
    tokenizer, model = load_translation_model()
    return {"nllb": (tokenizer, model)}
# Idiom preprocessing: swap English idioms for natural Hindi equivalents
# before translation, since the model would otherwise render them literally
def preprocess_idioms(text, src_lang, tgt_lang):
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            # Basic phrases
            "no piece of cake": "कोई आसान काम नहीं",
            "piece of cake": "बहुत आसान काम",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "सीधे मुकाबला करना",
            "fell into place": "सब कुछ ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
            "with a little perseverance": "थोड़े से धैर्य से",
            # Additional common idioms
            "break a leg": "बहुत बहुत शुभकामनाएं",
            "hit the nail on the head": "बिल्कुल सही बात कहना",
            "once in a blue moon": "बहुत कम, कभी-कभार",
            "under the weather": "तबीयत ठीक नहीं",
            "cost an arm and a leg": "बहुत महंगा",
            "beating around the bush": "इधर-उधर की बात करना",
            "call it a day": "काम समाप्त करना",
            "burn the midnight oil": "रात-रात भर जागकर काम करना",
            "get the ball rolling": "शुरुआत करना",
            "pull yourself together": "खुद को संभालो",
            "shoot yourself in the foot": "अपना ही नुकसान करना",
            "take it with a grain of salt": "संदेह से लेना",
            "the last straw": "सहनशीलता की आखिरी सीमा",
            "time flies": "समय पंख लगाकर उड़ता है",
            "wrap your head around": "समझने की कोशिश करना",
            "cut corners": "काम में छोटा रास्ता अपनाना",
            "back to square one": "फिर से शुरू से",
            "blessing in disguise": "छिपा हुआ वरदान",
            "cry over spilled milk": "बीती बात पर पछताना",
            "keep your chin up": "हिम्मत रखना",
            # Work-related idioms
            "think outside the box": "नए तरीके से सोचना",
            "raise the bar": "मानक ऊंचा करना",
            "learning curve": "सीखने की प्रक्रिया",
            "up and running": "चालू और कार्यरत",
            "back to the drawing board": "फिर से योजना बनाना",
            # Project-related phrases
            "running into issues": "समस्याओं का सामना करना",
            "iron out the bugs": "खामियां दूर करना",
            "in the pipeline": "विचाराधीन",
            "moving forward": "आगे बढ़ते हुए",
            "touch base": "संपर्क में रहना",
            # Technical phrases
            "user-friendly": "उपयोगकर्ता के अनुकूल",
            "cutting-edge": "अत्याधुनिक",
            "state of the art": "अत्याधुनिक तकनीक",
            "proof of concept": "व्यवहार्यता का प्रमाण",
            "game changer": "खेल बदलने वाला"
        }
        # Sort idioms by length (longest first) so overlapping phrases such as
        # "no piece of cake" match before their substrings ("piece of cake")
        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
        # Build a single alternation pattern covering all idioms
        pattern = '|'.join(map(re.escape, sorted_idioms))
        def replace_idiom(match):
            return idiom_map[match.group(0).lower()]
        # Replace all idioms in one pass, case-insensitively
        text = re.sub(pattern, replace_idiom, text, flags=re.IGNORECASE)
    return text
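# Illustrative example (hypothetical input): because longer idioms are matched
# first, "no piece of cake" wins over its substring "piece of cake":
#   preprocess_idioms("This was no piece of cake.", "en", "hi")
#   -> "This was कोई आसान काम नहीं."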
# Extract text from the supported file types
def extract_text(file):
    ext = os.path.splitext(file.name)[1].lower()
    if ext == ".pdf":
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            text += (page.extract_text() or "") + "\n"
        return text
    elif ext == ".docx":
        doc = docx.Document(file)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    elif ext == ".txt":
        return file.read().decode("utf-8")
    else:
        raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
# Translation function with sentence-aware chunking
def translate_text(text, src_lang, tgt_lang, models):
    if src_lang == tgt_lang:
        return text
    # NLLB language codes
    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}
    if src_lang not in lang_map or tgt_lang not in lang_map:
        return "Error: Unsupported language combination"
    tgt_lang_code = lang_map[tgt_lang]
    tokenizer, model = models["nllb"]
    # Tell the tokenizer which language it is encoding so it prepends the
    # correct source-language token (the original code never used src_lang)
    tokenizer.src_lang = lang_map[src_lang]
    # Preprocess for idioms
    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
    # Chunking: split on sentence boundaries (including the Devanagari
    # danda "।") while keeping each chunk comfortably under the model limit
    chunks = []
    current_chunk = ""
    for sentence in re.split('([.!?।]+)', preprocessed_text):
        if sentence.strip():
            if len(current_chunk) + len(sentence) < 450:  # leave room for tokenization
                current_chunk += sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    translated_text = ""
    for chunk in chunks:
        if chunk.strip():
            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
            # Look up the vocabulary ID of the target-language token;
            # convert_tokens_to_ids works because NLLB language codes are
            # ordinary vocabulary tokens (fixes the earlier lang_code_to_id
            # tokenizer issue)
            tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
            translated = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,  # force decoding into the target language
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3
            )
            translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
            translated_text += translated_chunk + " "
    return translated_text.strip()
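# Illustrative usage (the exact output wording depends on the model):
#   models = initialize_models()
#   translate_text("The project was no piece of cake.", "en", "hi", models)
#   -> a Hindi sentence built around "कोई आसान काम नहीं"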
# Function to save text as a file
def save_text_to_file(text, original_filename, prefix="translated"):
    output_filename = f"{prefix}_{os.path.basename(original_filename)}.txt"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(text)
    return output_filename
# Main processing function
def process_document(file, source_lang, target_lang, models):
    try:
        # Extract text from uploaded file
        text = extract_text(file)
        # Translate the text
        translated_text = translate_text(text, source_lang, target_lang, models)
        # Save the result
        if translated_text.startswith("Error:"):
            output_file = save_text_to_file(translated_text, file.name, prefix="error")
        else:
            output_file = save_text_to_file(translated_text, file.name)
        return output_file, translated_text
    except Exception as e:
        error_message = f"Error: {str(e)}"
        output_file = save_text_to_file(error_message, file.name, prefix="error")
        return output_file, error_message
# Streamlit interface
def main():
    st.title("Document Translator (NLLB-200)")
    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")
    # Initialize models
    models = initialize_models()
    # File uploader
    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
    # Language selection
    col1, col2 = st.columns(2)
    with col1:
        source_lang = st.selectbox("Source Language", ["en", "hi", "mr"], index=0)
    with col2:
        target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)
    if uploaded_file is not None and st.button("Translate"):
        with st.spinner("Translating..."):
            output_file, result_text = process_document(uploaded_file, source_lang, target_lang, models)
        # Display result
        st.text_area("Translated Text", result_text, height=300)
        # Provide download button
        with open(output_file, "rb") as file:
            st.download_button(
                label="Download Translated Document",
                data=file,
                file_name=os.path.basename(output_file),
                mime="text/plain"
            )
if __name__ == "__main__":
    main()
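# To try this locally (assuming the file is saved as app.py):
#   streamlit run app.py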