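# The imports below assume these packages are installed (on a Hugging Face
# Space they would typically be pinned in requirements.txt):
#   streamlit, transformers, torch, PyPDF2, python-docx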
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st
from PyPDF2 import PdfReader
import docx
import os
import re
# Load NLLB model and tokenizer (cached so the 600M-parameter checkpoint is
# downloaded and loaded once, not on every Streamlit rerun)
@st.cache_resource
def load_translation_model():
    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model

# Initialize the model registry
def initialize_models():
    tokenizer, model = load_translation_model()
    return {"nllb": (tokenizer, model)}
# Idiom preprocessing: swap English idioms for natural Hindi equivalents
# before translation, since the model would otherwise render them literally
def preprocess_idioms(text, src_lang, tgt_lang):
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            # Basic phrases
            "no piece of cake": "कोई आसान काम नहीं",
            "piece of cake": "बहुत आसान काम",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "सीधे मुकाबला करना",
            "fell into place": "सब कुछ ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
            "with a little perseverance": "थोड़े से धैर्य से",
            # Additional common idioms
            "break a leg": "बहुत बहुत शुभकामनाएं",
            "hit the nail on the head": "बिल्कुल सही बात कहना",
            "once in a blue moon": "बहुत कम, कभी-कभार",
            "under the weather": "तबीयत ठीक नहीं",
            "cost an arm and a leg": "बहुत महंगा",
            "beating around the bush": "इधर-उधर की बात करना",
            "call it a day": "काम समाप्त करना",
            "burn the midnight oil": "रात-रात भर जागकर काम करना",
            "get the ball rolling": "शुरुआत करना",
            "pull yourself together": "खुद को संभालो",
            "shoot yourself in the foot": "अपना ही नुकसान करना",
            "take it with a grain of salt": "संदेह से लेना",
            "the last straw": "सहनशीलता की आखिरी सीमा",
            "time flies": "समय पंख लगाकर उड़ता है",
            "wrap your head around": "समझने की कोशिश करना",
            "cut corners": "काम में छोटा रास्ता अपनाना",
            "back to square one": "फिर से शुरू से",
            "blessing in disguise": "छिपा हुआ वरदान",
            "cry over spilled milk": "बीती बात पर पछताना",
            "keep your chin up": "हिम्मत रखना",
            # Work-related idioms
            "think outside the box": "नए तरीके से सोचना",
            "raise the bar": "मानक ऊंचा करना",
            "learning curve": "सीखने की प्रक्रिया",
            "up and running": "चालू और कार्यरत",
            "back to the drawing board": "फिर से योजना बनाना",
            # Project-related phrases
            "running into issues": "समस्याओं का सामना करना",
            "iron out the bugs": "खामियां दूर करना",
            "in the pipeline": "विचाराधीन",
            "moving forward": "आगे बढ़ते हुए",
            "touch base": "संपर्क में रहना",
            # Technical phrases
            "user-friendly": "उपयोगकर्ता के अनुकूल",
            "cutting-edge": "अत्याधुनिक",
            "state of the art": "अत्याधुनिक तकनीक",
            "proof of concept": "व्यवहार्यता का प्रमाण",
            "game changer": "खेल बदलने वाला"
        }
        # Sort idioms by length (longest first) so overlapping phrases such as
        # "no piece of cake" match before their substrings ("piece of cake")
        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
        # Build a single alternation pattern covering all idioms
        pattern = '|'.join(map(re.escape, sorted_idioms))
        def replace_idiom(match):
            return idiom_map[match.group(0).lower()]
        # Replace all idioms in one pass, case-insensitively
        text = re.sub(pattern, replace_idiom, text, flags=re.IGNORECASE)
    return text
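# Illustrative example (hypothetical input): because longer idioms are matched
# first, "no piece of cake" wins over its substring "piece of cake":
#   preprocess_idioms("This was no piece of cake.", "en", "hi")
#   -> "This was कोई आसान काम नहीं."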
# Extract text from the supported file types
def extract_text(file):
    ext = os.path.splitext(file.name)[1].lower()
    if ext == ".pdf":
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for image-only pages
            text += (page.extract_text() or "") + "\n"
        return text
    elif ext == ".docx":
        doc = docx.Document(file)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    elif ext == ".txt":
        return file.read().decode("utf-8")
    else:
        raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
# Translation function with sentence-aware chunking
def translate_text(text, src_lang, tgt_lang, models):
    if src_lang == tgt_lang:
        return text
    # NLLB language codes
    lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}
    if src_lang not in lang_map or tgt_lang not in lang_map:
        return "Error: Unsupported language combination"
    tgt_lang_code = lang_map[tgt_lang]
    tokenizer, model = models["nllb"]
    # Tell the tokenizer which language it is encoding so it prepends the
    # correct source-language token (the original code never used src_lang)
    tokenizer.src_lang = lang_map[src_lang]
    # Preprocess for idioms
    preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
    # Chunking: split on sentence boundaries (including the Devanagari
    # danda "।") while keeping each chunk comfortably under the model limit
    chunks = []
    current_chunk = ""
    for sentence in re.split('([.!?।]+)', preprocessed_text):
        if sentence.strip():
            if len(current_chunk) + len(sentence) < 450:  # leave room for tokenization
                current_chunk += sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    translated_text = ""
    for chunk in chunks:
        if chunk.strip():
            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
            # Look up the vocabulary ID of the target-language token;
            # convert_tokens_to_ids works because NLLB language codes are
            # ordinary vocabulary tokens (fixes the earlier lang_code_to_id
            # tokenizer issue)
            tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
            translated = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,  # force decoding into the target language
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3
            )
            translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
            translated_text += translated_chunk + " "
    return translated_text.strip()
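# Illustrative usage (the exact output wording depends on the model):
#   models = initialize_models()
#   translate_text("The project was no piece of cake.", "en", "hi", models)
#   -> a Hindi sentence built around "कोई आसान काम नहीं"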
# Function to save text as a file
def save_text_to_file(text, original_filename, prefix="translated"):
    output_filename = f"{prefix}_{os.path.basename(original_filename)}.txt"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(text)
    return output_filename
# Main processing function
def process_document(file, source_lang, target_lang, models):
    try:
        # Extract text from uploaded file
        text = extract_text(file)
        # Translate the text
        translated_text = translate_text(text, source_lang, target_lang, models)
        # Save the result
        if translated_text.startswith("Error:"):
            output_file = save_text_to_file(translated_text, file.name, prefix="error")
        else:
            output_file = save_text_to_file(translated_text, file.name)
        return output_file, translated_text
    except Exception as e:
        error_message = f"Error: {str(e)}"
        output_file = save_text_to_file(error_message, file.name, prefix="error")
        return output_file, error_message
# Streamlit interface
def main():
    st.title("Document Translator (NLLB-200)")
    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")
    # Initialize models
    models = initialize_models()
    # File uploader
    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
    # Language selection
    col1, col2 = st.columns(2)
    with col1:
        source_lang = st.selectbox("Source Language", ["en", "hi", "mr"], index=0)
    with col2:
        target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)
    if uploaded_file is not None and st.button("Translate"):
        with st.spinner("Translating..."):
            output_file, result_text = process_document(uploaded_file, source_lang, target_lang, models)
        # Display result
        st.text_area("Translated Text", result_text, height=300)
        # Provide download button
        with open(output_file, "rb") as file:
            st.download_button(
                label="Download Translated Document",
                data=file,
                file_name=os.path.basename(output_file),
                mime="text/plain"
            )
if __name__ == "__main__":
    main()
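# To try this locally (assuming the file is saved as app.py):
#   streamlit run app.py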