# NOTE: page-header residue captured together with the source
# ("Spaces: Build error") -- it is not part of the program.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
import streamlit as st | |
from PyPDF2 import PdfReader | |
import docx | |
import os | |
import re | |
import asyncio | |
from concurrent.futures import ThreadPoolExecutor | |
import torch | |
# Replace pytesseract with easyocr | |
import easyocr | |
from PIL import Image | |
import numpy as np | |
# Torch set-up: when a CUDA device is reported, force the "spawn"
# multiprocessing start method (force=True overrides any earlier choice).
if torch.cuda.is_available():
    torch.multiprocessing.set_start_method("spawn", force=True)

# Event-loop set-up: reuse the loop asyncio hands back for this thread; if
# asyncio refuses with RuntimeError, create a new loop and register it as
# the current one.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
@st.cache_resource
def _build_ocr_reader():
    """Construct the English EasyOCR reader.

    Decorated with ``st.cache_resource`` so the reader is built once and
    reused on later calls instead of being re-created for every image.
    An exception raised here propagates to the caller and is not stored,
    so a failed attempt can be retried.
    """
    return easyocr.Reader(['en'])  # Initialize for English


def load_ocr_reader():
    """Return the shared EasyOCR reader, or ``None`` if it cannot be built.

    Returns:
        The cached ``easyocr.Reader`` for English, or ``None`` after
        reporting the failure through ``st.error``.
    """
    try:
        return _build_ocr_reader()
    except Exception as e:
        st.error(f"Error loading OCR reader: {str(e)}")
        return None
def extract_text_from_image(image_file):
    """Run OCR on an image and return the recognised text.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts (a path or a binary
            file-like object such as an uploaded file).

    Returns:
        The recognised text fragments joined with newlines and stripped, or
        the sentence ``"No text was detected in the image."`` when the OCR
        reader returns no results.

    Raises:
        Exception: If the OCR reader is unavailable, or if opening the image
            or running OCR fails. The original error is chained as the cause.
    """
    try:
        reader = load_ocr_reader()
        if reader is None:
            raise Exception("Failed to initialize OCR reader")

        image = Image.open(image_file)
        # Normalise the pixel mode before handing the data to the OCR engine:
        # without this, palette-based or 1-bit images turn into arrays of
        # palette indices / booleans instead of actual pixel intensities.
        image_np = np.array(image.convert("RGB"))

        results = reader.readtext(image_np)
        if not results:
            return "No text was detected in the image."

        # Each result's element at index 1 is used as the recognised text.
        return "\n".join(result[1] for result in results).strip()
    except Exception as e:
        raise Exception(f"Error extracting text from image: {str(e)}") from e
def extract_text(file):
    """Extract plain text from an uploaded PDF, DOCX, TXT or image file.

    The format is chosen from the extension of ``file.name``
    (case-insensitive).

    Args:
        file: A binary file-like object with a ``name`` attribute.

    Returns:
        The extracted text with leading/trailing whitespace removed.

    Raises:
        Exception: For unsupported extensions or any read/parse failure. The
            message is prefixed with "Error extracting text from file: " and
            the original error is chained as the cause.
    """
    try:
        # The same upload may be read more than once (for example once for a
        # preview and once for translation). Rewind first so a second call
        # does not start at end-of-stream and silently return nothing.
        try:
            file.seek(0)
        except (AttributeError, OSError):
            # Not seekable: read from wherever the stream currently is.
            pass

        ext = os.path.splitext(file.name)[1].lower()
        if ext == ".pdf":
            try:
                reader = PdfReader(file)
                # `or ""` guards against a page yielding no text object.
                pages = [(page.extract_text() or "") for page in reader.pages]
                return "\n".join(pages).strip()
            except Exception as e:
                raise Exception(f"Error reading PDF file: {str(e)}") from e
        elif ext == ".docx":
            try:
                doc = docx.Document(file)
                return "\n".join(para.text for para in doc.paragraphs).strip()
            except Exception as e:
                raise Exception(f"Error reading DOCX file: {str(e)}") from e
        elif ext == ".txt":
            try:
                # "utf-8-sig" decodes plain UTF-8 and also drops a leading
                # byte-order mark, which would otherwise end up in the text.
                return file.read().decode("utf-8-sig").strip()
            except Exception as e:
                raise Exception(f"Error reading TXT file: {str(e)}") from e
        elif ext in [".jpg", ".jpeg", ".png"]:
            try:
                return extract_text_from_image(file)
            except Exception as e:
                raise Exception(f"Error processing image file: {str(e)}") from e
        else:
            raise ValueError("Unsupported file format. Please upload PDF, DOCX, TXT, or image files (JPG, JPEG, PNG).")
    except Exception as e:
        raise Exception(f"Error extracting text from file: {str(e)}") from e
@st.cache_resource
def _load_nllb(model_name):
    """Load the tokenizer and seq2seq model for ``model_name``.

    Decorated with ``st.cache_resource`` so the weights are loaded once and
    reused on later calls rather than on every script run. An exception
    raised here propagates and is not stored, so loading can be retried.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model


def load_translation_model():
    """Return the NLLB ``(tokenizer, model)`` pair used for translation.

    Returns:
        ``(tokenizer, model)`` for "facebook/nllb-200-distilled-600M", or
        ``(None, None)`` after reporting the failure through ``st.error``.
    """
    try:
        return _load_nllb("facebook/nllb-200-distilled-600M")
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return None, None
def initialize_models():
    """Build the model registry consumed by the translation functions.

    Returns:
        ``{"nllb": (tokenizer, model)}`` when both objects were loaded, or
        ``None`` (after an ``st.error`` message) when either is missing.
    """
    tokenizer, model = load_translation_model()
    both_loaded = tokenizer is not None and model is not None
    if both_loaded:
        return {"nllb": (tokenizer, model)}

    st.error("Failed to initialize models")
    return None
# Idiom tables keyed by (source code, target code). Keys are lower-case
# English phrases; values are the replacement text in the target language.
_IDIOM_MAPS = {
    ("en", "hi"): {
        "no piece of cake": "कोई आसान काम नहीं",
        "piece of cake": "बहुत आसान काम",
        # Was "दांतों तले उंगली दबाना", which is a Hindi idiom for being
        # astonished -- not the meaning of "bite the bullet".
        "bite the bullet": "हिम्मत से कठिन परिस्थिति का सामना करना",
        "tackle it head-on": "सीधे मुकाबला करना",
        "fell into place": "सब कुछ ठीक हो गया",
        "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
        "with a little perseverance": "थोड़े से धैर्य से",
        # Additional common idioms
        "break a leg": "बहुत बहुत शुभकामनाएं",
        "hit the nail on the head": "बिल्कुल सही बात कहना",
        "once in a blue moon": "बहुत कम, कभी-कभार",
        "under the weather": "तबीयत ठीक नहीं",
        "cost an arm and a leg": "बहुत महंगा",
        "beating around the bush": "इधर-उधर की बात करना",
        "call it a day": "काम समाप्त करना",
        "burn the midnight oil": "रात-रात भर जागकर काम करना",
        "get the ball rolling": "शुरुआत करना",
        "pull yourself together": "खुद को संभालो",
        "shoot yourself in the foot": "अपना ही नुकसान करना",
        "take it with a grain of salt": "संदेह से लेना",
        "the last straw": "सहनशीलता की आखिरी सीमा",
        "time flies": "समय पंख लगाकर उड़ता है",
        "wrap your head around": "समझने की कोशिश करना",
        "cut corners": "काम में छोटा रास्ता अपनाना",
        "back to square one": "फिर से शुरू से",
        "blessing in disguise": "छिपा हुआ वरदान",
        "cry over spilled milk": "बीती बात पर पछताना",
        "keep your chin up": "हिम्मत रखना",
        # Work-related idioms
        "think outside the box": "नए तरीके से सोचना",
        "raise the bar": "मानक ऊंचा करना",
        "learning curve": "सीखने की प्रक्रिया",
        "up and running": "चालू और कार्यरत",
        "back to the drawing board": "फिर से योजना बनाना",
        # Project-related phrases
        "running into issues": "समस्याओं का सामना करना",
        "iron out the bugs": "खामियां दूर करना",
        "in the pipeline": "विचाराधीन",
        "moving forward": "आगे बढ़ते हुए",
        "touch base": "संपर्क में रहना",
        # Technical phrases
        "user-friendly": "उपयोगकर्ता के अनुकूल",
        "cutting-edge": "अत्याधुनिक",
        "state of the art": "अत्याधुनिक तकनीक",
        "proof of concept": "व्यवहार्यता का प्रमाण",
        "game changer": "खेल बदलने वाला",
    },
    ("en", "mr"): {
        "no piece of cake": "सोपं काम नाही",
        "piece of cake": "अतिशय सोपं काम",
        "bite the bullet": "कठीण निर्णय घेणे",
        "tackle it head-on": "समस्येला थेट सामोरे जाणे",
        "fell into place": "सगळं व्यवस्थित झालं",
        "see the light at the end of the tunnel": "अंधारातून उजेडाची किरण दिसणे",
        "with a little perseverance": "थोड्या धीराने",
        "break a leg": "खूप शुभेच्छा",
        "hit the nail on the head": "अगदी बरोबर बोललात",
        "once in a blue moon": "क्वचितच, कधीतरी",
        "under the weather": "तब्येत ठीक नसणे",
        "cost an arm and a leg": "खूप महाग",
        "beating around the bush": "गोल गोल फिरवणे",
        "call it a day": "दिवसाचं काम संपवणे",
        "burn the midnight oil": "रात्रंदिवस मेहनत करणे",
        "get the ball rolling": "सुरुवात करणे",
        "pull yourself together": "स्वतःला सावरा",
        "shoot yourself in the foot": "स्वतःचेच पाय स्वतः कापणे",
        "take it with a grain of salt": "साशंक दृष्टीने पाहणे",
        "the last straw": "सहनशक्तीची शेवटची मर्यादा",
        "time flies": "वेळ पंख लावून उडतो",
        "wrap your head around": "समजून घेण्याचा प्रयत्न करणे",
        "cut corners": "कमी वेळात काम उरकणे",
        "back to square one": "पुन्हा सुरुवातीला",
        "blessing in disguise": "आशीर्वाद लपलेला",
        "cry over spilled milk": "झालेल्या गोष्टीसाठी रडत बसणे",
        "keep your chin up": "धीर धरा",
        # Work-related idioms
        "think outside the box": "वेगळ्या पद्धतीने विचार करणे",
        "raise the bar": "पातळी उंचावणे",
        "learning curve": "शिकण्याची प्रक्रिया",
        "up and running": "सुरू आणि कार्यरत",
        "back to the drawing board": "पुन्हा नव्याने योजना आखणे",
        # Project-related phrases
        "running into issues": "अडचणींना सामोरे जाणे",
        "iron out the bugs": "त्रुटी दूर करणे",
        "in the pipeline": "विचाराधीन",
        "moving forward": "पुढे जाताना",
        "touch base": "संपर्कात राहणे",
        # Technical phrases
        "user-friendly": "वापरकर्त्यास सोयीस्कर",
        "cutting-edge": "अत्याधुनिक",
        "state of the art": "सर्वोत्कृष्ट तंत्रज्ञान",
        "proof of concept": "संकल्पनेची सिद्धता",
        "game changer": "खेळ बदलणारी गोष्ट",
    },
}


def _compile_idiom_pattern(idioms):
    """Compile one case-insensitive regex matching any phrase in ``idioms``."""
    # Longest phrases first, so "no piece of cake" wins over "piece of cake".
    ordered = sorted(idioms, key=len, reverse=True)
    alternation = "|".join(map(re.escape, ordered))
    # The lookarounds require the phrase to stand on its own, so text such as
    # "masterpiece of cake" or "praise the barn" is left untouched.
    return re.compile(r"(?<!\w)(?:" + alternation + r")(?!\w)", re.IGNORECASE)


# Compiled once at import time instead of on every call.
_IDIOM_PATTERNS = {
    pair: _compile_idiom_pattern(table) for pair, table in _IDIOM_MAPS.items()
}


def preprocess_idioms(text: str, src_lang: str, tgt_lang: str) -> str:
    """Replace known English idioms with fixed target-language phrases.

    Args:
        text: Text to scan.
        src_lang: Short source-language code; only "en" has idiom tables.
        tgt_lang: Short target-language code; "hi" and "mr" have tables.

    Returns:
        ``text`` with every standalone, case-insensitive occurrence of a
        known idiom replaced. For language pairs without a table the text is
        returned unchanged.
    """
    pair = (src_lang, tgt_lang)
    pattern = _IDIOM_PATTERNS.get(pair)
    if pattern is None:
        return text

    idiom_map = _IDIOM_MAPS[pair]

    def replace_idiom(match):
        matched = match.group(0)
        # Case-insensitive matching can accept characters whose lower-case
        # form is not the ASCII key; keep the original text in that case
        # instead of failing with KeyError.
        return idiom_map.get(matched.lower(), matched)

    return pattern.sub(replace_idiom, text)
def _split_into_chunks(text, max_chars=450):
    """Group sentences into chunks that stay under ``max_chars`` characters.

    A sentence is kept together with the punctuation run that ends it, so a
    chunk never starts with a terminator that belongs to the previous
    sentence. Whitespace-only pieces are dropped. A single sentence longer
    than the limit still becomes its own (oversized) chunk.
    """
    chunks = []
    current = ""
    for piece in re.findall(r"[^.!?।]+[.!?।]*|[.!?।]+", text):
        if not piece.strip():
            continue
        if len(current) + len(piece) < max_chars:
            current += piece
        else:
            if current:
                chunks.append(current)
            current = piece
    if current:
        chunks.append(current)
    return chunks


async def translate_text_async(text, src_lang, tgt_lang, models):
    """Translate ``text`` between English, Hindi and Marathi with NLLB.

    The coroutine contains no ``await`` points; it is ``async`` only so that
    existing callers can keep driving it through an event loop.

    Args:
        text: Text to translate.
        src_lang: Source language name ("English", "Hindi" or "Marathi"),
            in any letter case.
        tgt_lang: Target language name, same choices.
        models: Mapping whose "nllb" entry is a ``(tokenizer, model)`` pair.

    Returns:
        The translated text. ``text`` itself when source and target are the
        same language, "Error: Unsupported language combination" for unknown
        language names, or "Error during translation: ..." when anything
        fails while translating.
    """
    if src_lang == tgt_lang:
        return text

    src_lang_simple = src_lang.lower()
    tgt_lang_simple = tgt_lang.lower()
    # Names that differ only in letter case denote the same language too.
    if src_lang_simple == tgt_lang_simple:
        return text

    lang_map = {"english": "eng_Latn", "hindi": "hin_Deva", "marathi": "mar_Deva"}
    # Short codes understood by preprocess_idioms. Slicing the first two
    # letters of the name turned "marathi" into "ma" rather than "mr", which
    # meant the Marathi idiom table could never be selected.
    short_codes = {"english": "en", "hindi": "hi", "marathi": "mr"}

    if src_lang_simple not in lang_map or tgt_lang_simple not in lang_map:
        return "Error: Unsupported language combination"

    try:
        # Process idioms first
        preprocessed_text = preprocess_idioms(
            text, short_codes[src_lang_simple], short_codes[tgt_lang_simple]
        )

        tokenizer, model = models["nllb"]
        # Tell the tokenizer which language the input is written in, so
        # Hindi/Marathi input is not tagged with the tokenizer's default
        # source language.
        tokenizer.src_lang = lang_map[src_lang_simple]
        # The target-language token id does not change between chunks.
        tgt_lang_id = tokenizer.convert_tokens_to_ids(lang_map[tgt_lang_simple])

        translated_chunks = []
        for chunk in _split_into_chunks(preprocessed_text):
            inputs = tokenizer(
                chunk,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            translated = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3,
            )
            translated_chunks.append(
                tokenizer.decode(translated[0], skip_special_tokens=True)
            )

        return " ".join(translated_chunks).strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"
def translate_text(text, src_lang, tgt_lang, models):
    """Synchronously run ``translate_text_async`` and return its result.

    ``asyncio.run`` creates a fresh event loop for the call and tears it
    down afterwards. The earlier hand-rolled version closed its loop but
    left that closed loop registered as the thread's current event loop.

    Args:
        text: Text to translate.
        src_lang: Source language name.
        tgt_lang: Target language name.
        models: Model registry passed through to ``translate_text_async``.

    Returns:
        Whatever ``translate_text_async`` returns for the same arguments.
    """
    return asyncio.run(translate_text_async(text, src_lang, tgt_lang, models))
def save_text_to_file(text, original_filename, prefix="translated"):
    """Write ``text`` to ``<prefix>_<original base name>.txt``.

    The file is created relative to the current working directory, because
    only the base name of ``original_filename`` (without directory and
    extension) is used. Output is always UTF-8 plain text.

    Args:
        text: Content to write.
        original_filename: Name or path of the source document.
        prefix: Leading part of the output file name.

    Returns:
        The output file name, or ``None`` (after an ``st.error`` message)
        when the file could not be written.
    """
    try:
        stem, _extension = os.path.splitext(os.path.basename(original_filename))
        output_filename = "{}_{}.txt".format(prefix, stem)
        with open(output_filename, "w", encoding="utf-8") as handle:
            handle.write(text)
    except Exception as err:
        st.error(f"Error saving file: {err}")
        return None
    return output_filename
def process_document(file, source_lang, target_lang, models):
    """Extract, translate and save one uploaded document.

    Args:
        file: Uploaded file object with a ``name`` attribute.
        source_lang: Source language name.
        target_lang: Target language name.
        models: Model registry passed through to ``translate_text``.

    Returns:
        ``(output_file, text)``. On success ``text`` is the translation and
        the file name starts with "translated_". When translation reports an
        error, or any step raises, ``text`` is the error message and the file
        name starts with "error_"; ``output_file`` may be ``None`` if even
        the error file could not be written.
    """
    try:
        text = extract_text(file)
        translated_text = translate_text(text, source_lang, target_lang, models)

        # The translator reports failures as strings with two different
        # prefixes; checking only "Error:" stored "Error during translation:"
        # results as if they were successful translations.
        failed = translated_text.startswith(("Error:", "Error during translation:"))
        output_file = save_text_to_file(
            translated_text,
            file.name,
            prefix="error" if failed else "translated",
        )

        if output_file is None:
            raise Exception("Failed to save output file")
        return output_file, translated_text
    except Exception as e:
        error_message = f"Error: {str(e)}"
        output_file = save_text_to_file(error_message, file.name, prefix="error")
        return output_file, error_message
# Languages offered in both tabs. The order matters: the select boxes pick
# their defaults by index (0 for the source, 1 for the target).
_LANGUAGE_CHOICES = ["English", "Hindi", "Marathi"]

# MIME type per output-file extension; anything else is served as text/plain.
_DOWNLOAD_MIME_TYPES = {
    '.pdf': 'application/pdf',
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    '.txt': 'text/plain',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
}


def _pick_languages(key_prefix):
    """Render side-by-side source/target select boxes and return both picks.

    Widget keys are ``<key_prefix>_src`` and ``<key_prefix>_tgt``.
    """
    col1, col2 = st.columns(2)
    with col1:
        source = st.selectbox(
            "Source Language",
            _LANGUAGE_CHOICES,
            index=0,
            key=f"{key_prefix}_src",
        )
    with col2:
        target = st.selectbox(
            "Target Language",
            _LANGUAGE_CHOICES,
            index=1,
            key=f"{key_prefix}_tgt",
        )
    return source, target


def _render_document_tab(models):
    """Render the upload / translate / download flow for documents."""
    st.subheader("Document Translation")
    st.write("Upload a document (PDF, DOCX, TXT, or Image) and select languages.")
    uploaded_file = st.file_uploader(
        "Upload Document",
        type=["pdf", "docx", "txt", "jpg", "jpeg", "png"],
        key="doc_uploader",
    )
    source_lang, target_lang = _pick_languages("doc")

    # The button is only rendered once a file has been uploaded.
    if uploaded_file is None or not st.button("Translate Document"):
        return

    try:
        with st.spinner("Translating..."):
            # Extract and show input text
            input_text = extract_text(uploaded_file)
            st.subheader("Input Text")
            st.text_area("Original Text", input_text, height=200)

            # The preview above consumed the upload stream; rewind it so the
            # translation step reads the document from the beginning again.
            uploaded_file.seek(0)

            # Translate and show output text
            output_file, result_text = process_document(
                uploaded_file,
                source_lang.lower(),
                target_lang.lower(),
                models,
            )
            st.subheader("Translated Text")
            st.text_area("Translation", result_text, height=200)

            # Provide download button with a MIME type matching the file
            if output_file and os.path.exists(output_file):
                with open(output_file, "rb") as handle:
                    payload = handle.read()
                ext = os.path.splitext(output_file)[1].lower()
                st.download_button(
                    label="Download Translated Document",
                    data=payload,
                    file_name=os.path.basename(output_file),
                    mime=_DOWNLOAD_MIME_TYPES.get(ext, 'text/plain'),
                )
            else:
                st.error("Failed to generate output file")
    except Exception as e:
        st.error(f"An error occurred during translation: {str(e)}")


def _render_text_tab(models):
    """Render the direct text-entry translation flow."""
    st.subheader("Text Translation")
    st.write("Enter text directly for translation.")
    text_source_lang, text_target_lang = _pick_languages("text")
    input_text = st.text_area("Enter text to translate", height=150)

    # The button is only rendered once some text has been entered.
    if not input_text or not st.button("Translate Text"):
        return

    try:
        with st.spinner("Translating..."):
            translated_text = translate_text(
                input_text,
                text_source_lang.lower(),
                text_target_lang.lower(),
                models,
            )
            # Show translation result
            st.text_area("Translation", translated_text, height=150)
            # Add download button for translated text
            st.download_button(
                label="Download Translation",
                data=translated_text,
                file_name="translation.txt",
                mime="text/plain",
            )
    except Exception as e:
        st.error(f"An error occurred during translation: {str(e)}")


def main():
    """Build the Streamlit page: title, model set-up and the two tabs.

    Stops after showing an error when the translation models cannot be
    initialised.
    """
    st.title("Document Translation Toolkit")

    models = initialize_models()
    if models is None:
        st.error("Failed to initialize translation models. Please try again.")
        return

    tab1, tab2 = st.tabs(["Document Translation", "Text Translation"])
    with tab1:
        _render_document_tab(models)
    with tab2:
        _render_text_tab(models)
if __name__ == "__main__":
    # Last-resort boundary: anything that escapes main() is shown in the
    # page via st.error instead of propagating further.
    try:
        main()
    except Exception as exc:
        st.error(f"Application error: {exc}")