"""Gradio app that translates Indonesian text to English with a MarianMT model."""

import math

import gradio as gr
import nltk
import torch
from nltk.tokenize import LineTokenizer, sent_tokenize
from transformers import MarianMTModel, MarianTokenizer

# Fetch the sentence-tokenizer data that sent_tokenize relies on.
nltk.download('punkt_tab')

# Load the translation model and tokenizer from Hugging Face.
# NOTE(review): a bare name like this only resolves to a local directory or an
# un-namespaced Hub repo; the public OPUS-MT checkpoint is published as
# "Helsinki-NLP/opus-mt-id-en" -- confirm which one is intended.
model_name = "opus-mt-id-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Number of sentences sent to the model per generate() call.
_BATCH_SIZE = 8


def _translate_sentences(sentences, batch_size=_BATCH_SIZE):
    """Translate a list of sentences in fixed-size batches, preserving order.

    Args:
        sentences: List of source-language sentence strings.
        batch_size: Maximum number of sentences per model call.

    Returns:
        List of translated strings, one per input sentence.
    """
    translated = []
    for start in range(0, len(sentences), batch_size):
        sent_batch = sentences[start:start + batch_size]
        # Pad to the longest sentence in the batch; truncation=True cuts
        # inputs that exceed the tokenizer's maximum length.
        model_inputs = tokenizer(
            sent_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        # Inference only: no gradients are needed.
        with torch.no_grad():
            generated = model.generate(**model_inputs)
        translated.extend(
            tokenizer.batch_decode(generated, skip_special_tokens=True)
        )
    return translated


def translate_id_en(text):
    """Translate Indonesian text to English, line by line.

    The input is split into lines with ``LineTokenizer``, each line is split
    into sentences with ``sent_tokenize``, and the sentences are translated in
    batches. Translated sentences of a line are joined with a single space and
    the translated lines are joined with a blank line between them.

    Args:
        text: The text to translate. ``None`` or an empty string yields an
            empty result instead of an error.

    Returns:
        The translated text as a single string.
    """
    if not text:
        return ""

    translated_paragraphs = []
    for paragraph in LineTokenizer().tokenize(text):
        sentences = sent_tokenize(paragraph)
        translated_paragraphs.append(" ".join(_translate_sentences(sentences)))

    # Combine all paragraphs into the final translated text.
    return "\n\n".join(translated_paragraphs)


# Define the Gradio interface.
iface = gr.Interface(
    fn=translate_id_en,  # Function to translate text
    inputs=gr.Textbox(
        lines=12,
        placeholder="Enter Indonesian text...",
        label="Input (Indonesian)",
    ),  # Input box
    outputs=gr.Textbox(lines=12, label="Output (English)"),  # Output box
    title="Indonesian to English Translator",  # Title of the app
    description="Translate Indonesian text to English using the opus-mt-id-en model.",
)

# Launch the Gradio interface locally; guarded so that importing this module
# (e.g. to reuse translate_id_en) does not start a server.
if __name__ == "__main__":
    iface.launch()