opus-mt-id-en / translation-id-en-marian-gradio.py
finnstrom3693's picture
Upload translation-id-en-marian-gradio.py
f038f7c verified
raw
history blame
2.14 kB
import gradio as gr
from transformers import MarianMTModel, MarianTokenizer
import torch
from nltk.tokenize import sent_tokenize, LineTokenizer
import math
import nltk
# Fetch the Punkt sentence-tokenizer data only when it is not already
# installed, so that restarts neither hit the network nor fail offline.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab")

# Load the translation model and its tokenizer once at start-up; both are
# reused by every translation request.
# NOTE(review): `from_pretrained` resolves this name as a local directory or a
# Hub repo id. The upstream Hub id is normally namespaced
# ("Helsinki-NLP/opus-mt-id-en") -- confirm this bare name is intentional.
model_name = "opus-mt-id-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
def translate_id_en(text) -> str:
    """Translate Indonesian text to English, line by line.

    The input is split into lines with NLTK's ``LineTokenizer`` (which, with
    its default settings, drops blank lines); each line is treated as a
    paragraph. Every paragraph is split into sentences with ``sent_tokenize``
    and the sentences are translated in batches of at most eight, so that a
    long paragraph never becomes one oversized model input.

    Args:
        text: Indonesian text to translate. ``None`` and blank strings are
            accepted.

    Returns:
        The English translation. Sentences of one paragraph are joined with a
        single space and paragraphs are separated by a blank line. An empty
        string is returned when the input has no non-blank content.
    """
    # Guard clause: None would raise inside the tokenizer, and blank text
    # produces no paragraphs anyway, so skip all model work for both.
    if text is None or not text.strip():
        return ""

    batch_size = 8
    translated_paragraphs = []
    for paragraph in LineTokenizer().tokenize(text):
        sentences = sent_tokenize(paragraph)
        translated = []
        # Step through the sentences one batch at a time.
        for start in range(0, len(sentences), batch_size):
            sent_batch = sentences[start:start + batch_size]
            model_inputs = tokenizer(
                sent_batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
            )
            # Inference only: no gradients are needed for generation.
            with torch.no_grad():
                generated = model.generate(**model_inputs)
            # Turn each generated token sequence back into plain text.
            translated.extend(
                tokenizer.decode(token_ids, skip_special_tokens=True)
                for token_ids in generated
            )
        translated_paragraphs.append(" ".join(translated))

    # Combine all paragraphs into the final translated text.
    return "\n\n".join(translated_paragraphs)
# Build the Gradio UI: one multi-line box for the Indonesian input and one
# for the English output, wired to the translation function.
_input_box = gr.Textbox(
    lines=12,
    placeholder="Enter Indonesian text...",
    label="Input (Indonesian)",
)
_output_box = gr.Textbox(lines=12, label="Output (English)")

iface = gr.Interface(
    fn=translate_id_en,
    inputs=_input_box,
    outputs=_output_box,
    title="Indonesian to English Translator",
    description="Translate Indonesian text to English using the opus-mt-id-en model.",
)

# Start the Gradio app.
iface.launch()