import gradio as gr
from transformers import MarianMTModel, MarianTokenizer
import torch
from nltk.tokenize import sent_tokenize, LineTokenizer
import math
import nltk
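
# Download the Punkt sentence tokenizer data used by sent_tokenize.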
nltk.download('punkt_tab')
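
# Load the Indonesian-to-English Marian model and its tokenizer from the Hugging Face Hub.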
model_name = "Helsinki-NLP/opus-mt-id-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)


def translate_id_en(text):
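    # Split the input into paragraphs (one per line) so paragraph breaks survive translation.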
    lt = LineTokenizer()
    batch_size = 8
    paragraphs = lt.tokenize(text)
    translated_paragraphs = []

    for paragraph in paragraphs:
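        # Split the paragraph into sentences and translate them in batches of batch_size.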
        sentences = sent_tokenize(paragraph)
        batches = math.ceil(len(sentences) / batch_size)
        translated = []

        for i in range(batches):
            sent_batch = sentences[i * batch_size:(i + 1) * batch_size]
            model_inputs = tokenizer(sent_batch, return_tensors="pt", padding=True, truncation=True)
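
            # Inference only: disable gradient tracking while generating translations.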
            with torch.no_grad():
                translated_batch = model.generate(**model_inputs)

            translated += [tokenizer.decode(t, skip_special_tokens=True) for t in translated_batch]

        translated_paragraphs.append(" ".join(translated))
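
    # Re-join the translated paragraphs, separated by blank lines.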
    translated_text = "\n\n".join(translated_paragraphs)
    return translated_text
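

# Build a simple Gradio UI: one Indonesian input textbox, one English output textbox.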
iface = gr.Interface(
    fn=translate_id_en,
    inputs=gr.Textbox(lines=12, placeholder="Enter Indonesian text...", label="Input (Indonesian)"),
    outputs=gr.Textbox(lines=12, label="Output (English)"),
    title="Indonesian to English Translator",
    description="Translate Indonesian text to English using the opus-mt-id-en model."
)

iface.launch()