In [20]:


import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# IndicProcessor is used below, so import it in this cell: otherwise the cell
# only works when a later cell has already been executed (fails on a fresh kernel).
from IndicTransToolkit.processor import IndicProcessor

# End-to-end demo: translate a small batch of English sentences to Tamil.
ip = IndicProcessor(inference=True)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)

sentences = [
    "This is a test sentence.",
    "This is another longer different test sentence.",
    "Please send an SMS to 9876543210 and an email on newemail123@xyz.com by 15th October, 2023.",
]

# Step 1: toolkit preprocessing for the eng_Latn -> tam_Taml direction.
batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="tam_Taml", visualize=False)  # set it to visualize=True to print a progress bar
# Step 2: tokenize; inputs are padded to the longest sentence and truncated at 256 tokens.
batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")

# Step 3: beam-search generation with autograd disabled.
with torch.inference_mode():
    outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)

with tokenizer.as_target_tokenizer():
    # Original author's note: this scoping makes the tokenizer decode with the
    # target vocabulary; without it the output is de-tokenized with the source
    # vocabulary and comes out as gibberish.
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Step 4: toolkit postprocessing for the Tamil output, then show the translations.
outputs = ip.postprocess_batch(outputs, lang="tam_Taml")
print(outputs)



['இது ஒரு சோதனை வாக்கியம். ', 'இது மற்றொரு நீண்ட வித்தியாசமான சோதனை வாக்கியமாகும். ', '9876543210 என்ற எண்ணுக்கு ஒரு எஸ்எம்எஸ் அனுப்பவும், 2023 அக்டோபர் 15 ஆம் தேதிக்குள் newemail123@xyz.com என்ற மின்னஞ்சல் முகவரிக்கு அனுப்பவும். ']


In [None]:
import sys
import os
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Add local IndicTransToolkit path if needed
sys.path.append(os.path.abspath("libs/IndicTransToolkit"))
from IndicTransToolkit.processor import IndicProcessor

# Hugging Face artefacts for the English -> Indic distilled checkpoint.
# Both are loaded with trust_remote_code=True, which allows the checkpoint's
# own Python code to run.
tokenizer = AutoTokenizer.from_pretrained(
    "ai4bharat/indictrans2-en-indic-dist-200M",
    trust_remote_code=True,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    "ai4bharat/indictrans2-en-indic-dist-200M",
    trust_remote_code=True,
)

# Toolkit pre/post-processor, created in inference mode.
ip = IndicProcessor(inference=True)

def translate(text, target_lang):
    """Translate one English string into ``target_lang``.

    The source language is fixed to ``"eng_Latn"``. The function relies on the
    module-level ``ip`` (IndicProcessor), ``tokenizer`` and ``model`` objects
    loaded earlier in this notebook.

    Args:
        text: English text to translate. ``None``, an empty string or a
            whitespace-only string is treated as missing input.
        target_lang: Target language code passed to the processor as
            ``tgt_lang`` / ``lang`` (this notebook uses ``"tam_Taml"``).

    Returns:
        The first (and only) postprocessed translation, or the prompt
        ``"Please enter some text."`` when no usable input was given.
    """
    # Guard against None as well as blank strings; calling .strip() on None
    # would raise AttributeError instead of returning the friendly prompt.
    if text is None or not text.strip():
        return "Please enter some text."

    # Preprocess: the processor and tokenizer both work on batches, so wrap the
    # single input in a one-element list. Inputs are truncated at 256 tokens.
    batch = ip.preprocess_batch([text], src_lang="eng_Latn", tgt_lang=target_lang)
    batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")

    # Inference: beam search with autograd disabled.
    with torch.inference_mode():
        outputs = model.generate(**batch, num_beams=5, max_length=256)

    # Postprocess: decode inside the target-tokenizer scope (same pattern as
    # the batch demo cell in this notebook), then apply toolkit postprocessing.
    with tokenizer.as_target_tokenizer():
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    return ip.postprocess_batch(decoded, lang=target_lang)[0]


In [19]:
# Smoke test of the helper defined above. The notebook defines `translate`;
# `translate_text` is not defined in it and fails on a fresh kernel.
translate("hello", "tam_Taml")

'வணக்கம். '

In [None]:
import requests

# Call the locally running translation service over HTTP.
url = "http://localhost:7860/translate"

payload = {
    "text": "Hello, how are you?",
    "target_lang": "tam_Taml"
}

headers = {
    "Content-Type": "application/json"
}

# Always set a timeout: without one, requests waits indefinitely if the server
# accepts the connection but never answers. (connect timeout, read timeout) in
# seconds; the read timeout is generous to leave room for model inference.
response = requests.post(url, json=payload, headers=headers, timeout=(5, 120))

if response.status_code == 200:
    print("Translation:", response.json()["translation"])
else:
    # Surface the status code and raw body to make server-side errors visible.
    print("Error:", response.status_code, response.text)


Translation: टाम् @टाम्ल नमस्कार, आप कैसे हैं? 
