{ "cells": [ { "cell_type": "code", "execution_count": 20, "id": "f73dec47", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['இது ஒரு சோதனை வாக்கியம். ', 'இது மற்றொரு நீண்ட வித்தியாசமான சோதனை வாக்கியமாகும். ', '9876543210 என்ற எண்ணுக்கு ஒரு எஸ்எம்எஸ் அனுப்பவும், 2023 அக்டோபர் 15 ஆம் தேதிக்குள் newemail123@xyz.com என்ற மின்னஞ்சல் முகவரிக்கு அனுப்பவும். ']\n" ] } ], "source": [ "\n", "\n", "import torch\n", "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", "\n", "ip = IndicProcessor(inference=True)\n", "tokenizer = AutoTokenizer.from_pretrained(\"ai4bharat/indictrans2-en-indic-dist-200M\", trust_remote_code=True)\n", "model = AutoModelForSeq2SeqLM.from_pretrained(\"ai4bharat/indictrans2-en-indic-dist-200M\", trust_remote_code=True)\n", "\n", "sentences = [\n", " \"This is a test sentence.\",\n", " \"This is another longer different test sentence.\",\n", " \"Please send an SMS to 9876543210 and an email on newemail123@xyz.com by 15th October, 2023.\",\n", "]\n", "\n", "batch = ip.preprocess_batch(sentences, src_lang=\"eng_Latn\", tgt_lang=\"tam_Taml\", visualize=False) # set it to visualize=True to print a progress bar\n", "batch = tokenizer(batch, padding=\"longest\", truncation=True, max_length=256, return_tensors=\"pt\")\n", "\n", "with torch.inference_mode():\n", " outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)\n", "\n", "with tokenizer.as_target_tokenizer():\n", " # This scoping is absolutely necessary, as it will instruct the tokenizer to tokenize using the target vocabulary.\n", " # Failure to use this scoping will result in gibberish/unexpected predictions as the output will be de-tokenized with the source vocabulary instead.\n", " outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", "\n", "outputs = ip.postprocess_batch(outputs, lang=\"tam_Taml\")\n", "print(outputs)\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "4ec49007", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "6fa9fc68", "metadata": {}, "outputs": [], "source": [ "import sys\n", "import os\n", "import torch\n", "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", "\n", "# Add local IndicTransToolkit path if needed\n", "sys.path.append(os.path.abspath(\"libs/IndicTransToolkit\"))\n", "from IndicTransToolkit.processor import IndicProcessor\n", "\n", "# Load processor, tokenizer, and model\n", "ip = IndicProcessor(inference=True)\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"ai4bharat/indictrans2-en-indic-dist-200M\", trust_remote_code=True)\n", "model = AutoModelForSeq2SeqLM.from_pretrained(\"ai4bharat/indictrans2-en-indic-dist-200M\", trust_remote_code=True)\n", "\n", "def translate(text, target_lang):\n", " if not text.strip():\n", " return \"Please enter some text.\"\n", "\n", " # Preprocess\n", " batch = ip.preprocess_batch([text], src_lang=\"eng_Latn\", tgt_lang=target_lang)\n", " batch = tokenizer(batch, padding=\"longest\", truncation=True, max_length=256, return_tensors=\"pt\")\n", "\n", " # Inference\n", " with torch.inference_mode():\n", " outputs = model.generate(**batch, num_beams=5, max_length=256)\n", "\n", " # Postprocess\n", " with tokenizer.as_target_tokenizer():\n", " decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", "\n", " return ip.postprocess_batch(decoded, lang=target_lang)[0]\n" ] }, { "cell_type": 
"code", "execution_count": 19, "id": "c4ae654a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'வணக்கம். '" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "translate_text(\"hello\",\"tam_Taml\")" ] }, { "cell_type": "code", "execution_count": null, "id": "530f0925", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Translation: टाम् @टाम्ल नमस्कार, आप कैसे हैं? \n" ] } ], "source": [ "import requests\n", "\n", "url = \"http://localhost:7860/translate\"\n", "\n", "payload = {\n", " \"text\": \"Hello, how are you?\",\n", " \"target_lang\": \"tam_Taml\"\n", "}\n", "\n", "headers = {\n", " \"Content-Type\": \"application/json\"\n", "}\n", "\n", "response = requests.post(url, json=payload, headers=headers)\n", "\n", "if response.status_code == 200:\n", " print(\"Translation:\", response.json()[\"translation\"])\n", "else:\n", " print(\"Error:\", response.status_code, response.text)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "73eb9c61", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "indietrans2", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.2" } }, "nbformat": 4, "nbformat_minor": 5 }