+
+
+ IndicTrans API
+
+
+ Real-time Indian Language Text Translation with IndicTrans!
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From
+
+
+
+
+
+
+ To
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
{{ transcription_time }}
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/interface/logo.png b/interface/logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..8b924af80542daadb49ab5b72765bff3d9381d9f
Binary files /dev/null and b/interface/logo.png differ
diff --git a/joint_translate.sh b/joint_translate.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ce23dda9c7d05884a2289db921375ad25370824d
--- /dev/null
+++ b/joint_translate.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+echo `date`
+infname=$1
+outfname=$2
+src_lang=$3
+tgt_lang=$4
+exp_dir=$5
+ref_fname=$6
+
+SRC_PREFIX='SRC'
+TGT_PREFIX='TGT'
+
+#`dirname $0`/env.sh
+SUBWORD_NMT_DIR='subword-nmt'
+model_dir=$exp_dir/model
+data_bin_dir=$exp_dir/final_bin
+
+### normalization and script conversion
+
+echo "Applying normalization and script conversion"
+input_size=`python scripts/preprocess_translate.py $infname $outfname.norm $src_lang true`
+echo "Number of sentences in input: $input_size"
+
+### apply BPE to input file
+
+echo "Applying BPE"
+python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+ -c $exp_dir/vocab/bpe_codes.32k.${SRC_PREFIX} \
+ --vocabulary $exp_dir/vocab/vocab.$SRC_PREFIX \
+ --vocabulary-threshold 5 \
+ < $outfname.norm \
+ > $outfname._bpe
+
+# not needed for joint training
+# echo "Adding language tags"
+python scripts/add_tags_translate.py $outfname._bpe $outfname.bpe $src_lang $tgt_lang
+
+### run decoder
+
+echo "Decoding"
+
+src_input_bpe_fname=$outfname.bpe
+tgt_output_fname=$outfname
+fairseq-interactive $data_bin_dir \
+ -s $SRC_PREFIX -t $TGT_PREFIX \
+ --distributed-world-size 1 \
+ --path $model_dir/checkpoint_best.pt \
+ --batch-size 64 --buffer-size 2500 --beam 5 --remove-bpe \
+ --skip-invalid-size-inputs-valid-test \
+ --user-dir model_configs \
+ --input $src_input_bpe_fname > $tgt_output_fname.log 2>&1
+
+
+echo "Extracting translations, script conversion and detokenization"
+# this part reverses the transliteration from devnagiri script to target lang and then detokenizes it.
+python scripts/postprocess_translate.py $tgt_output_fname.log $tgt_output_fname $input_size $tgt_lang true
+
+# This block is now moved to compute_bleu.sh for release with more documentation.
+# if [ $src_lang == 'en' ]; then
+# # indicnlp tokenize the output files before evaluation
+# input_size=`python scripts/preprocess_translate.py $ref_fname $ref_fname.tok $tgt_lang`
+# input_size=`python scripts/preprocess_translate.py $tgt_output_fname $tgt_output_fname.tok $tgt_lang`
+# sacrebleu --tokenize none $ref_fname.tok < $tgt_output_fname.tok
+# else
+# # indic to en models
+# sacrebleu $ref_fname < $tgt_output_fname
+# fi
+# echo `date`
+echo "Translation completed"
diff --git a/learn_bpe.sh b/learn_bpe.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3219ac8d5615643344237eaa0279af3fe7ced254
--- /dev/null
+++ b/learn_bpe.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+expdir=$1 # EXPDIR
+num_operations=${2:-32000}
+
+#`dirname $0`/env.sh
+SUBWORD_NMT_DIR="subword-nmt"
+data_dir="$expdir/data"
+train_file=$data_dir/train
+# num_operations=32000
+
+echo Input file: $train_file
+
+mkdir -p $expdir/vocab
+
+echo "learning joint BPE"
+cat $train_file.SRC $train_file.TGT > $train_file.ALL
+python $SUBWORD_NMT_DIR/subword_nmt/learn_bpe.py \
+ --input $train_file.ALL \
+ -s $num_operations \
+ -o $expdir/vocab/bpe_codes.32k.SRC_TGT \
+ --num-workers -1
+
+echo "computing SRC vocab"
+python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+ -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
+ --num-workers -1 \
+ -i $train_file.SRC | \
+python $SUBWORD_NMT_DIR/subword_nmt/get_vocab.py \
+ > $expdir/vocab/vocab.tmp.SRC
+python scripts/clean_vocab.py $expdir/vocab/vocab.tmp.SRC $expdir/vocab/vocab.SRC
+#rm $expdir/vocab/vocab.tmp.SRC
+
+echo "computing TGT vocab"
+python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+ -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
+ --num-workers -1 \
+ -i $train_file.TGT | \
+python $SUBWORD_NMT_DIR/subword_nmt/get_vocab.py \
+ > $expdir/vocab/vocab.tmp.TGT
+python scripts/clean_vocab.py $expdir/vocab/vocab.tmp.TGT $expdir/vocab/vocab.TGT
+#rm $expdir/vocab/vocab.tmp.TGT
+
+rm $train_file.ALL
diff --git a/learn_single_bpe.sh b/learn_single_bpe.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8d56edb8532e19ba0aadedf6bf9e8ae4b6828eb0
--- /dev/null
+++ b/learn_single_bpe.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+expdir=$1 # EXPDIR
+num_operations=${2:-32000}
+
+#`dirname $0`/env.sh
+SUBWORD_NMT_DIR="subword-nmt"
+data_dir="$expdir/data"
+train_file=$data_dir/train
+# num_operations=32000
+
+echo Input file: $train_file
+
+mkdir -p $expdir/vocab
+
+echo "learning source BPE"
+
+python $SUBWORD_NMT_DIR/subword_nmt/learn_bpe.py \
+ --input $train_file.SRC \
+ -s $num_operations \
+ -o $expdir/vocab/bpe_codes.32k.SRC\
+ --num-workers -1
+
+echo "learning target BPE"
+python $SUBWORD_NMT_DIR/subword_nmt/learn_bpe.py \
+ --input $train_file.TGT \
+ -s $num_operations \
+ -o $expdir/vocab/bpe_codes.32k.TGT\
+ --num-workers -1
+
+echo "computing SRC vocab"
+python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+ -c $expdir/vocab/bpe_codes.32k.SRC \
+ --num-workers -1 \
+ -i $train_file.SRC | \
+python $SUBWORD_NMT_DIR/subword_nmt/get_vocab.py \
+ > $expdir/vocab/vocab.tmp.SRC
+python scripts/clean_vocab.py $expdir/vocab/vocab.tmp.SRC $expdir/vocab/vocab.SRC
+rm $expdir/vocab/vocab.tmp.SRC
+
+echo "computing TGT vocab"
+python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+ -c $expdir/vocab/bpe_codes.32k.TGT \
+ --num-workers -1 \
+ -i $train_file.TGT | \
+python $SUBWORD_NMT_DIR/subword_nmt/get_vocab.py \
+ > $expdir/vocab/vocab.tmp.TGT
+python scripts/clean_vocab.py $expdir/vocab/vocab.tmp.TGT $expdir/vocab/vocab.TGT
+rm $expdir/vocab/vocab.tmp.TGT
diff --git a/legacy/apply_bpe_test_valid_notag.sh b/legacy/apply_bpe_test_valid_notag.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f152770c4ad7d5c13f72b492d50ffff238ff44f0
--- /dev/null
+++ b/legacy/apply_bpe_test_valid_notag.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+expdir=$1 # EXPDIR
+org_data_dir=$2
+langs=$3
+
+#`dirname $0`/env.sh
+SUBWORD_NMT_DIR="subword-nmt"
+echo "Apply to each language"
+
+for dset in `echo test dev`
+do
+ echo $dset
+
+ in_dset_dir="$org_data_dir/$dset"
+ out_dset_dir="$expdir/bpe/$dset"
+
+ for lang in $langs
+ do
+
+ echo Apply BPE for $dset "-" $lang
+
+ mkdir -p $out_dset_dir
+
+ python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+ -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
+ --vocabulary $expdir/vocab/vocab.SRC \
+ --vocabulary-threshold 5 \
+ < $in_dset_dir/$dset.$lang \
+ > $out_dset_dir/$dset.$lang
+
+ done
+done
diff --git a/legacy/apply_bpe_train_notag.sh b/legacy/apply_bpe_train_notag.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fa24a57dc2a8b26eed1aae66793f9a65c2712e26
--- /dev/null
+++ b/legacy/apply_bpe_train_notag.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+expdir=$1 # EXPDIR
+
+#`dirname $0`/env.sh
+SUBWORD_NMT_DIR="subword-nmt"
+
+data_dir="$expdir/data"
+train_file=$data_dir/train
+bpe_file=$expdir/bpe/train/train
+
+mkdir -p $expdir/bpe/train
+
+echo "Apply to SRC corpus"
+
+python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+ -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
+ --vocabulary $expdir/vocab/vocab.SRC \
+ --vocabulary-threshold 5 \
+ --num-workers "-1" \
+ < $train_file.SRC \
+ > $bpe_file.SRC
+
+echo "Apply to TGT corpus"
+
+python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+ -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
+ --vocabulary $expdir/vocab/vocab.TGT \
+ --vocabulary-threshold 5 \
+ --num-workers "-1" \
+ < $train_file.TGT \
+ > $bpe_file.TGT
+
diff --git a/legacy/env.sh b/legacy/env.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9c9611b0d11e821bdb17b612b64c3d14e208cc74
--- /dev/null
+++ b/legacy/env.sh
@@ -0,0 +1,17 @@
+# Environment settings; every value below is an empty placeholder to fill in.
+export SRC=''
+
+## Python env directory where fairseq is installed
+export PYTHON_ENV=''
+
+export SUBWORD_NMT_DIR=''
+export INDIC_RESOURCES_PATH=''
+export INDIC_NLP_HOME=''
+
+export CUDA_HOME=''
+
+export PATH="$CUDA_HOME/bin:$INDIC_NLP_HOME:$PATH"
+# prepend instead of overwrite, so library paths that are already set survive
+export LD_LIBRARY_PATH="$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+# set environment variable to control GPUS visible to the application
+#export CUDA_VISIBLE_DEVICES=''
diff --git a/legacy/indictrans_workflow.ipynb b/legacy/indictrans_workflow.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..7e11ad28f9f788c5311d60d44f360a215a1da8d9
--- /dev/null
+++ b/legacy/indictrans_workflow.ipynb
@@ -0,0 +1,643 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import random\n",
+ "from tqdm.notebook import tqdm\n",
+ "from sacremoses import MosesPunctNormalizer\n",
+ "from sacremoses import MosesTokenizer\n",
+ "from sacremoses import MosesDetokenizer\n",
+ "from collections import defaultdict\n",
+ "import sacrebleu"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The path to the local git repo for Indic NLP library\n",
+ "INDIC_NLP_LIB_HOME=\"\"\n",
+ "\n",
+ "# The path to the local git repo for Indic NLP Resources\n",
+ "INDIC_NLP_RESOURCES=\"\"\n",
+ "\n",
+ "import sys\n",
+ "sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))\n",
+ "\n",
+ "from indicnlp import common\n",
+ "common.set_resources_path(INDIC_NLP_RESOURCES)\n",
+ "\n",
+ "from indicnlp import loader\n",
+ "loader.load()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import indicnlp\n",
+ "from indicnlp.tokenize import indic_tokenize\n",
+ "from indicnlp.tokenize import indic_detokenize\n",
+ "from indicnlp.normalize import indic_normalize\n",
+ "from indicnlp.transliterate import unicode_transliterate"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "LANGS=[\n",
+ " \"bn\",\n",
+ " \"gu\",\n",
+ " \"hi\",\n",
+ " \"kn\",\n",
+ " \"ml\",\n",
+ " \"mr\",\n",
+ " \"or\",\n",
+ " \"pa\",\n",
+ " \"ta\",\n",
+ " \"te\", \n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def preprocess(infname,outfname,lang):\n",
+ " \"\"\"\n",
+ " Preparing each corpus file: \n",
+ " - Normalization\n",
+ " - Tokenization \n",
+ " - Script conversion to Devanagari for Indic scripts\n",
+ " \"\"\"\n",
+ " \n",
+ " ### reading \n",
+ " with open(infname,'r',encoding='utf-8') as infile, \\\n",
+ " open(outfname,'w',encoding='utf-8') as outfile:\n",
+ " \n",
+ " if lang=='en':\n",
+ " en_tok=MosesTokenizer(lang='en')\n",
+ " en_normalizer = MosesPunctNormalizer()\n",
+ " for line in tqdm(infile): \n",
+ " outline=' '.join(\n",
+ " en_tok.tokenize( \n",
+ " en_normalizer.normalize(line.strip()), \n",
+ " escape=False ) )\n",
+ " outfile.write(outline+'\\n')\n",
+ " \n",
+ " else:\n",
+ " normfactory=indic_normalize.IndicNormalizerFactory()\n",
+ " normalizer=normfactory.get_normalizer(lang)\n",
+ " for line in tqdm(infile): \n",
+ " outline=unicode_transliterate.UnicodeIndicTransliterator.transliterate(\n",
+ " ' '.join(\n",
+ " indic_tokenize.trivial_tokenize(\n",
+ " normalizer.normalize(line.strip()), lang) ), lang, 'hi').replace(' ् ','्')\n",
+ "\n",
+ "\n",
+ " outfile.write(outline+'\\n')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def add_token(sent, tag_infos):\n",
+ " \"\"\" add special tokens specified by tag_infos to each element in list\n",
+ "\n",
+ " tag_infos: list of tuples (tag_type,tag)\n",
+ "\n",
+ " each tag_info results in a token of the form: __{tag_type}__{tag}__\n",
+ "\n",
+ " \"\"\"\n",
+ "\n",
+ " tokens=[]\n",
+ " for tag_type, tag in tag_infos:\n",
+ " token = '__' + tag_type + '__' + tag + '__'\n",
+ " tokens.append(token)\n",
+ "\n",
+ " return ' '.join(tokens) + ' ' + sent \n",
+ "\n",
+ "\n",
+ "def concat_data(data_dir, outdir, lang_pair_list, out_src_lang='SRC', out_trg_lang='TGT'):\n",
+ " \"\"\"\n",
+ " data_dir: input dir, contains directories for language pairs named l1-l2\n",
+ " \"\"\"\n",
+ " os.makedirs(outdir,exist_ok=True)\n",
+ "\n",
+ " out_src_fname='{}/train.{}'.format(outdir,out_src_lang)\n",
+ " out_trg_fname='{}/train.{}'.format(outdir,out_trg_lang)\n",
+ "# out_meta_fname='{}/metadata.txt'.format(outdir)\n",
+ "\n",
+ " print()\n",
+ " print(out_src_fname)\n",
+ " print(out_trg_fname)\n",
+ "# print(out_meta_fname)\n",
+ "\n",
+ " ### concatenate train data \n",
+ " if os.path.isfile(out_src_fname):\n",
+ " os.unlink(out_src_fname)\n",
+ " if os.path.isfile(out_trg_fname):\n",
+ " os.unlink(out_trg_fname)\n",
+ "# if os.path.isfile(out_meta_fname):\n",
+ "# os.unlink(out_meta_fname)\n",
+ "\n",
+ " for src_lang, trg_lang in tqdm(lang_pair_list):\n",
+ " print('src: {}, tgt:{}'.format(src_lang,trg_lang)) \n",
+ "\n",
+ " in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)\n",
+ " in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)\n",
+ "\n",
+ " print(in_src_fname)\n",
+ " os.system('cat {} >> {}'.format(in_src_fname,out_src_fname))\n",
+ "\n",
+ " print(in_trg_fname)\n",
+ " os.system('cat {} >> {}'.format(in_trg_fname,out_trg_fname)) \n",
+ " \n",
+ " \n",
+ "# with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile: \n",
+ "# lpfile.write('\\n'.join( [ '-'.join(x) for x in lang_pair_list ] ))\n",
+ " \n",
+ " corpus_stats(data_dir, outdir, lang_pair_list)\n",
+ " \n",
+ "def corpus_stats(data_dir, outdir, lang_pair_list):\n",
+ " \"\"\"\n",
+ " data_dir: input dir, contains directories for language pairs named l1-l2\n",
+ " \"\"\"\n",
+ "\n",
+ " with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile: \n",
+ "\n",
+ " for src_lang, trg_lang in tqdm(lang_pair_list):\n",
+ " print('src: {}, tgt:{}'.format(src_lang,trg_lang)) \n",
+ "\n",
+ " in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)\n",
+ " # in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)\n",
+ "\n",
+ " print(in_src_fname)\n",
+ " corpus_size=0\n",
+ " with open(in_src_fname,'r',encoding='utf-8') as infile:\n",
+ " corpus_size=sum(map(lambda x:1,infile))\n",
+ " \n",
+ " lpfile.write('{}\\t{}\\t{}\\n'.format(src_lang,trg_lang,corpus_size))\n",
+ " \n",
+ "def generate_lang_tag_iterator(infname):\n",
+ " with open(infname,'r',encoding='utf-8') as infile:\n",
+ " for line in infile:\n",
+ " src,tgt,count=line.strip().split('\\t')\n",
+ " count=int(count)\n",
+ " for _ in range(count):\n",
+ " yield (src,tgt) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#### directory containing all experiments \n",
+ "## one directory per experiment \n",
+ "EXPBASEDIR=''\n",
+ "\n",
+ "### directory containing data\n",
+ "## contains 3 directories: train test dev\n",
+ "## train directory structure: \n",
+ "## - There is one directory for each language pair\n",
+ "## - Directory naming convention lang1-lang2 (you need another directory/softlink for lang2-lang1)\n",
+ "## - Each directory contains 6 files: {train,test,dev}.{lang1,lang2}\n",
+ "## test & dev directory structure \n",
+ "## - test: contains files {test.l1,test.l2,test.l3} - assumes parallel test files like the wat2021 dataset\n",
+ "## - dev: contains files {dev.l1,dev.l2,dev.l3} - assumes parallel dev files like the wat2021 dataset\n",
+ "## All files are tokenized\n",
+ "BASEDIR=''  # parent directory of consolidated_unique_preprocessed; was never defined (NameError)\n",
+ "ORG_DATA_DIR='{d}/consolidated_unique_preprocessed'.format(d=BASEDIR)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exp2 (M2O)\n",
+ "\n",
+ "- All *-en "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Params**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "expname='exp2_m2o_baseline'\n",
+ "expdir='{}/{}'.format(EXPBASEDIR,expname)\n",
+ "\n",
+ "lang_pair_list=[]\n",
+ "for lang in LANGS: \n",
+ " lang_pair_list.append([lang,'en'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Create Train Corpus**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "indir='{}/train'.format(ORG_DATA_DIR)\n",
+ "outdir='{}/data'.format(expdir)\n",
+ "\n",
+ "# print(lang_pair_list)\n",
+ "concat_data(indir,outdir,lang_pair_list)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Learn BPE**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!echo ./learn_bpe.sh {expdir}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!echo ./apply_bpe_train_notag.sh {expdir}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!echo ./apply_bpe_test_valid_notag.sh {expdir} {ORG_DATA_DIR} {'\"'+' '.join(LANGS+['en'])+'\"'}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Add language tags to train**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dset='train' \n",
+ "\n",
+ "src_fname='{expdir}/bpe/train/{dset}.SRC'.format(expdir=expdir,dset=dset)\n",
+ "tgt_fname='{expdir}/bpe/train/{dset}.TGT'.format(expdir=expdir,dset=dset)\n",
+ "meta_fname='{expdir}/data/lang_pairs.txt'.format(expdir=expdir,dset=dset)\n",
+ " \n",
+ "out_src_fname='{expdir}/final/{dset}.SRC'.format(expdir=expdir,dset=dset)\n",
+ "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(expdir=expdir,dset=dset)\n",
+ "\n",
+ "lang_tag_iterator=generate_lang_tag_iterator(meta_fname)\n",
+ "\n",
+ "print(expdir)\n",
+ "os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n",
+ "\n",
+ "with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n",
+ " open(tgt_fname,'r',encoding='utf-8') as tgtfile, \\\n",
+ " open(out_src_fname,'w',encoding='utf-8') as outsrcfile, \\\n",
+ " open(out_tgt_fname,'w',encoding='utf-8') as outtgtfile: \n",
+ "\n",
+ " for (l1,l2), src_sent, tgt_sent in tqdm(zip(lang_tag_iterator, srcfile, tgtfile)):\n",
+ " outsrcfile.write(add_token(src_sent.strip(),[('src',l1),('tgt',l2)]) + '\\n' )\n",
+ " outtgtfile.write(tgt_sent.strip()+'\\n')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Add language tags to valid**\n",
+ "\n",
+ "- add language tags, create parallel corpus\n",
+ "- sample 20\\% for validation set \n",
+ "- Create final validation set"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dset='dev' \n",
+ "out_src_fname='{expdir}/final/{dset}.SRC'.format(\n",
+ " expdir=expdir,dset=dset)\n",
+ "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(\n",
+ " expdir=expdir,dset=dset)\n",
+ "\n",
+ "os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n",
+ "\n",
+ "print('Processing validation files') \n",
+ "consolidated_dset=[]\n",
+ "for l1, l2 in tqdm(lang_pair_list):\n",
+ " src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
+ " expdir=expdir,dset=dset,lang=l1)\n",
+ " tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
+ " expdir=expdir,dset=dset,lang=l2)\n",
+ "# print(src_fname)\n",
+ "# print(os.path.exists(src_fname))\n",
+ " with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n",
+ " open(tgt_fname,'r',encoding='utf-8') as tgtfile:\n",
+ " for src_sent, tgt_sent in zip(srcfile,tgtfile):\n",
+ " consolidated_dset.append(\n",
+ " ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),\n",
+ " tgt_sent.strip() )\n",
+ " )\n",
+ "\n",
+ "print('Create validation set') \n",
+ "random.shuffle(consolidated_dset)\n",
+ "final_set=consolidated_dset[:len(consolidated_dset)//5] \n",
+ "\n",
+ "print('Original set size: {}'.format(len(consolidated_dset))) \n",
+ "print('Sampled set size: {}'.format(len(final_set))) \n",
+ "\n",
+ "print('Write validation set')\n",
+ "\n",
+ "with open(out_src_fname,'w',encoding='utf-8') as srcfile, \\\n",
+ " open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:\n",
+ " for src_sent, tgt_sent in final_set: \n",
+ " srcfile.write(src_sent+'\\n')\n",
+ " tgtfile.write(tgt_sent+'\\n')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Add language tags to test**\n",
+ "\n",
+ "- add language tags, create parallel corpus all M2O language pairs \n",
+ "- Create final test set"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dset='test' \n",
+ "out_src_fname='{expdir}/final/{dset}.SRC'.format(\n",
+ " expdir=expdir,dset=dset)\n",
+ "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(\n",
+ " expdir=expdir,dset=dset)\n",
+ "\n",
+ "os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n",
+ "\n",
+ "print('Processing test files') \n",
+ "consolidated_dset=[]\n",
+ "for l1, l2 in tqdm(lang_pair_list):\n",
+ " src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
+ " expdir=expdir,dset=dset,lang=l1)\n",
+ " tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
+ " expdir=expdir,dset=dset,lang=l2)\n",
+ "# print(src_fname)\n",
+ "# print(os.path.exists(src_fname))\n",
+ " with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n",
+ " open(tgt_fname,'r',encoding='utf-8') as tgtfile:\n",
+ " for src_sent, tgt_sent in zip(srcfile,tgtfile):\n",
+ " consolidated_dset.append(\n",
+ " ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),\n",
+ " tgt_sent.strip() )\n",
+ " )\n",
+ "\n",
+ "print('Final set size: {}'.format(len(consolidated_dset))) \n",
+ " \n",
+ "print('Write test set')\n",
+ "print('testset truncated')\n",
+ "\n",
+ "with open(out_src_fname,'w',encoding='utf-8') as srcfile, \\\n",
+ " open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:\n",
+ " for lno, (src_sent, tgt_sent) in enumerate(consolidated_dset,1):\n",
+ " \n",
+ " s=src_sent.strip().split(' ')\n",
+ " t=tgt_sent.strip().split(' ')\n",
+ " \n",
+ " if len(s) > 200 or len(t) > 200:\n",
+ " print('exp: {}, pair: ({},{}), lno: {}: lens: ({},{})'.format(expname,l1,l2,lno,len(s),len(t))) \n",
+ " \n",
+ " src_sent=' '.join( s[:min(len(s),200)] )\n",
+ " tgt_sent=' '.join( t[:min(len(t),200)] )\n",
+ " \n",
+ " srcfile.write(src_sent+'\\n')\n",
+ " tgtfile.write(tgt_sent+'\\n')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Binarize data**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!echo ./binarize_training_exp.sh {expdir} SRC TGT"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Training Command**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%bash \n",
+ "\n",
+ "python train.py {expdir}/final_bin \\\n",
+ " --arch transformer \\\n",
+ " --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1.0 \\\n",
+ " --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \\\n",
+ " --dropout 0.2 \\\n",
+ " --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \\\n",
+ " --max-tokens 8192 \\\n",
+ " --max-update 1000000 \\\n",
+ " --max-source-positions 200 \\\n",
+ " --max-target-positions 200 \\\n",
+ " --tensorboard-logdir {expdir}/tensorboard \\\n",
+ " --save-dir {expdir}/model \\\n",
+ " --required-batch-size-multiple 8 \\\n",
+ " --save-interval 1 \\\n",
+ " --keep-last-epochs 5 \\\n",
+ " --patience 5 \\\n",
+ " --fp16"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Cleanup**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# delete the intermediate training files of this experiment\n",
+ "\n",
+ "to_delete=[\n",
+ "    '{expdir}/data/train.SRC'.format(expdir=expdir),\n",
+ "    '{expdir}/data/train.TGT'.format(expdir=expdir),\n",
+ "    '{expdir}/bpe/train/train.SRC'.format(expdir=expdir),\n",
+ "    '{expdir}/bpe/train/train.TGT'.format(expdir=expdir),\n",
+ "]\n",
+ "\n",
+ "for fname in to_delete:\n",
+ "    os.unlink(fname)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Evaluation**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dset='test' \n",
+ "consolidated_testoutput_fname='{expdir}/evaluations/test/default/test.SRC_TGT.TGT'.format(expdir=expdir)\n",
+ "consolidated_testoutput_log_fname='{}.log'.format(consolidated_testoutput_fname)\n",
+ "metrics_fname='{expdir}/evaluations/test/default/test.metrics.tsv'.format(expdir=expdir)\n",
+ " \n",
+ "test_set_size=2390\n",
+ "\n",
+ "consolidated_testoutput=[]\n",
+ "with open(consolidated_testoutput_log_fname,'r',encoding='utf-8') as hypfile:\n",
+ " consolidated_testoutput= list(map(lambda x: x.strip(), filter(lambda x: x.startswith('H-'),hypfile) ))\n",
+ " consolidated_testoutput.sort(key=lambda x: int(x.split('\\t')[0].split('-')[1]))\n",
+ " consolidated_testoutput=[ x.split('\\t')[2] for x in consolidated_testoutput ]\n",
+ "\n",
+ "os.makedirs('{expdir}/evaluations/test/default'.format(expdir=expdir),exist_ok=True)\n",
+ "\n",
+ "with open(consolidated_testoutput_fname,'w',encoding='utf-8') as finalhypfile:\n",
+ " for sent in consolidated_testoutput:\n",
+ " finalhypfile.write(sent+'\\n')\n",
+ "\n",
+ "print('Processing test files') \n",
+ "with open(metrics_fname,'w',encoding='utf-8') as metrics_file: \n",
+ " for i, (l1, l2) in enumerate(tqdm(lang_pair_list)):\n",
+ "\n",
+ " start=i*test_set_size\n",
+ " end=(i+1)*test_set_size\n",
+ " hyps=consolidated_testoutput[start:end]\n",
+ " ref_fname='{expdir}/{dset}/{dset}.{lang}'.format(\n",
+ " expdir=ORG_DATA_DIR,dset=dset,lang=l2)\n",
+ "\n",
+ " refs=[]\n",
+ " with open(ref_fname,'r',encoding='utf-8') as reffile:\n",
+ " refs.extend(map(lambda x:x.strip(),reffile))\n",
+ "\n",
+ " assert(len(hyps)==len(refs))\n",
+ "\n",
+ " bleu=sacrebleu.corpus_bleu(hyps,[refs],tokenize='none')\n",
+ "\n",
+ " print('{} {} {} {}'.format(l1,l2,bleu.score,bleu.prec_str))\n",
+ " metrics_file.write('{}\\t{}\\t{}\\t{}\\t{}\\n'.format(expname,l1,l2,bleu.score,bleu.prec_str))\n",
+ " "
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.0"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {
+ "height": "243.993px",
+ "width": "160px"
+ },
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/legacy/install_fairseq.sh b/legacy/install_fairseq.sh
new file mode 100644
index 0000000000000000000000000000000000000000..275ab9574dabcd293a553dd50e46288d33025e7a
--- /dev/null
+++ b/legacy/install_fairseq.sh
@@ -0,0 +1,45 @@
+#NVIDIA CUDA download
+wget "https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux"
+wget "http://developer.download.nvidia.com/compute/cuda/10.0/Prod/patches/1/cuda_10.0.130.1_linux.run"
+
+## do not install drivers (See this: https://docs.nvidia.com/deploy/cuda-compatibility/index.html)
+sudo sh "cuda_10.0.130_410.48_linux"
+sudo sh "cuda_10.0.130.1_linux.run"
+
+#Set environment variables
+export CUDA_HOME=/usr/local/cuda-10.0
+export PATH=$CUDA_HOME/bin:$PATH
+export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+
+# Install pytorch 1.2
+python3 -m venv pytorch1.2
+source pytorch1.2/bin/activate
+command -v pip3
+pip3 install torch==1.2.0 torchvision==0.4.0
+
+# Install nccl (build tools first, package names matched by glob, then back to the start directory)
+sudo apt install build-essential devscripts debhelper fakeroot
+git clone https://github.com/NVIDIA/nccl.git
+cd nccl
+make src.build CUDA_HOME="$CUDA_HOME"
+make pkg.debian.build CUDA_HOME="$CUDA_HOME"
+sudo dpkg -i build/pkg/deb/libnccl2_*+cuda10.0_amd64.deb
+sudo dpkg -i build/pkg/deb/libnccl-dev_*+cuda10.0_amd64.deb
+sudo apt-get install -f
+cd ..
+# Install Apex (cloned next to nccl, not inside it)
+git clone https://github.com/NVIDIA/apex
+cd apex
+pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" \
+    --global-option="--deprecated_fused_adam" --global-option="--xentropy" \
+    --global-option="--fast_multihead_attn" ./
+cd ..
+# Install PyArrow
+pip install pyarrow
+
+# Install fairseq (editable install of the start directory -- presumably a fairseq checkout, TODO confirm)
+pip install --editable ./
+
+# Install other dependencies (no --user: pip refuses user installs inside a virtualenv)
+pip install sacrebleu
+pip install tensorboardX
diff --git a/legacy/run_inference.sh b/legacy/run_inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ff582a6c49d015cf36c82e8f20a755f6d1418ed8
--- /dev/null
+++ b/legacy/run_inference.sh
@@ -0,0 +1,80 @@
+src_lang=${1:-hi}
+tgt_lang=${2:-en}
+bucket_path=${3:-gs://ai4b-anuvaad-nmt/baselines/transformer-base/baselines-${src_lang}-${tgt_lang}}
+
+expdir=../baselines/baselines-${src_lang}-${tgt_lang}
+
+if [[ -d $expdir ]]
+then
+ echo "$expdir exists on your filesystem. Please delete this if you have made some changes to the bucket files and trying to redownload"
+else
+ mkdir -p $expdir
+ mkdir -p $expdir/model
+ cd ../baselines
+ gsutil -m cp -r $bucket_path/vocab $expdir
+ gsutil -m cp -r $bucket_path/final_bin $expdir
+ gsutil -m cp $bucket_path/model/checkpoint_best.pt $expdir/model
+ cd ../indicTrans
+fi
+
+
+if [ $src_lang == 'hi' ] || [ $tgt_lang == 'hi' ]; then
+ #TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 sap-documentation-benchmark all)
+ TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018 wmt-news )
+elif [ $src_lang == 'ta' ] || [ $tgt_lang == 'ta' ]; then
+ # TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
+ TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018 wmt-news ufal-ta)
+elif [ $src_lang == 'bn' ] || [ $tgt_lang == 'bn' ]; then
+ # TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
+ TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018)
+elif [ $src_lang == 'gu' ] || [ $tgt_lang == 'gu' ]; then
+ # TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest all)
+ TEST_SETS=( wat2021-devtest wat2020-devtest wmt-news )
+elif [ $src_lang == 'as' ] || [ $tgt_lang == 'as' ]; then
+ TEST_SETS=( pmi )
+elif [ $src_lang == 'kn' ] || [ $tgt_lang == 'kn' ]; then
+ # TEST_SETS=( wat2021-devtest anuvaad-legal all)
+ TEST_SETS=( wat2021-devtest )
+elif [ $src_lang == 'ml' ] || [ $tgt_lang == 'ml' ]; then
+ # TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all)
+ TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018)
+elif [ $src_lang == 'mr' ] || [ $tgt_lang == 'mr' ]; then
+ # TEST_SETS=( wat2021-devtest wat2020-devtest all)
+ TEST_SETS=( wat2021-devtest wat2020-devtest )
+elif [ $src_lang == 'or' ] || [ $tgt_lang == 'or' ]; then
+ TEST_SETS=( wat2021-devtest )
+elif [ $src_lang == 'pa' ] || [ $tgt_lang == 'pa' ]; then
+ TEST_SETS=( wat2021-devtest )
+elif [ $src_lang == 'te' ] || [ $tgt_lang == 'te' ]; then
+ # TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all )
+ TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018)
+fi
+
+if [ $src_lang == 'en' ]; then
+ indic_lang=$tgt_lang
+else
+ indic_lang=$src_lang
+fi
+
+
+for tset in ${TEST_SETS[@]};do
+ echo $tset $src_lang $tgt_lang
+ if [ $tset == 'wat2021-devtest' ]; then
+ SRC_FILE=${expdir}/benchmarks/$tset/test.$src_lang
+ REF_FILE=${expdir}/benchmarks/$tset/test.$tgt_lang
+ else
+ SRC_FILE=${expdir}/benchmarks/$tset/en-${indic_lang}/test.$src_lang
+ REF_FILE=${expdir}/benchmarks/$tset/en-${indic_lang}/test.$tgt_lang
+ fi
+ RESULTS_DIR=${expdir}/results/$tset
+
+ mkdir -p $RESULTS_DIR
+
+ bash translate.sh $SRC_FILE $RESULTS_DIR/${src_lang}-${tgt_lang} $src_lang $tgt_lang $expdir $REF_FILE
+ # for newline between different outputs
+ echo
+done
+# send the results to the bucket
+gsutil -m cp -r $expdir/results $bucket_path
+# clear up the space in the instance
+# rm -r $expdir
\ No newline at end of file
diff --git a/legacy/run_joint_inference.sh b/legacy/run_joint_inference.sh
new file mode 100644
index 0000000000000000000000000000000000000000..bf4668c9ecb6b1a1ef9b9b7871c6ee22d7865c0b
--- /dev/null
+++ b/legacy/run_joint_inference.sh
@@ -0,0 +1,74 @@
+src_lang=${1:-en}
+tgt_lang=${2:-indic}
+bucket_path=${3:-gs://ai4b-anuvaad-nmt/models/transformer-4x/indictrans-${src_lang}-${tgt_lang}}
+
+mkdir -p ../baselines
+expdir=../baselines/baselines-${src_lang}-${tgt_lang}
+
+if [[ -d $expdir ]]
+then
+ echo "$expdir exists on your filesystem."
+else
+ cd ../baselines
+ mkdir -p baselines-${src_lang}-${tgt_lang}/model
+ mkdir -p baselines-${src_lang}-${tgt_lang}/final_bin
+ cd baselines-${src_lang}-${tgt_lang}/model
+ gsutil -m cp $bucket_path/model/checkpoint_best.pt .
+ cd ..
+ gsutil -m cp $bucket_path/vocab .
+ gsutil -m cp $bucket_path/final_bin/dict.* final_bin
+ cd ../indicTrans
+fi
+
+
+
+
+
+if [ $src_lang == 'hi' ] || [ $tgt_lang == 'hi' ]; then
+ TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 sap-documentation-benchmark all)
+elif [ $src_lang == 'ta' ] || [ $tgt_lang == 'ta' ]; then
+ TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
+elif [ $src_lang == 'bn' ] || [ $tgt_lang == 'bn' ]; then
+ TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
+elif [ $src_lang == 'gu' ] || [ $tgt_lang == 'gu' ]; then
+ TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest all)
+elif [ $src_lang == 'as' ] || [ $tgt_lang == 'as' ]; then
+ TEST_SETS=( all )
+elif [ $src_lang == 'kn' ] || [ $tgt_lang == 'kn' ]; then
+ TEST_SETS=( wat2021-devtest anuvaad-legal all)
+elif [ $src_lang == 'ml' ] || [ $tgt_lang == 'ml' ]; then
+ TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all)
+elif [ $src_lang == 'mr' ] || [ $tgt_lang == 'mr' ]; then
+ TEST_SETS=( wat2021-devtest wat2020-devtest all)
+elif [ $src_lang == 'or' ] || [ $tgt_lang == 'or' ]; then
+ TEST_SETS=( all )
+elif [ $src_lang == 'pa' ] || [ $tgt_lang == 'pa' ]; then
+ TEST_SETS=( all )
+elif [ $src_lang == 'te' ] || [ $tgt_lang == 'te' ]; then
+ TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all )
+fi
+
+# indic_lang is the target when translating from English, otherwise the source.
+indic_lang=$src_lang
+if [ $src_lang == 'en' ]; then
+  indic_lang=$tgt_lang
+fi
+
+
+# Translate every selected benchmark; outputs go under $expdir/results/<set>.
+for tset in "${TEST_SETS[@]}"; do
+  echo "$tset $src_lang $tgt_lang"
+  # wat2021-devtest files sit directly in the devtest directory; every
+  # other set uses an en-<lang> subdirectory.
+  test_dir=${expdir}/devtest/$tset
+  if [ "$tset" != 'wat2021-devtest' ]; then
+    test_dir=$test_dir/en-${indic_lang}
+  fi
+  SRC_FILE=$test_dir/test.$src_lang
+  REF_FILE=$test_dir/test.$tgt_lang
+  RESULTS_DIR=${expdir}/results/$tset
+  mkdir -p "$RESULTS_DIR"
+  bash joint_translate.sh "$SRC_FILE" "$RESULTS_DIR/${src_lang}-${tgt_lang}" "$src_lang" "$tgt_lang" "$expdir" "$REF_FILE"
+  # for newline between different outputs
+  echo
+done
diff --git a/legacy/tpu_training_instructions.md b/legacy/tpu_training_instructions.md
new file mode 100644
index 0000000000000000000000000000000000000000..41c9092811f50188c21b459c3033a59d769be8c8
--- /dev/null
+++ b/legacy/tpu_training_instructions.md
@@ -0,0 +1,92 @@
+## Instructions to run on Google cloud TPUs
+Before starting these steps, make sure the dataset is prepared (normalization -> BPE -> ... -> binarization), either by following the steps in the indicTrans workflow or by running them on a CPU instance before launching the TPU instance (to save time and cost).
+
+### Creating TPU instance
+
+- Create a CPU instance on GCP with a `torch-xla` image, for example:
+```bash
+gcloud compute --project=${PROJECT_ID} instances create