diff --git a/IndicTrans_training.ipynb b/IndicTrans_training.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..fc895710960b0718bb3603ae35db4502721c5971 --- /dev/null +++ b/IndicTrans_training.ipynb @@ -0,0 +1,752 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FdyHSnoj7Iun", + "outputId": "d0624c60-68c4-470f-9ade-c517e3296044" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/content/training\n" + ] + } + ], + "source": [ + "# create a separate folder to store everything\n", + "!mkdir training\n", + "%cd training" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "y55OfxBz8QeP", + "outputId": "6d0ab016-0f96-4671-ddee-f06b50506dcd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloning into 'indicTrans'...\n", + "remote: Enumerating objects: 432, done.\u001b[K\n", + "remote: Counting objects: 100% (139/139), done.\u001b[K\n", + "remote: Compressing objects: 100% (34/34), done.\u001b[K\n", + "remote: Total 432 (delta 122), reused 105 (delta 105), pack-reused 293\u001b[K\n", + "Receiving objects: 100% (432/432), 1.43 MiB | 14.11 MiB/s, done.\n", + "Resolving deltas: 100% (248/248), done.\n", + "/content/training/indicTrans\n", + "Cloning into 'indic_nlp_library'...\n", + "remote: Enumerating objects: 1325, done.\u001b[K\n", + "remote: Counting objects: 100% (147/147), done.\u001b[K\n", + "remote: Compressing objects: 100% (103/103), done.\u001b[K\n", + "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n", + "Receiving objects: 100% (1325/1325), 9.57 MiB | 10.51 MiB/s, done.\n", + "Resolving deltas: 100% (688/688), 
done.\n", + "Cloning into 'indic_nlp_resources'...\n", + "remote: Enumerating objects: 133, done.\u001b[K\n", + "remote: Counting objects: 100% (7/7), done.\u001b[K\n", + "remote: Compressing objects: 100% (7/7), done.\u001b[K\n", + "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n", + "Receiving objects: 100% (133/133), 149.77 MiB | 34.05 MiB/s, done.\n", + "Resolving deltas: 100% (51/51), done.\n", + "Checking out files: 100% (28/28), done.\n", + "Cloning into 'subword-nmt'...\n", + "remote: Enumerating objects: 580, done.\u001b[K\n", + "remote: Counting objects: 100% (4/4), done.\u001b[K\n", + "remote: Compressing objects: 100% (4/4), done.\u001b[K\n", + "remote: Total 580 (delta 0), reused 1 (delta 0), pack-reused 576\u001b[K\n", + "Receiving objects: 100% (580/580), 237.41 KiB | 5.28 MiB/s, done.\n", + "Resolving deltas: 100% (349/349), done.\n", + "/content/training\n" + ] + } + ], + "source": [ + "# clone the repo for running finetuning\n", + "!git clone https://github.com/AI4Bharat/indicTrans.git\n", + "%cd indicTrans\n", + "# clone requirements repositories\n", + "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n", + "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n", + "!git clone https://github.com/rsennrich/subword-nmt.git\n", + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ziWWl-1a8SMw", + "outputId": "d7908a62-9573-4693-e7cb-44aeeebaaa15" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading package lists... Done\n", + "Building dependency tree \n", + "Reading state information... 
Done\n", + "The following NEW packages will be installed:\n", + " tree\n", + "0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.\n", + "Need to get 40.7 kB of archives.\n", + "After this operation, 105 kB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tree amd64 1.7.0-5 [40.7 kB]\n", + "Fetched 40.7 kB in 0s (133 kB/s)\n", + "debconf: unable to initialize frontend: Dialog\n", + "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)\n", + "debconf: falling back to frontend: Readline\n", + "debconf: unable to initialize frontend: Readline\n", + "debconf: (This frontend requires a controlling tty.)\n", + "debconf: falling back to frontend: Teletype\n", + "dpkg-preconfigure: unable to re-open stdin: \n", + "Selecting previously unselected package tree.\n", + "(Reading database ... 160772 files and directories currently installed.)\n", + "Preparing to unpack .../tree_1.7.0-5_amd64.deb ...\n", + "Unpacking tree (1.7.0-5) ...\n", + "Setting up tree (1.7.0-5) ...\n", + "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n", + "Collecting sacremoses\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n", + "\u001b[K |████████████████████████████████| 901kB 4.0MB/s \n", + "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", + "Collecting mock\n", + " Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n", + "Collecting sacrebleu\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n", + "\u001b[K 
|████████████████████████████████| 61kB 7.4MB/s \n", + "\u001b[?25hCollecting tensorboardX\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/42/36/2b147652c40c3a858efa0afbf7b8236fae968e88ff530511a4cfa299a506/tensorboardX-2.3-py2.py3-none-any.whl (124kB)\n", + "\u001b[K |████████████████████████████████| 133kB 24.0MB/s \n", + "\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n", + "Collecting indic-nlp-library\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n", + "\u001b[K |████████████████████████████████| 40kB 5.4MB/s \n", + "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n", + "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n", + "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n", + "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n", + "Collecting portalocker==2.0.0\n", + " Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n", + "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) 
(3.12.4)\n", + "Collecting morfessor\n", + " Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n", + "Collecting sphinx-argparse\n", + " Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n", + "Collecting sphinx-rtd-theme\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n", + "\u001b[K |████████████████████████████████| 9.2MB 21.7MB/s \n", + "\u001b[?25hRequirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n", + "Requirement already satisfied: sphinx>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx-argparse->indic-nlp-library) (1.8.5)\n", + "Collecting docutils<0.17\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n", + "\u001b[K |████████████████████████████████| 552kB 38.5MB/s \n", + "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (20.9)\n", + "Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.2.0)\n", + "Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.23.0)\n", + "Requirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.2.4)\n", + "Requirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages 
(from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.6.1)\n", + "Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.1.0)\n", + "Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.9.1)\n", + "Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (0.7.12)\n", + "Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.11.3)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.4.7)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.24.3)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (3.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (2021.5.30)\n", + "Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) (1.1.5)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx>=1.2.0->sphinx-argparse->indic-nlp-library) 
(2.0.1)\n", + "Building wheels for collected packages: sphinx-argparse\n", + " Building wheel for sphinx-argparse (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=0f3830a0bf7a6cfa99000091da945e9dd814b2f1e1f9ca5d773f99aaa0d3a4a5\n", + " Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n", + "Successfully built sphinx-argparse\n", + "\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n", + "Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, morfessor, sphinx-argparse, docutils, sphinx-rtd-theme, indic-nlp-library\n", + " Found existing installation: docutils 0.17.1\n", + " Uninstalling docutils-0.17.1:\n", + " Successfully uninstalled docutils-0.17.1\n", + "Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.3\n", + "Cloning into 'fairseq'...\n", + "remote: Enumerating objects: 28410, done.\u001b[K\n", + "remote: Counting objects: 100% (229/229), done.\u001b[K\n", + "remote: Compressing objects: 100% (127/127), done.\u001b[K\n", + "remote: Total 28410 (delta 114), reused 187 (delta 99), pack-reused 28181\u001b[K\n", + "Receiving objects: 100% (28410/28410), 11.96 MiB | 24.45 MiB/s, done.\n", + "Resolving deltas: 100% (21310/21310), done.\n", + "/content/training/fairseq\n", + "Obtaining file:///content/training/fairseq\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Preparing wheel metadata ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (2019.12.20)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (4.41.1)\n", + "Collecting omegaconf<2.1\n", + " Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n", + "Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.19.5)\n", + "Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.5.1)\n", + "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (0.29.23)\n", + "Collecting hydra-core<1.1\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n", + "\u001b[K |████████████████████████████████| 133kB 4.7MB/s \n", + "\u001b[?25hRequirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.9.0+cu102)\n", + "Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.14.5)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+f887152) (3.7.4.3)\n", + "Collecting PyYAML>=5.1.*\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n", + "\u001b[K |████████████████████████████████| 645kB 32.4MB/s \n", + "\u001b[?25hRequirement already satisfied: portalocker==2.0.0 in 
/usr/local/lib/python3.7/dist-packages (from sacrebleu>=1.4.12->fairseq==1.0.0a0+f887152) (2.0.0)\n", + "Requirement already satisfied: importlib-resources; python_version < \"3.9\" in /usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+f887152) (5.1.4)\n", + "Collecting antlr4-python3-runtime==4.8\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n", + "\u001b[K |████████████████████████████████| 112kB 53.0MB/s \n", + "\u001b[?25hRequirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+f887152) (2.20)\n", + "Requirement already satisfied: zipp>=3.1.0; python_version < \"3.10\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+f887152) (3.4.1)\n", + "Building wheels for collected packages: antlr4-python3-runtime\n", + " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=52f59bfe6322a04598da6960d2d5675a581273a45e4391e04cf1240c97346019\n", + " Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n", + "Successfully built antlr4-python3-runtime\n", + "Installing collected packages: PyYAML, omegaconf, antlr4-python3-runtime, hydra-core, fairseq\n", + " Found existing installation: PyYAML 3.13\n", + " Uninstalling PyYAML-3.13:\n", + " Successfully uninstalled PyYAML-3.13\n", + " Running setup.py develop for fairseq\n", + "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n", + "/content/training\n" + ] + } + ], + "source": [ + "! 
sudo apt install tree\n", + "\n", + "# Install the necessary libraries\n", + "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n", + "# Install fairseq from source\n", + "!git clone https://github.com/pytorch/fairseq.git\n", + "%cd fairseq\n", + "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n", + "!pip install --editable ./\n", + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tmfGYkd58UiO", + "outputId": "3b83bcf6-bbbf-4e49-c2bb-7d0fb999297d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "^C\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "--2021-12-18 21:31:57-- https://storage.googleapis.com/samanantar-public/benchmarks.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.160.144, 216.58.196.176, 142.250.71.16, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.160.144|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 7301872 (7.0M) [application/zip]\n", + "Saving to: 'benchmarks.zip'\n", + "\n", + " 0K .......... .......... .......... .......... .......... 0% 774K 9s\n", + " 50K .......... .......... .......... .......... .......... 1% 2.10M 6s\n", + " 100K .......... .......... .......... .......... .......... 2% 2.46M 5s\n", + " 150K .......... .......... .......... .......... .......... 2% 2.68M 4s\n", + " 200K .......... .......... .......... .......... .......... 3% 1.44M 4s\n", + " 250K .......... .......... .......... .......... .......... 4% 2.48M 4s\n", + " 300K .......... .......... .......... .......... .......... 4% 3.41M 4s\n", + " 350K .......... .......... .......... .......... .......... 5% 2.22M 4s\n", + " 400K .......... .......... .......... .......... .......... 6% 1.20M 4s\n", + " 450K .......... .......... .......... .......... 
.......... 7% 2.65M 4s\n", + " 500K .......... .......... .......... .......... .......... 7% 2.97M 3s\n", + " 550K .......... .......... .......... .......... .......... 8% 887K 4s\n", + " 600K .......... .......... .......... .......... .......... 9% 2.90M 4s\n", + " 650K .......... .......... .......... .......... .......... 9% 2.76M 4s\n", + " 700K .......... .......... .......... .......... .......... 10% 980K 4s\n", + " 750K .......... .......... .......... .......... .......... 11% 2.55M 4s\n", + " 800K .......... .......... .......... .......... .......... 11% 2.86M 3s\n", + " 850K .......... .......... .......... .......... .......... 12% 3.04M 3s\n", + " 900K .......... .......... .......... .......... .......... 13% 1.01M 3s\n", + " 950K .......... .......... .......... .......... .......... 14% 3.35M 3s\n", + " 1000K .......... .......... .......... .......... .......... 14% 5.04M 3s\n", + " 1050K .......... .......... .......... .......... .......... 15% 14.5M 3s\n", + " 1100K .......... .......... .......... .......... .......... 16% 1.01M 3s\n", + " 1150K .......... .......... .......... .......... .......... 16% 4.48M 3s\n", + " 1200K .......... .......... .......... .......... .......... 17% 4.34M 3s\n", + " 1250K .......... .......... .......... .......... .......... 18% 2.90M 3s\n", + " 1300K .......... .......... .......... .......... .......... 18% 1.14M 3s\n", + " 1350K .......... .......... .......... .......... .......... 19% 3.00M 3s\n", + " 1400K .......... .......... .......... .......... .......... 20% 5.09M 3s\n", + " 1450K .......... .......... .......... .......... .......... 21% 1.91M 3s\n", + " 1500K .......... .......... .......... .......... .......... 21% 7.70M 3s\n", + " 1550K .......... .......... .......... .......... .......... 22% 1.27M 3s\n", + " 1600K .......... .......... .......... .......... .......... 23% 3.06M 3s\n", + " 1650K .......... .......... .......... .......... .......... 23% 4.11M 3s\n", + " 1700K .......... 
.......... .......... .......... .......... 24% 3.34M 3s\n", + " 1750K .......... .......... .......... .......... .......... 25% 4.13M 2s\n", + " 1800K .......... .......... .......... .......... .......... 25% 7.95M 2s\n", + " 1850K .......... .......... .......... .......... .......... 26% 3.69M 2s\n", + " 1900K .......... .......... .......... .......... .......... 27% 4.00M 2s\n", + " 1950K .......... .......... .......... .......... .......... 28% 3.50M 2s\n", + " 2000K .......... .......... .......... .......... .......... 28% 4.04M 2s\n", + " 2050K .......... .......... .......... .......... .......... 29% 3.31M 2s\n", + " 2100K .......... .......... .......... .......... .......... 30% 2.49M 2s\n", + " 2150K .......... .......... .......... .......... .......... 30% 4.19M 2s\n", + " 2200K .......... .......... .......... .......... .......... 31% 5.18M 2s\n", + " 2250K .......... .......... .......... .......... .......... 32% 9.49M 2s\n", + " 2300K .......... .......... .......... .......... .......... 32% 8.67M 2s\n", + " 2350K .......... .......... .......... .......... .......... 33% 4.88M 2s\n", + " 2400K .......... .......... .......... .......... .......... 34% 4.56M 2s\n", + " 2450K .......... .......... .......... .......... .......... 35% 4.94M 2s\n", + " 2500K .......... .......... .......... .......... .......... 35% 4.38M 2s\n", + " 2550K .......... .......... .......... .......... .......... 36% 3.78M 2s\n", + " 2600K .......... .......... .......... .......... .......... 37% 4.95M 2s\n", + " 2650K .......... .......... .......... .......... .......... 37% 5.50M 2s\n", + " 2700K .......... .......... .......... .......... .......... 38% 5.23M 2s\n", + " 2750K .......... .......... .......... .......... .......... 39% 3.77M 2s\n", + " 2800K .......... .......... .......... .......... .......... 39% 10.7M 2s\n", + " 2850K .......... .......... .......... .......... .......... 40% 7.16M 2s\n", + " 2900K .......... .......... .......... 
.......... .......... 41% 5.36M 2s\n", + " 2950K .......... .......... .......... .......... .......... 42% 6.80M 1s\n", + " 3000K .......... .......... .......... .......... .......... 42% 6.57M 1s\n", + " 3050K .......... .......... .......... .......... .......... 43% 7.21M 1s\n", + " 3100K .......... .......... .......... .......... .......... 44% 6.66M 1s\n", + " 3150K .......... .......... .......... .......... .......... 44% 6.42M 1s\n", + " 3200K .......... .......... .......... .......... .......... 45% 8.02M 1s\n", + " 3250K .......... .......... .......... .......... .......... 46% 5.96M 1s\n", + " 3300K .......... .......... .......... .......... .......... 46% 5.13M 1s\n", + " 3350K .......... .......... .......... .......... .......... 47% 5.19M 1s\n", + " 3400K .......... .......... .......... .......... .......... 48% 7.64M 1s\n", + " 3450K .......... .......... .......... .......... .......... 49% 6.11M 1s\n", + " 3500K .......... .......... .......... .......... .......... 49% 4.01M 1s\n", + " 3550K .......... .......... .......... .......... .......... 50% 4.52M 1s\n", + " 3600K .......... .......... .......... .......... .......... 51% 6.72M 1s\n", + " 3650K .......... .......... .......... .......... .......... 51% 5.45M 1s\n", + " 3700K .......... .......... .......... .......... .......... 52% 4.37M 1s\n", + " 3750K .......... .......... .......... .......... .......... 53% 5.39M 1s\n", + " 3800K .......... .......... .......... .......... .......... 53% 7.40M 1s\n", + " 3850K .......... .......... .......... .......... .......... 54% 6.70M 1s\n", + " 3900K .......... .......... .......... .......... .......... 55% 5.14M 1s\n", + " 3950K .......... .......... .......... .......... .......... 56% 5.02M 1s\n", + " 4000K .......... .......... .......... .......... .......... 56% 6.70M 1s\n", + " 4050K .......... .......... .......... .......... .......... 57% 6.76M 1s\n", + " 4100K .......... .......... .......... .......... .......... 
58% 2.52M 1s\n", + " 4150K .......... .......... .......... .......... .......... 58% 887K 1s\n", + " 4200K .......... .......... .......... .......... .......... 59% 9.25M 1s\n", + " 4250K .......... .......... .......... .......... .......... 60% 1.27M 1s\n", + " 4300K .......... .......... .......... .......... .......... 61% 5.72M 1s\n", + " 4350K .......... .......... .......... .......... .......... 61% 4.48M 1s\n", + " 4400K .......... .......... .......... .......... .......... 62% 5.20M 1s\n", + " 4450K .......... .......... .......... .......... .......... 63% 6.21M 1s\n", + " 4500K .......... .......... .......... .......... .......... 63% 7.94M 1s\n", + " 4550K .......... .......... .......... .......... .......... 64% 4.76M 1s\n", + " 4600K .......... .......... .......... .......... .......... 65% 4.74M 1s\n", + " 4650K .......... .......... .......... .......... .......... 65% 6.94M 1s\n", + " 4700K .......... .......... .......... .......... .......... 66% 5.62M 1s\n", + " 4750K .......... .......... .......... .......... .......... 67% 4.44M 1s\n", + " 4800K .......... .......... .......... .......... .......... 68% 6.02M 1s\n", + " 4850K .......... .......... .......... .......... .......... 68% 6.61M 1s\n", + " 4900K .......... .......... .......... .......... .......... 69% 3.04M 1s\n", + " 4950K .......... .......... .......... .......... .......... 70% 5.34M 1s\n", + " 5000K .......... .......... .......... .......... .......... 70% 3.03M 1s\n", + " 5050K .......... .......... .......... .......... .......... 71% 19.8M 1s\n", + " 5100K .......... .......... .......... .......... .......... 72% 6.17M 1s\n", + " 5150K .......... .......... .......... .......... .......... 72% 5.58M 1s\n", + " 5200K .......... .......... .......... .......... .......... 73% 7.38M 1s\n", + " 5250K .......... .......... .......... .......... .......... 74% 7.11M 1s\n", + " 5300K .......... .......... .......... .......... .......... 
75% 6.24M 1s\n", + " 5350K .......... .......... .......... .......... .......... 75% 4.62M 1s\n", + " 5400K .......... .......... .......... .......... .......... 76% 7.64M 0s\n", + " 5450K .......... .......... .......... .......... .......... 77% 6.06M 0s\n", + " 5500K .......... .......... .......... .......... .......... 77% 5.56M 0s\n", + " 5550K .......... .......... .......... .......... .......... 78% 2.96M 0s\n", + " 5600K .......... .......... .......... .......... .......... 79% 6.17M 0s\n", + " 5650K .......... .......... .......... .......... .......... 79% 9.58M 0s\n", + " 5700K .......... .......... .......... .......... .......... 80% 2.58M 0s\n", + " 5750K .......... .......... .......... .......... .......... 81% 4.23M 0s\n", + " 5800K .......... .......... .......... .......... .......... 82% 5.70M 0s\n", + " 5850K .......... .......... .......... .......... .......... 82% 4.72M 0s\n", + " 5900K .......... .......... .......... .......... .......... 83% 6.52M 0s\n", + " 5950K .......... .......... .......... .......... .......... 84% 5.86M 0s\n", + " 6000K .......... .......... .......... .......... .......... 84% 5.22M 0s\n", + " 6050K .......... .......... .......... .......... .......... 85% 5.50M 0s\n", + " 6100K .......... .......... .......... .......... .......... 86% 6.29M 0s\n", + " 6150K .......... .......... .......... .......... .......... 86% 6.93M 0s\n", + " 6200K .......... .......... .......... .......... .......... 87% 5.50M 0s\n", + " 6250K .......... .......... .......... .......... .......... 88% 5.82M 0s\n", + " 6300K .......... .......... .......... .......... .......... 89% 6.76M 0s\n", + " 6350K .......... .......... .......... .......... .......... 89% 3.73M 0s\n", + " 6400K .......... .......... .......... .......... .......... 90% 5.98M 0s\n", + " 6450K .......... .......... .......... .......... .......... 91% 5.78M 0s\n", + " 6500K .......... .......... .......... .......... .......... 
91% 5.60M 0s\n", + " 6550K .......... .......... .......... .......... .......... 92% 4.84M 0s\n", + " 6600K .......... .......... .......... .......... .......... 93% 7.25M 0s\n", + " 6650K .......... .......... .......... .......... .......... 93% 2.60M 0s\n", + " 6700K .......... .......... .......... .......... .......... 94% 6.02M 0s\n", + " 6750K .......... .......... .......... .......... .......... 95% 6.57M 0s\n", + " 6800K .......... .......... .......... .......... .......... 96% 8.30M 0s\n", + " 6850K .......... .......... .......... .......... .......... 96% 14.4M 0s\n", + " 6900K .......... .......... .......... .......... .......... 97% 4.58M 0s\n", + " 6950K .......... .......... .......... .......... .......... 98% 3.31M 0s\n", + " 7000K .......... .......... .......... .......... .......... 98% 6.88M 0s\n", + " 7050K .......... .......... .......... .......... .......... 99% 4.40M 0s\n", + " 7100K .......... .......... .......... 100% 15.1M=1.9s\n", + "\n", + "2021-12-18 21:32:01 (3.64 MB/s) - 'benchmarks.zip' saved [7301872/7301872]\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: samanatar-en-indic-v0.2.zip\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " End-of-central-directory signature not found. Either this file is not\n", + " a zipfile, or it constitutes one disk of a multi-part archive. 
In the\n", + " latter case the central directory and zipfile comment will be found on\n", + " the last disk(s) of this archive.\n", + "unzip: cannot find zipfile directory in one of samanatar-en-indic-v0.2.zip or\n", + " samanatar-en-indic-v0.2.zip.zip, and cannot find samanatar-en-indic-v0.2.zip.ZIP, period.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: benchmarks.zip\n", + " creating: benchmarks/\n", + " creating: benchmarks/pmi/\n", + " creating: benchmarks/pmi/en-as/\n", + " inflating: benchmarks/pmi/en-as/dev.as \n", + " inflating: benchmarks/pmi/en-as/dev.en \n", + " inflating: benchmarks/pmi/en-as/test.as \n", + " inflating: benchmarks/pmi/en-as/test.en \n", + " creating: benchmarks/wat2021-devtest/\n", + " inflating: benchmarks/wat2021-devtest/dev.gu \n", + " inflating: benchmarks/wat2021-devtest/dev.en \n", + " inflating: benchmarks/wat2021-devtest/test.bn \n", + " inflating: benchmarks/wat2021-devtest/dev.bn \n", + " inflating: benchmarks/wat2021-devtest/test.hi \n", + " inflating: benchmarks/wat2021-devtest/dev.kn \n", + " inflating: benchmarks/wat2021-devtest/dev.ta \n", + " inflating: benchmarks/wat2021-devtest/test.pa \n", + " inflating: benchmarks/wat2021-devtest/test.en \n", + " inflating: benchmarks/wat2021-devtest/test.mr \n", + " inflating: benchmarks/wat2021-devtest/test.kn \n", + " inflating: benchmarks/wat2021-devtest/dev.ml \n", + " inflating: benchmarks/wat2021-devtest/test.ta \n", + " inflating: benchmarks/wat2021-devtest/test.gu \n", + " inflating: benchmarks/wat2021-devtest/dev.or \n", + " inflating: benchmarks/wat2021-devtest/test.or \n", + " inflating: benchmarks/wat2021-devtest/test.te \n", + " inflating: benchmarks/wat2021-devtest/dev.mr \n", + " inflating: benchmarks/wat2021-devtest/test.ml \n", + " inflating: benchmarks/wat2021-devtest/dev.pa \n", + " inflating: benchmarks/wat2021-devtest/dev.te \n", + " inflating: benchmarks/wat2021-devtest/dev.hi \n", + " creating: 
benchmarks/wat2020-devtest/\n", + " creating: benchmarks/wat2020-devtest/en-bn/\n", + " inflating: benchmarks/wat2020-devtest/en-bn/dev.en \n", + " inflating: benchmarks/wat2020-devtest/en-bn/test.bn \n", + " inflating: benchmarks/wat2020-devtest/en-bn/dev.bn \n", + " inflating: benchmarks/wat2020-devtest/en-bn/test.en \n", + " creating: benchmarks/wat2020-devtest/en-ta/\n", + " inflating: benchmarks/wat2020-devtest/en-ta/dev.en \n", + " inflating: benchmarks/wat2020-devtest/en-ta/dev.ta \n", + " inflating: benchmarks/wat2020-devtest/en-ta/test.en \n", + " inflating: benchmarks/wat2020-devtest/en-ta/test.ta \n", + " creating: benchmarks/wat2020-devtest/en-mr/\n", + " inflating: benchmarks/wat2020-devtest/en-mr/dev.en \n", + " inflating: benchmarks/wat2020-devtest/en-mr/test.en \n", + " inflating: benchmarks/wat2020-devtest/en-mr/test.mr \n", + " inflating: benchmarks/wat2020-devtest/en-mr/dev.mr \n", + " creating: benchmarks/wat2020-devtest/en-te/\n", + " inflating: benchmarks/wat2020-devtest/en-te/dev.en \n", + " inflating: benchmarks/wat2020-devtest/en-te/test.en \n", + " inflating: benchmarks/wat2020-devtest/en-te/test.te \n", + " inflating: benchmarks/wat2020-devtest/en-te/dev.te \n", + " creating: benchmarks/wat2020-devtest/en-hi/\n", + " inflating: benchmarks/wat2020-devtest/en-hi/dev.en \n", + " inflating: benchmarks/wat2020-devtest/en-hi/test.hi \n", + " inflating: benchmarks/wat2020-devtest/en-hi/test.en \n", + " inflating: benchmarks/wat2020-devtest/en-hi/dev.hi \n", + " creating: benchmarks/wat2020-devtest/en-gu/\n", + " inflating: benchmarks/wat2020-devtest/en-gu/dev.gu \n", + " inflating: benchmarks/wat2020-devtest/en-gu/dev.en \n", + " inflating: benchmarks/wat2020-devtest/en-gu/test.en \n", + " inflating: benchmarks/wat2020-devtest/en-gu/test.gu \n", + " creating: benchmarks/wat2020-devtest/en-ml/\n", + " inflating: benchmarks/wat2020-devtest/en-ml/dev.en \n", + " inflating: benchmarks/wat2020-devtest/en-ml/test.en \n", + " inflating: 
benchmarks/wat2020-devtest/en-ml/dev.ml \n", + " inflating: benchmarks/wat2020-devtest/en-ml/test.ml \n", + " creating: benchmarks/ufal-ta/\n", + " creating: benchmarks/ufal-ta/en-ta/\n", + " inflating: benchmarks/ufal-ta/en-ta/dev.en \n", + " inflating: benchmarks/ufal-ta/en-ta/dev.ta \n", + " inflating: benchmarks/ufal-ta/en-ta/test.en \n", + " inflating: benchmarks/ufal-ta/en-ta/test.ta \n", + " creating: benchmarks/wmt-news/\n", + " creating: benchmarks/wmt-news/en-ta/\n", + " inflating: benchmarks/wmt-news/en-ta/dev.en \n", + " inflating: benchmarks/wmt-news/en-ta/dev.ta \n", + " inflating: benchmarks/wmt-news/en-ta/test.en \n", + " inflating: benchmarks/wmt-news/en-ta/test.ta \n", + " creating: benchmarks/wmt-news/en-hi/\n", + " inflating: benchmarks/wmt-news/en-hi/dev.en \n", + " inflating: benchmarks/wmt-news/en-hi/test.hi \n", + " inflating: benchmarks/wmt-news/en-hi/test.en \n", + " inflating: benchmarks/wmt-news/en-hi/dev.hi \n", + " creating: benchmarks/wmt-news/en-gu/\n", + " inflating: benchmarks/wmt-news/en-gu/test.en \n", + " inflating: benchmarks/wmt-news/en-gu/test.gu \n" + ] + } + ], + "source": [ + "## for the latest samanantar dataset v0.3 -> please use this link: https://storage.googleapis.com/samanantar-public/V0.3/source_wise_splits.zip\n", + "# This v0.3 dataset has source wise splits to indicate where the data has been collected from\n", + "# For preprocessing simplicity we will use v0.2( which just uses raw text files without source information) in this tutorial\n", + "# \n", + "# \n", + "# lets now download the indictrans data v0.2 dataset\n", + "! wget https://storage.googleapis.com/samanantar-public/V0.2/data/en2indic/samanatar-en-indic-v0.2.zip\n", + "\n", + "\n", + "\n", + "# lets also download the benchmarks for dev and test set\n", + "\n", + "! 
wget https://storage.googleapis.com/samanantar-public/benchmarks.zip\n", + "\n", + "# training data is organized as en-X folders where each folder contains two text files containing parallel data for en-X lang pair.\n", + "\n", + "# final_data\n", + "# ├── en-as\n", + "# │ ├── train.as\n", + "# │ └── train.en\n", + "# ├── en-bn\n", + "# │ ├── train.bn\n", + "# │ └── train.en\n", + "# ├── en-gu\n", + "# │ ├── train.en\n", + "# │ └── train.gu\n", + "# ├── en-hi\n", + "# │ ├── train.en\n", + "# │ └── train.hi\n", + "# ├── en-kn\n", + "# │ ├── train.en\n", + "# │ └── train.kn\n", + "# ├── en-ml\n", + "# │ ├── train.en\n", + "# │ └── train.ml\n", + "# ├── en-mr\n", + "# │ ├── train.en\n", + "# │ └── train.mr\n", + "# ├── en-or\n", + "# │ ├── train.en\n", + "# │ └── train.or\n", + "# ├── en-pa\n", + "# │ ├── train.en\n", + "# │ └── train.pa\n", + "# ├── en-ta\n", + "# │ ├── train.en\n", + "# │ └── train.ta\n", + "# └── en-te\n", + "# ├── train.en\n", + "# └── train.te\n", + "\n", + "\n", + "! unzip samanatar-en-indic-v0.2.zip\n", + "\n", + "# benchmarks folder consists of all the benchmarks we report in the paper - pmi, ufal-ta, wat2020, wat2021, wmt-news\n", + "\n", + "! unzip benchmarks.zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MR_2GQoa84Jn" + }, + "outputs": [], + "source": [ + "# create an experiment dir to store train data, devtest data. \n", + "# This folder will also store vocabulary files (created with subword_nmt for bpe), fairseq bin files (for training), model checkpoints.\n", + "\n", + "# for this example we will be training indic to en translation model. We will name our exp_dir as indic-en-exp\n", + "! mkdir indic-en-exp\n", + "# copying all the train folders to exp_dir\n", + "! cp -r final_data/* indic-en-exp\n", + "\n", + "! mkdir -p indic-en-exp/devtest\n", + "\n", + "# copying all benchmarks to devtest folder in exp_dir\n", + "! 
cp -r benchmarks/* indic-en-exp/devtest\n", + "\n", + "# folder to store combined devtest data (based on the domains you want to test, you can combine multiple benchmarks dev datasets, remove duplicates)\n", + "! mkdir -p indic-en-exp/devtest/all\n", + "\n", + "# in this tutorial, for simplicity, we will just use wat2020 devtest for dev and test set\n", + "! cp -r indic-en-exp/devtest/wat2020-devtest/* indic-en-exp/devtest/all\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lorcT8wkFPtQ" + }, + "outputs": [], + "source": [ + "% cd indicTrans" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vhvYXUc1FaVn" + }, + "outputs": [], + "source": [ + "# prepare_data_joint_training.sh takes experiment dir, src_lang, tgt_lang as input \n", + "# This does preprocessing, building vocab, binarization for joint training\n", + "\n", + "# The learning and applying vocabulary will take a while if the dataset is huge. To make it faster, run it on a multicore system\n", + "\n", + "! bash prepare_data_joint_training.sh '../indic-en-exp' 'indic' 'en'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "p1i3fRQzF2-x" + }, + "outputs": [], + "source": [ + "# Training the model\n", + "\n", + "# pls refer to fairseq documentaion to know more about each of these options (https://fairseq.readthedocs.io/en/latest/command_line_tools.html)\n", + "\n", + "\n", + "# some notable args:\n", + "# --max-updates -> maximum update steps the model will be trained for\n", + "# --arch=transformer_4x -> we use a custom transformer model and name it transformer_4x (4 times the parameter size of transformer base)\n", + "# --user_dir -> we define the custom transformer arch in model_configs folder and pass it as an argument to user_dir for fairseq to register this architechture\n", + "# --lr -> learning rate. 
From our limited experiments, we find that lower learning rates like 3e-5 works best for finetuning.\n", + "# --max_tokens -> this is max tokens per batch. You should limit to lower values if you get oom errors.\n", + "# --update-freq -> gradient accumulation steps\n", + "\n", + "\n", + "!( fairseq-train ../indic-en-exp/final_bin \\\n", + "--max-source-positions=210 \\\n", + "--max-target-positions=210 \\\n", + "--max-update= \\\n", + "--save-interval=1 \\\n", + "--arch=transformer_4x \\\n", + "--criterion=label_smoothed_cross_entropy \\\n", + "--source-lang=SRC \\\n", + "--lr-scheduler=inverse_sqrt \\\n", + "--target-lang=TGT \\\n", + "--label-smoothing=0.1 \\\n", + "--optimizer adam \\\n", + "--adam-betas \"(0.9, 0.98)\" \\\n", + "--clip-norm 1.0 \\\n", + "--warmup-init-lr 1e-07 \\\n", + "--lr 0.0005 \\\n", + "--warmup-updates 4000 \\\n", + "--dropout 0.2 \\\n", + "--save-dir ../indic-en-exp/model \\\n", + "--keep-last-epochs 5 \\\n", + "--patience 5 \\\n", + "--skip-invalid-size-inputs-valid-test \\\n", + "--fp16 \\\n", + "--user-dir model_configs \\\n", + "--wandb-project \\\n", + "--update-freq= \\\n", + "--distributed-world-size \\\n", + "--max-tokens )" + ] + } + ], + "metadata": { + "colab": { + "authorship_tag": "ABX9TyO6AA5gXphZ5kJ6h+dgeSqb", + "collapsed_sections": [], + "include_colab_link": true, + "name": "IndicTrans_training.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..3895630adb00ec0b8e12a876031e94862af65abe --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Gowtham.R 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/api.py b/api.py new file mode 100644 index 0000000000000000000000000000000000000000..601dd5ec161baa5d3041be111a0d83dd6f9073c3 --- /dev/null +++ b/api.py @@ -0,0 +1,86 @@ +import time + +from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils +from inference.engine import Model +from flask import Flask, request +from flask import jsonify +from flask_cors import CORS, cross_origin +import webvtt +from io import StringIO + + +app = Flask(__name__) +cors = CORS(app) +app.config['CORS_HEADERS'] = 'Content-Type' + +indic2en_model = Model(expdir='../models/v3/indic-en') +en2indic_model = Model(expdir='../models/v3/en-indic') +m2m_model = Model(expdir='../models/m2m') + +language_dict = { + 'Assamese': 'as', + 'Hindi' : 'hi', + 'Marathi' : 'mr', + 'Tamil' : 'ta', + 'Bengali' : 'bn', + 'Kannada' : 'kn', + 'Oriya' : 'or', + 'Telugu' : 'te', + 'Gujarati' : 'gu', + 'Malayalam' : 'ml', + 'Punjabi' : 'pa', +} + +def get_inference_params(): + model_type = request.form['model_type'] + source_language = request.form['source_language'] + target_language = request.form['target_language'] + + if model_type == 'indic-en': + model = indic2en_model + source_lang = language_dict[source_language] + assert target_language == 'English' + target_lang = 'en' + elif model_type == 'en-indic': + model = en2indic_model + assert source_language == 'English' + source_lang = 'en' + target_lang = language_dict[target_language] + elif model_type == 'm2m': + model = m2m_model + source_lang = language_dict[source_language] + target_lang = language_dict[target_language] + + return model, source_lang, target_lang + +@app.route('/', methods=['GET']) +def main(): + return "IndicTrans API" + +@app.route("/translate", methods=['POST']) +@cross_origin() +def infer_indic_en(): + model, source_lang, target_lang = get_inference_params() + source_text = request.form['text'] + + start_time = time.time() + target_text = model.translate_paragraph(source_text, source_lang, 
target_lang) + end_time = time.time() + return {'text':target_text, 'duration':round(end_time-start_time, 2)} + +@app.route("/translate_vtt", methods=['POST']) +@cross_origin() +def infer_vtt_indic_en(): + model, source_lang, target_lang = get_inference_params() + source_text = request.form['text'] + captions = webvtt.read_buffer(StringIO(source_text)) + source_sentences = [caption.text.replace('\r', '').replace('\n', ' ') for caption in captions] + + start_time = time.time() + target_sentences = model.batch_translate(source_sentences, source_lang, target_lang) + end_time = time.time() + + for i in range(len(target_sentences)): + captions[i].text = target_sentences[i] + + return {'text': captions.content, 'duration':round(end_time-start_time, 2)} diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..5f7cf6933c43c2d0f4245de8b22539995e0c52d1 --- /dev/null +++ b/app.py @@ -0,0 +1,36 @@ +import os +#import gradio as gr + +os.system('wget -q https://storage.googleapis.com/vakyaansh-open-models/translation_models/en-indic.zip') +os.system('unzip /home/user/app/en-indic.zip') +os.system('pip uninstall -y numpy') +os.system('pip install numpy') +#os.system('pip uninstall -y numba') +#os.system('pip install numba==0.53') + +from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils +import gradio as grd +from inference.engine import Model +indic2en_model = Model(expdir='en-indic') + +INDIC = {"Assamese": "as", "Bengali": "bn", "Gujarati": "gu", "Hindi": "hi","Kannada": "kn","Malayalam": "ml", "Marathi": "mr", "Odia": "or","Punjabi": "pa","Tamil": "ta", "Telugu" : "te"} + + +def translate(text, lang): + return indic2en_model.translate_paragraph(text, 'en', INDIC[lang]) + + + +languages = list(INDIC.keys()) + +#print(translate('helo how are you')) +ddwn = grd.inputs.Dropdown(languages, type="value", default="Hindi", label="Select Target Language") +txt = grd.inputs.Textbox( lines=5, placeholder="Enter Text to 
translate", default="", label="Enter Text in English") +txt_ouptut = grd.outputs.Textbox(type="auto", label="Translated text in Target Language") + +example=[['I want to translate this sentence in Hindi','Hindi'], + ['I am feeling very good today.', 'Bengali']] + +supp = ','.join(languages) +iface = grd.Interface(fn=translate, inputs=[txt,ddwn] , outputs=txt_ouptut, title='Translation for 11 Indic Languages', description = 'This is a demo based on IndicTrans. Languages Supported: '+supp, article = 'Original repo [link](https://github.com/AI4Bharat/indicTrans) by AI4Bharat. Note: This space can only perform translation from English to Indic languages. Support for other combinations will be provided soon.', examples=example) +iface.launch(enable_queue=True) diff --git a/apply_bpe_traindevtest_notag.sh b/apply_bpe_traindevtest_notag.sh new file mode 100644 index 0000000000000000000000000000000000000000..a3bd22677f2d9082f82052a1831139ea3d855cd5 --- /dev/null +++ b/apply_bpe_traindevtest_notag.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +expdir=$1 # EXPDIR + +SUBWORD_NMT_DIR="subword-nmt" + +data_dir="$expdir/data" +mkdir -p $expdir/bpe + +for dset in `echo train dev test` +do + echo $dset + in_dset_dir="$data_dir/$dset" + out_dset_dir="$expdir/bpe/$dset" + # out_dset_dir="$expdir/final/$dset" + echo "Apply joint vocab to SRC corpus" + # for very large datasets, use gnu-parallel to speed up applying bpe + # uncomment the below line if the apply bpe is slow + + # parallel --pipe --keep-order \ + python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \ + -c $expdir/vocab/bpe_codes.32k.SRC_TGT \ + --vocabulary $expdir/vocab/vocab.SRC \ + --vocabulary-threshold 5 \ + --num-workers "-1" \ + < $in_dset_dir.SRC \ + > $out_dset_dir.SRC + echo "Apply joint vocab to TGT corpus" + + # for very large datasets, use gnu-parallel to speed up applying bpe + # uncomment the below line if the apply bpe is slow + + # parallel --pipe --keep-order \ + python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \ + 
-c $expdir/vocab/bpe_codes.32k.SRC_TGT \ + --vocabulary $expdir/vocab/vocab.TGT \ + --vocabulary-threshold 5 \ + --num-workers "-1" \ + < $in_dset_dir.TGT \ + > $out_dset_dir.TGT +done diff --git a/apply_single_bpe_traindevtest_notag.sh b/apply_single_bpe_traindevtest_notag.sh new file mode 100644 index 0000000000000000000000000000000000000000..010ba2483f8f66f957f2f5ae6b9d22a5458792f8 --- /dev/null +++ b/apply_single_bpe_traindevtest_notag.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +expdir=$1 # EXPDIR + +SUBWORD_NMT_DIR="subword-nmt" + +data_dir="$expdir/data" +mkdir -p $expdir/bpe + +for dset in `echo train dev test` +do + echo $dset + in_dset_dir="$data_dir/$dset" + out_dset_dir="$expdir/bpe/$dset" + # out_dset_dir="$expdir/final/$dset" + echo "Apply to SRC corpus" + # for very large datasets, use gnu-parallel to speed up applying bpe + # uncomment the below line if the apply bpe is slow + + # parallel --pipe --keep-order \ + python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \ + -c $expdir/vocab/bpe_codes.32k.SRC \ + --vocabulary $expdir/vocab/vocab.SRC \ + --vocabulary-threshold 5 \ + --num-workers "-1" \ + < $in_dset_dir.SRC \ + > $out_dset_dir.SRC + echo "Apply to TGT corpus" + # for very large datasets, use gnu-parallel to speed up applying bpe + # uncomment the below line if the apply bpe is slow + + # parallel --pipe --keep-order \ + python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \ + -c $expdir/vocab/bpe_codes.32k.TGT \ + --vocabulary $expdir/vocab/vocab.TGT \ + --vocabulary-threshold 5 \ + --num-workers "-1" \ + < $in_dset_dir.TGT \ + > $out_dset_dir.TGT +done diff --git a/binarize_training_exp.sh b/binarize_training_exp.sh new file mode 100755 index 0000000000000000000000000000000000000000..52e74449df27835ceab7489ce2c8ea3b5feaaf4b --- /dev/null +++ b/binarize_training_exp.sh @@ -0,0 +1,24 @@ +#/bin/bash + +exp_dir=$1 +src_lang=$2 +tgt_lang=$3 + +# use cpu_count to get num_workers instead of setting it manually when running in different +# instances 
+num_workers=`python -c "import multiprocessing; print(multiprocessing.cpu_count())"` + +data_dir=$exp_dir/final +out_data_dir=$exp_dir/final_bin + +rm -rf $out_data_dir + +fairseq-preprocess \ + --source-lang $src_lang --target-lang $tgt_lang \ + --trainpref $data_dir/train \ + --validpref $data_dir/dev \ + --testpref $data_dir/test \ + --destdir $out_data_dir \ + --workers $num_workers \ + --thresholdtgt 5 \ + --thresholdsrc 5 diff --git a/compute_bleu.sh b/compute_bleu.sh new file mode 100644 index 0000000000000000000000000000000000000000..b8b55325ff183fce3b59b5b5319dbc1b9c438d1c --- /dev/null +++ b/compute_bleu.sh @@ -0,0 +1,28 @@ +pred_fname=$1 +ref_fname=$2 +src_lang=$3 +tgt_lang=$4 + +# we compute and report tokenized bleu scores. +# For computing BLEU scores, systems should output detokenized outputs. Your MT system might be doing it out of the box if you are using SentencePiece - nothing to do in that case. +# If you are using BPE then: +# 1. For English, you can use MosesDetokenizer (either the scripts in moses or the sacremoses python package) +# 2. For Indian languages, you can use the IndicNLP library detokenizer (note: please don't skip this step, since detok/tokenizer are not guaranteed to be reversible**. +# ^ both 1. and 2. are scripts/postprocess_translate.py + + +# For computing BLEU, we use sacrebleu: +# For English output: sacrebleu reffile < outputfile. This internally tokenizes using mteval-v13a +# For Indian language output, we need tokenized output and reference since we don't know how well the sacrebleu tokenizer works for Indic input. 
+# Hence we tokenize both preds and target files with IndicNLP tokenizer and then run: sacrebleu --tokenize none reffile < outputfile +if [ $tgt_lang == 'en' ]; then + # indic to en models + sacrebleu $ref_fname < $pred_fname +else + # indicnlp tokenize predictions and reference files before evaluation + input_size=`python scripts/preprocess_translate.py $ref_fname $ref_fname.tok $tgt_lang` + input_size=`python scripts/preprocess_translate.py $pred_fname $pred_fname.tok $tgt_lang` + + # since we are tokenizing with indicnlp separately, we are setting tokenize to none here + sacrebleu --tokenize none $ref_fname.tok < $pred_fname.tok +fi \ No newline at end of file diff --git a/indicTrans_Finetuning.ipynb b/indicTrans_Finetuning.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..007a436d87b5ae1fc36663ed25ee66e1cb5aa150 --- /dev/null +++ b/indicTrans_Finetuning.ipynb @@ -0,0 +1,1849 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rE4MO-8bDtwD", + "outputId": "e54447b4-2b04-44c4-96a2-a79e7ed014ae" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/content/finetuning\n" + ] + } + ], + "source": [ + "# create a seperate folder to store everything\n", + "!mkdir finetuning\n", + "%cd finetuning" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-2Rs6_WkD_gF", + "outputId": "95d19041-0e73-406c-a3c2-c7bddbfda916" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloning into 'indicTrans'...\n", + "remote: Enumerating objects: 398, done.\u001b[K\n", + "remote: Counting objects: 100% (398/398), done.\u001b[K\n", + "remote: Compressing objects: 100% 
(267/267), done.\u001b[K\n", + "remote: Total 398 (delta 231), reused 251 (delta 126), pack-reused 0\u001b[K\n", + "Receiving objects: 100% (398/398), 1.41 MiB | 17.84 MiB/s, done.\n", + "Resolving deltas: 100% (231/231), done.\n", + "/content/finetuning/indicTrans\n", + "Cloning into 'indic_nlp_library'...\n", + "remote: Enumerating objects: 1325, done.\u001b[K\n", + "remote: Counting objects: 100% (147/147), done.\u001b[K\n", + "remote: Compressing objects: 100% (103/103), done.\u001b[K\n", + "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n", + "Receiving objects: 100% (1325/1325), 9.57 MiB | 14.30 MiB/s, done.\n", + "Resolving deltas: 100% (688/688), done.\n", + "Cloning into 'indic_nlp_resources'...\n", + "remote: Enumerating objects: 133, done.\u001b[K\n", + "remote: Counting objects: 100% (7/7), done.\u001b[K\n", + "remote: Compressing objects: 100% (7/7), done.\u001b[K\n", + "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n", + "Receiving objects: 100% (133/133), 149.77 MiB | 35.48 MiB/s, done.\n", + "Resolving deltas: 100% (51/51), done.\n", + "Cloning into 'subword-nmt'...\n", + "remote: Enumerating objects: 580, done.\u001b[K\n", + "remote: Counting objects: 100% (4/4), done.\u001b[K\n", + "remote: Compressing objects: 100% (4/4), done.\u001b[K\n", + "remote: Total 580 (delta 0), reused 0 (delta 0), pack-reused 576\u001b[K\n", + "Receiving objects: 100% (580/580), 237.41 KiB | 18.26 MiB/s, done.\n", + "Resolving deltas: 100% (349/349), done.\n", + "/content/finetuning\n" + ] + } + ], + "source": [ + "# clone the repo for running finetuning\n", + "!git clone https://github.com/AI4Bharat/indicTrans.git\n", + "%cd indicTrans\n", + "# clone requirements repositories\n", + "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n", + "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n", + "!git clone https://github.com/rsennrich/subword-nmt.git\n", + "%cd .." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "duwTvJ9xEBJ1", + "outputId": "98445af3-041d-415d-97f3-a322939260e4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading package lists... Done\n", + "Building dependency tree \n", + "Reading state information... Done\n", + "The following NEW packages will be installed:\n", + " tree\n", + "0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.\n", + "Need to get 40.7 kB of archives.\n", + "After this operation, 105 kB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tree amd64 1.7.0-5 [40.7 kB]\n", + "Fetched 40.7 kB in 0s (121 kB/s)\n", + "debconf: unable to initialize frontend: Dialog\n", + "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)\n", + "debconf: falling back to frontend: Readline\n", + "debconf: unable to initialize frontend: Readline\n", + "debconf: (This frontend requires a controlling tty.)\n", + "debconf: falling back to frontend: Teletype\n", + "dpkg-preconfigure: unable to re-open stdin: \n", + "Selecting previously unselected package tree.\n", + "(Reading database ... 
160772 files and directories currently installed.)\n", + "Preparing to unpack .../tree_1.7.0-5_amd64.deb ...\n", + "Unpacking tree (1.7.0-5) ...\n", + "Setting up tree (1.7.0-5) ...\n", + "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n", + "Collecting sacremoses\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n", + "\u001b[K |████████████████████████████████| 901kB 30.0MB/s \n", + "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", + "Collecting mock\n", + " Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n", + "Collecting sacrebleu\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n", + "\u001b[K |████████████████████████████████| 61kB 9.1MB/s \n", + "\u001b[?25hCollecting tensorboardX\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/07/84/46421bd3e0e89a92682b1a38b40efc22dafb6d8e3d947e4ceefd4a5fabc7/tensorboardX-2.2-py2.py3-none-any.whl (120kB)\n", + "\u001b[K |████████████████████████████████| 122kB 58.2MB/s \n", + "\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n", + "Collecting indic-nlp-library\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n", + "\u001b[K |████████████████████████████████| 40kB 6.3MB/s \n", + "\u001b[?25hRequirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) 
(7.1.2)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n", + "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n", + "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n", + "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n", + "Collecting portalocker==2.0.0\n", + " Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n", + "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) (3.12.4)\n", + "Collecting sphinx-rtd-theme\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n", + "\u001b[K |████████████████████████████████| 9.2MB 41.6MB/s \n", + "\u001b[?25hCollecting sphinx-argparse\n", + " Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n", + "Collecting morfessor\n", + " Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n", + "Requirement already satisfied: sphinx in /usr/local/lib/python3.7/dist-packages (from sphinx-rtd-theme->indic-nlp-library) (1.8.5)\n", + 
"Collecting docutils<0.17\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n", + "\u001b[K |████████████████████████████████| 552kB 33.3MB/s \n", + "\u001b[?25hRequirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.6.1)\n", + "Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.23.0)\n", + "Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.9.1)\n", + "Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.11.3)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (20.9)\n", + "Requirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.4)\n", + "Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.0)\n", + "Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.1.0)\n", + "Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (0.7.12)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2020.12.5)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from 
requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (3.0.4)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.10)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.24.3)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.0.1)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.4.7)\n", + "Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.1.4)\n", + "Building wheels for collected packages: sphinx-argparse\n", + " Building wheel for sphinx-argparse (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=16adb2732e7fea31509536176157766068ca67667ad9ad00a5ee3b15bdec2d18\n", + " Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n", + "Successfully built sphinx-argparse\n", + "\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n", + "Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, docutils, sphinx-rtd-theme, sphinx-argparse, morfessor, indic-nlp-library\n", + " Found existing installation: docutils 0.17.1\n", + " Uninstalling docutils-0.17.1:\n", + " Successfully uninstalled docutils-0.17.1\n", + "Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.2\n", + "Cloning into 'fairseq'...\n", + "remote: Enumerating objects: 28243, done.\u001b[K\n", + "remote: Counting objects: 100% (62/62), done.\u001b[K\n", + "remote: Compressing objects: 100% (39/39), done.\u001b[K\n", + "remote: Total 28243 (delta 29), reused 44 (delta 22), pack-reused 28181\u001b[K\n", + "Receiving objects: 100% (28243/28243), 11.81 MiB | 24.38 MiB/s, done.\n", + "Resolving deltas: 100% (21225/21225), done.\n", + "/content/finetuning/fairseq\n", + "Obtaining file:///content/finetuning/fairseq\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Preparing wheel metadata ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting omegaconf<2.1\n", + " Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n", + "Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.19.5)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (4.41.1)\n", + "Collecting hydra-core<1.1\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n", + "\u001b[K |████████████████████████████████| 133kB 32.0MB/s \n", + "\u001b[?25hRequirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (2019.12.20)\n", + "Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.14.5)\n", + "Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.5.1)\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.8.1+cu101)\n", + "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (0.29.23)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+2fd9d8a) (3.7.4.3)\n", + "Collecting PyYAML>=5.1.*\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n", + "\u001b[K |████████████████████████████████| 645kB 31.7MB/s \n", + "\u001b[?25hRequirement already satisfied: importlib-resources; python_version < \"3.9\" in 
/usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+2fd9d8a) (5.1.3)\n", + "Collecting antlr4-python3-runtime==4.8\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n", + "\u001b[K |████████████████████████████████| 112kB 53.4MB/s \n", + "\u001b[?25hRequirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+2fd9d8a) (2.20)\n", + "Requirement already satisfied: portalocker==2.0.0 in /usr/local/lib/python3.7/dist-packages (from sacrebleu>=1.4.12->fairseq==1.0.0a0+2fd9d8a) (2.0.0)\n", + "Requirement already satisfied: zipp>=0.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+2fd9d8a) (3.4.1)\n", + "Building wheels for collected packages: antlr4-python3-runtime\n", + " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=5e816253108c1c7a8687228b17c910230fee3243ba77f5567a8b08f7c1a5a101\n", + " Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n", + "Successfully built antlr4-python3-runtime\n", + "Installing collected packages: PyYAML, omegaconf, antlr4-python3-runtime, hydra-core, fairseq\n", + " Found existing installation: PyYAML 3.13\n", + " Uninstalling PyYAML-3.13:\n", + " Successfully uninstalled PyYAML-3.13\n", + " Running setup.py develop for fairseq\n", + "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n", + "/content/finetuning\n" + ] + } + ], + "source": [ + "! 
sudo apt install tree\n", + "\n", + "# Install the necessary libraries\n", + "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n", + "# Install fairseq from source\n", + "!git clone https://github.com/pytorch/fairseq.git\n", + "%cd fairseq\n", + "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n", + "!pip install --editable ./\n", + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oD2EHQdqEH70", + "outputId": "0b988dde-9da3-487c-a393-510fbcae92f3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2021-06-09 18:47:20-- https://storage.googleapis.com/samanantar-public/V0.2/models/indic-en.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 172.253.62.128, 172.253.115.128, 172.253.122.128, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|172.253.62.128|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 4551079075 (4.2G) [application/zip]\n", + "Saving to: ‘indic-en.zip’\n", + "\n", + "indic-en.zip 100%[===================>] 4.24G 61.3MB/s in 56s \n", + "\n", + "2021-06-09 18:48:16 (77.9 MB/s) - ‘indic-en.zip’ saved [4551079075/4551079075]\n", + "\n", + "Archive: indic-en.zip\n", + " creating: indic-en/\n", + " creating: indic-en/vocab/\n", + " inflating: indic-en/vocab/bpe_codes.32k.SRC \n", + " inflating: indic-en/vocab/vocab.SRC \n", + " inflating: indic-en/vocab/vocab.TGT \n", + " inflating: indic-en/vocab/bpe_codes.32k.TGT \n", + " creating: indic-en/final_bin/\n", + " inflating: indic-en/final_bin/dict.TGT.txt \n", + " inflating: indic-en/final_bin/dict.SRC.txt \n", + " creating: indic-en/model/\n", + " inflating: indic-en/model/checkpoint_best.pt \n" + ] + } + ], + "source": [ + "# download the indictrans model\n", + "\n", + "\n", + "# downloading the indic-en model\n", + "# this will contain:\n", + "# indic-en/\n", + "# ├── final_bin # contains fairseq dictionaries (we will use this to binarize the new finetuning data)\n", + "# │ ├── dict.SRC.txt\n", + "# │ └── dict.TGT.txt\n", + "# ├── model # contains model checkpoint(s)\n", + "# │ └── checkpoint_best.pt\n", + "# └── vocab # contains bpes for src and tgt (since we train separate vocabularies) generated with subword_nmt (we will use these bpes to convert finetuning data to subwords)\n", + "# ├── bpe_codes.32k.SRC\n", + "# ├── bpe_codes.32k.TGT\n", + "# ├── vocab.SRC\n", + "# └── vocab.TGT\n", + "\n", + "\n", + "\n", + "!wget https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip\n", + "!unzip indic-en.zip\n", + "\n", + "# if you want to finetune en-indic models, use the link below\n", + "\n", + "# !wget https://storage.googleapis.com/samanantar-public/V0.3/models/en-indic.zip\n", + "# !unzip en-indic.zip\n", + "\n", + "# if you want to finetune indic-indic models, use the link below\n", + "\n", + "# !wget 
https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip\n", + "# !unzip m2m.zip\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lj7XNBuwE0OV", + "outputId": "98b3a156-c205-4f1b-de79-f1d640555349" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2021-06-09 18:50:23-- http://lotus.kuee.kyoto-u.ac.jp/WAT/indic-multilingual/indic_wat_2021.tar.gz\n", + "Resolving lotus.kuee.kyoto-u.ac.jp (lotus.kuee.kyoto-u.ac.jp)... 130.54.208.131\n", + "Connecting to lotus.kuee.kyoto-u.ac.jp (lotus.kuee.kyoto-u.ac.jp)|130.54.208.131|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 777928004 (742M) [application/x-gzip]\n", + "Saving to: ‘indic_wat_2021.tar.gz’\n", + "\n", + "indic_wat_2021.tar. 100%[===================>] 741.89M 13.6MB/s in 57s \n", + "\n", + "2021-06-09 18:51:20 (13.1 MB/s) - ‘indic_wat_2021.tar.gz’ saved [777928004/777928004]\n", + "\n", + "finalrepo/\n", + "finalrepo/README\n", + "finalrepo/dev/\n", + "finalrepo/dev/dev.mr\n", + "finalrepo/dev/dev.kn\n", + "finalrepo/dev/dev.gu\n", + "finalrepo/dev/dev.ta\n", + "finalrepo/dev/dev.bn\n", + "finalrepo/dev/dev.pa\n", + "finalrepo/dev/dev.ml\n", + "finalrepo/dev/dev.or\n", + "finalrepo/dev/dev.en\n", + "finalrepo/dev/dev.hi\n", + "finalrepo/dev/dev.te\n", + "finalrepo/train/\n", + "finalrepo/train/zeroshotcorpstats\n", + "finalrepo/train/opensubtitles/\n", + "finalrepo/train/opensubtitles/en-ta/\n", + "finalrepo/train/opensubtitles/en-ta/train.ta\n", + "finalrepo/train/opensubtitles/en-ta/train.en\n", + "finalrepo/train/opensubtitles/en-te/\n", + "finalrepo/train/opensubtitles/en-te/train.te\n", + "finalrepo/train/opensubtitles/en-te/train.en\n", + "finalrepo/train/opensubtitles/en-ml/\n", + "finalrepo/train/opensubtitles/en-ml/train.ml\n", + "finalrepo/train/opensubtitles/en-ml/train.en\n", + 
"finalrepo/train/opensubtitles/en-bn/\n", + "finalrepo/train/opensubtitles/en-bn/train.bn\n", + "finalrepo/train/opensubtitles/en-bn/train.en\n", + "finalrepo/train/opensubtitles/en-hi/\n", + "finalrepo/train/opensubtitles/en-hi/train.hi\n", + "finalrepo/train/opensubtitles/en-hi/train.en\n", + "finalrepo/train/cvit-pib/\n", + "finalrepo/train/cvit-pib/en-ta/\n", + "finalrepo/train/cvit-pib/en-ta/train.ta\n", + "finalrepo/train/cvit-pib/en-ta/train.en\n", + "finalrepo/train/cvit-pib/en-te/\n", + "finalrepo/train/cvit-pib/en-te/train.te\n", + "finalrepo/train/cvit-pib/en-te/train.en\n", + "finalrepo/train/cvit-pib/en-or/\n", + "finalrepo/train/cvit-pib/en-or/train.or\n", + "finalrepo/train/cvit-pib/en-or/train.en\n", + "finalrepo/train/cvit-pib/en-ml/\n", + "finalrepo/train/cvit-pib/en-ml/train.ml\n", + "finalrepo/train/cvit-pib/en-ml/train.en\n", + "finalrepo/train/cvit-pib/en-bn/\n", + "finalrepo/train/cvit-pib/en-bn/train.bn\n", + "finalrepo/train/cvit-pib/en-bn/train.en\n", + "finalrepo/train/cvit-pib/en-gu/\n", + "finalrepo/train/cvit-pib/en-gu/train.en\n", + "finalrepo/train/cvit-pib/en-gu/train.gu\n", + "finalrepo/train/cvit-pib/en-mr/\n", + "finalrepo/train/cvit-pib/en-mr/train.mr\n", + "finalrepo/train/cvit-pib/en-mr/train.en\n", + "finalrepo/train/cvit-pib/en-pa/\n", + "finalrepo/train/cvit-pib/en-pa/train.pa\n", + "finalrepo/train/cvit-pib/en-pa/train.en\n", + "finalrepo/train/cvit-pib/en-hi/\n", + "finalrepo/train/cvit-pib/en-hi/train.hi\n", + "finalrepo/train/cvit-pib/en-hi/train.en\n", + "finalrepo/train/bibleuedin/\n", + "finalrepo/train/bibleuedin/en-te/\n", + "finalrepo/train/bibleuedin/en-te/train.te\n", + "finalrepo/train/bibleuedin/en-te/train.en\n", + "finalrepo/train/bibleuedin/en-ml/\n", + "finalrepo/train/bibleuedin/en-ml/train.ml\n", + "finalrepo/train/bibleuedin/en-ml/train.en\n", + "finalrepo/train/bibleuedin/en-gu/\n", + "finalrepo/train/bibleuedin/en-gu/train.en\n", + "finalrepo/train/bibleuedin/en-gu/train.gu\n", + 
"finalrepo/train/bibleuedin/en-mr/\n", + "finalrepo/train/bibleuedin/en-mr/train.mr\n", + "finalrepo/train/bibleuedin/en-mr/train.en\n", + "finalrepo/train/bibleuedin/en-hi/\n", + "finalrepo/train/bibleuedin/en-hi/train.hi\n", + "finalrepo/train/bibleuedin/en-hi/train.en\n", + "finalrepo/train/bibleuedin/en-kn/\n", + "finalrepo/train/bibleuedin/en-kn/train.kn\n", + "finalrepo/train/bibleuedin/en-kn/train.en\n", + "finalrepo/train/iitb/\n", + "finalrepo/train/iitb/en-hi/\n", + "finalrepo/train/iitb/en-hi/train.hi\n", + "finalrepo/train/iitb/en-hi/train.en\n", + "finalrepo/train/wikimatrix/\n", + "finalrepo/train/wikimatrix/en-ta/\n", + "finalrepo/train/wikimatrix/en-ta/train.ta\n", + "finalrepo/train/wikimatrix/en-ta/train.en\n", + "finalrepo/train/wikimatrix/en-te/\n", + "finalrepo/train/wikimatrix/en-te/train.te\n", + "finalrepo/train/wikimatrix/en-te/train.en\n", + "finalrepo/train/wikimatrix/en-ml/\n", + "finalrepo/train/wikimatrix/en-ml/train.ml\n", + "finalrepo/train/wikimatrix/en-ml/train.en\n", + "finalrepo/train/wikimatrix/en-bn/\n", + "finalrepo/train/wikimatrix/en-bn/train.bn\n", + "finalrepo/train/wikimatrix/en-bn/train.en\n", + "finalrepo/train/wikimatrix/en-mr/\n", + "finalrepo/train/wikimatrix/en-mr/train.mr\n", + "finalrepo/train/wikimatrix/en-mr/train.en\n", + "finalrepo/train/wikimatrix/en-hi/\n", + "finalrepo/train/wikimatrix/en-hi/train.hi\n", + "finalrepo/train/wikimatrix/en-hi/train.en\n", + "finalrepo/train/alt/\n", + "finalrepo/train/alt/en-bn/\n", + "finalrepo/train/alt/en-bn/train.bn\n", + "finalrepo/train/alt/en-bn/train.en\n", + "finalrepo/train/alt/en-hi/\n", + "finalrepo/train/alt/en-hi/train.hi\n", + "finalrepo/train/alt/en-hi/train.en\n", + "finalrepo/train/pmi/\n", + "finalrepo/train/pmi/en-ta/\n", + "finalrepo/train/pmi/en-ta/train.ta\n", + "finalrepo/train/pmi/en-ta/train.en\n", + "finalrepo/train/pmi/en-te/\n", + "finalrepo/train/pmi/en-te/train.te\n", + "finalrepo/train/pmi/en-te/train.en\n", + "finalrepo/train/pmi/en-or/\n", + 
"finalrepo/train/pmi/en-or/train.or\n", + "finalrepo/train/pmi/en-or/train.en\n", + "finalrepo/train/pmi/en-ml/\n", + "finalrepo/train/pmi/en-ml/train.ml\n", + "finalrepo/train/pmi/en-ml/train.en\n", + "finalrepo/train/pmi/en-bn/\n", + "finalrepo/train/pmi/en-bn/train.bn\n", + "finalrepo/train/pmi/en-bn/train.en\n", + "finalrepo/train/pmi/en-gu/\n", + "finalrepo/train/pmi/en-gu/train.en\n", + "finalrepo/train/pmi/en-gu/train.gu\n", + "finalrepo/train/pmi/en-mr/\n", + "finalrepo/train/pmi/en-mr/train.mr\n", + "finalrepo/train/pmi/en-mr/train.en\n", + "finalrepo/train/pmi/en-pa/\n", + "finalrepo/train/pmi/en-pa/train.pa\n", + "finalrepo/train/pmi/en-pa/train.en\n", + "finalrepo/train/pmi/en-hi/\n", + "finalrepo/train/pmi/en-hi/train.hi\n", + "finalrepo/train/pmi/en-hi/train.en\n", + "finalrepo/train/pmi/en-kn/\n", + "finalrepo/train/pmi/en-kn/train.kn\n", + "finalrepo/train/pmi/en-kn/train.en\n", + "finalrepo/train/wikititles/\n", + "finalrepo/train/wikititles/en-ta/\n", + "finalrepo/train/wikititles/en-ta/train.ta\n", + "finalrepo/train/wikititles/en-ta/train.en\n", + "finalrepo/train/wikititles/en-gu/\n", + "finalrepo/train/wikititles/en-gu/train.en\n", + "finalrepo/train/wikititles/en-gu/train.gu\n", + "finalrepo/train/mtenglish2odia/\n", + "finalrepo/train/mtenglish2odia/en-or/\n", + "finalrepo/train/mtenglish2odia/en-or/train.or\n", + "finalrepo/train/mtenglish2odia/en-or/train.en\n", + "finalrepo/train/urst/\n", + "finalrepo/train/urst/en-gu/\n", + "finalrepo/train/urst/en-gu/train.en\n", + "finalrepo/train/urst/en-gu/train.gu\n", + "finalrepo/train/jw/\n", + "finalrepo/train/jw/en-ta/\n", + "finalrepo/train/jw/en-ta/train.ta\n", + "finalrepo/train/jw/en-ta/train.en\n", + "finalrepo/train/jw/en-te/\n", + "finalrepo/train/jw/en-te/train.te\n", + "finalrepo/train/jw/en-te/train.en\n", + "finalrepo/train/jw/en-ml/\n", + "finalrepo/train/jw/en-ml/train.ml\n", + "finalrepo/train/jw/en-ml/train.en\n", + "finalrepo/train/jw/en-bn/\n", + 
"finalrepo/train/jw/en-bn/train.bn\n", + "finalrepo/train/jw/en-bn/train.en\n", + "finalrepo/train/jw/en-gu/\n", + "finalrepo/train/jw/en-gu/train.en\n", + "finalrepo/train/jw/en-gu/train.gu\n", + "finalrepo/train/jw/en-mr/\n", + "finalrepo/train/jw/en-mr/train.mr\n", + "finalrepo/train/jw/en-mr/train.en\n", + "finalrepo/train/jw/en-pa/\n", + "finalrepo/train/jw/en-pa/train.pa\n", + "finalrepo/train/jw/en-pa/train.en\n", + "finalrepo/train/jw/en-hi/\n", + "finalrepo/train/jw/en-hi/train.hi\n", + "finalrepo/train/jw/en-hi/train.en\n", + "finalrepo/train/jw/en-kn/\n", + "finalrepo/train/jw/en-kn/train.kn\n", + "finalrepo/train/jw/en-kn/train.en\n", + "finalrepo/train/nlpc/\n", + "finalrepo/train/nlpc/en-ta/\n", + "finalrepo/train/nlpc/en-ta/train.ta\n", + "finalrepo/train/nlpc/en-ta/train.en\n", + "finalrepo/train/get_zero_shot_pairs.py\n", + "finalrepo/train/ufal/\n", + "finalrepo/train/ufal/en-ta/\n", + "finalrepo/train/ufal/en-ta/train.ta\n", + "finalrepo/train/ufal/en-ta/train.en\n", + "finalrepo/train/odiencorp/\n", + "finalrepo/train/odiencorp/en-or/\n", + "finalrepo/train/odiencorp/en-or/train.or\n", + "finalrepo/train/odiencorp/en-or/train.en\n", + "finalrepo/train/tanzil/\n", + "finalrepo/train/tanzil/en-ta/\n", + "finalrepo/train/tanzil/en-ta/train.ta\n", + "finalrepo/train/tanzil/en-ta/train.en\n", + "finalrepo/train/tanzil/en-ml/\n", + "finalrepo/train/tanzil/en-ml/train.ml\n", + "finalrepo/train/tanzil/en-ml/train.en\n", + "finalrepo/train/tanzil/en-bn/\n", + "finalrepo/train/tanzil/en-bn/train.bn\n", + "finalrepo/train/tanzil/en-bn/train.en\n", + "finalrepo/train/tanzil/en-hi/\n", + "finalrepo/train/tanzil/en-hi/train.hi\n", + "finalrepo/train/tanzil/en-hi/train.en\n", + "finalrepo/train/ted2020/\n", + "finalrepo/train/ted2020/en-ta/\n", + "finalrepo/train/ted2020/en-ta/train.ta\n", + "finalrepo/train/ted2020/en-ta/train.en\n", + "finalrepo/train/ted2020/en-te/\n", + "finalrepo/train/ted2020/en-te/train.te\n", + 
"finalrepo/train/ted2020/en-te/train.en\n", + "finalrepo/train/ted2020/en-ml/\n", + "finalrepo/train/ted2020/en-ml/train.ml\n", + "finalrepo/train/ted2020/en-ml/train.en\n", + "finalrepo/train/ted2020/en-bn/\n", + "finalrepo/train/ted2020/en-bn/train.bn\n", + "finalrepo/train/ted2020/en-bn/train.en\n", + "finalrepo/train/ted2020/en-gu/\n", + "finalrepo/train/ted2020/en-gu/train.en\n", + "finalrepo/train/ted2020/en-gu/train.gu\n", + "finalrepo/train/ted2020/en-mr/\n", + "finalrepo/train/ted2020/en-mr/train.mr\n", + "finalrepo/train/ted2020/en-mr/train.en\n", + "finalrepo/train/ted2020/en-pa/\n", + "finalrepo/train/ted2020/en-pa/train.pa\n", + "finalrepo/train/ted2020/en-pa/train.en\n", + "finalrepo/train/ted2020/en-hi/\n", + "finalrepo/train/ted2020/en-hi/train.hi\n", + "finalrepo/train/ted2020/en-hi/train.en\n", + "finalrepo/train/ted2020/en-kn/\n", + "finalrepo/train/ted2020/en-kn/train.kn\n", + "finalrepo/train/ted2020/en-kn/train.en\n", + "finalrepo/test/\n", + "finalrepo/test/test.gu\n", + "finalrepo/test/test.fm.prob\n", + "finalrepo/test/test.kn\n", + "finalrepo/test/test.ta\n", + "finalrepo/test/cached_lm_test.en\n", + "finalrepo/test/test.pa\n", + "finalrepo/test/test.bn\n", + "finalrepo/test/test.hi\n", + "finalrepo/test/test.ml\n", + "finalrepo/test/test.or\n", + "finalrepo/test/test.mr\n", + "finalrepo/test/test.en\n", + "finalrepo/test/test.te\n" + ] + } + ], + "source": [ + "# In this example, we will finetuning on cvit-pib corpus which is part of the WAT2021 training dataset.\n", + "\n", + "# Lets first download the full wat2021 training data (cvit-pib is a part of this big training set)\n", + "# ***Note***: See the next section to mine for mining indic to indic data from english centric WAT data. 
This dataset can be used to finetune indic2indic model\n", + "!wget http://lotus.kuee.kyoto-u.ac.jp/WAT/indic-multilingual/indic_wat_2021.tar.gz\n", + "!tar -xzvf indic_wat_2021.tar.gz\n", + "# all train sets will now be in wat2021/train\n", + "!mv finalrepo wat2021" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BSoZDR3fHpUk", + "outputId": "11bd057b-d1b0-45b8-feac-85b3e900104e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: cannot create directory ‘wat2021-indic2indic’: File exists\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r 0%| | 0/2 [00:00 remove_train_devtest_overlaps(train_dir, devtest_dir, many2many=True)\n", + "\n", + "extract_non_english_pairs('wat2021/train/cvit-pib', 'wat2021-indic2indic', ['bn', 'hi', 'gu'])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ys_QURP3Sx7G", + "outputId": "d41f5baa-e700-4e07-93cd-b23b08122dc5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/content/finetuning/indicTrans\n" + ] + } + ], + "source": [ + "# wat2021\n", + "# ├── dev # contains Wat2021 dev data\n", + "# │ ├── dev.bn\n", + "# │ ├── dev.en\n", + "# │ ├── dev.gu\n", + "# │ ├── dev.hi\n", + "# │ ├── dev.kn\n", + "# │ ├── dev.ml\n", + "# │ ├── dev.mr\n", + "# │ ├── dev.or\n", + "# │ ├── dev.pa\n", + "# │ ├── dev.ta\n", + "# │ └── dev.te\n", + "# ├── README\n", + "# ├── test # contains Wat2021 test data\n", + "# │ ├── test.bn\n", + "# │ ├── test.en\n", + "# │ ├── test.gu\n", + "# │ ├── test.hi\n", + "# │ ├── test.kn\n", + "# │ ├── test.ml\n", + "# │ ├── test.mr\n", + "# │ ├── test.or\n", + "# │ ├── test.pa\n", + "# │ ├── test.ta\n", + "# │ └── test.te\n", + "# └── train # contains WAT2021 train data which has lot of corpuses (alt, bible, Jw300, etc)\n", + "# 
├── alt/\n", + "# ├── bibleuedin/\n", + "# ├── iitb/\n", + "# ├── jw/\n", + "# ├── mtenglish2odia/\n", + "# ├── nlpc/\n", + "# ├── odiencorp/\n", + "# ├── opensubtitles/\n", + "# ├── pmi/\n", + "# ├── tanzil/\n", + "# ├── ted2020/\n", + "# ├── ufal/\n", + "# ├── urst/\n", + "# ├── wikimatrix/\n", + "# ├── wikititles/\n", + "# └── cvit-pib \n", + "# ├── en-bn # within a train corpus folder the files are arranged in {src_lang}-{tgt_lang}/train.{src_lang}, train.{tgt_lang}\n", + "# │ ├── train.bn\n", + "# │ └── train.en\n", + "# ├── en-gu\n", + "# │ ├── train.en\n", + "# │ └── train.gu\n", + "# ├── en-hi\n", + "# │ ├── train.en\n", + "# │ └── train.hi\n", + "# ├── en-ml\n", + "# │ ├── train.en\n", + "# │ └── train.ml\n", + "# ├── en-mr\n", + "# │ ├── train.en\n", + "# │ └── train.mr\n", + "# ├── en-or\n", + "# │ ├── train.en\n", + "# │ └── train.or\n", + "# ├── en-pa\n", + "# │ ├── train.en\n", + "# │ └── train.pa\n", + "# ├── en-ta\n", + "# │ ├── train.en\n", + "# │ └── train.ta\n", + "# └── en-te\n", + "# ├── train.en\n", + "# └── train.te\n", + "\n", + "\n", + "\n", + "# instead of using all the data for this example, we will mainly use the cvit-pib corpus from wat2021 train set\n", + "# for dev and test set, we will use the dev and test provided by wat2021\n", + "\n", + "# In case, you want to finetune on all these corpuses, you would need to merge all the training data into one folder and remove duplicate train sentence pairs.\n", + "# To do this, refer to this gist: https://gist.github.com/gowtham1997/2524f8e9559cff586d1f935e621fc598\n", + "\n", + "\n", + "# copy everything to a dataset folder\n", + "!mkdir -p dataset/train\n", + "! cp -r wat2021/train/cvit-pib/* dataset/train\n", + "! cp -r wat2021/dev dataset\n", + "! 
cp -r wat2021/test dataset\n", + "\n", + "\n", + "# lets cd to indicTrans\n", + "%cd indicTrans" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8yPTbM_clKfI", + "outputId": "d4459da6-3e0b-45c8-f291-d6761e536284" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "../dataset\n" + ] + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 7, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "%%shell\n", + "\n", + "exp_dir=../dataset\n", + "src_lang=en\n", + "tgt_lang=indic\n", + "\n", + "# change this to indic-en, if you have downloaded the indic-en dir or m2m if you have downloaded the indic2indic model\n", + "download_dir=../en-indic\n", + "\n", + "train_data_dir=$exp_dir/train\n", + "dev_data_dir=$exp_dir/dev\n", + "test_data_dir=$exp_dir/test\n", + "echo $exp_dir\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NhwUXyYVXrOY", + "outputId": "9ddb06dd-3fcc-4d4c-a4ec-131a9f4ea220" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running experiment ../dataset on en to indic\n", + "Applying normalization and script conversion for train bn\n", + "100% 91985/91985 [00:25<00:00, 3582.55it/s]\n", + "100% 91985/91985 [00:14<00:00, 6330.85it/s]\n", + "Number of sentences in train bn: 91985\n", + "Applying normalization and script conversion for dev bn\n", + "100% 1000/1000 [00:00<00:00, 1593.70it/s]\n", + "100% 1000/1000 [00:00<00:00, 7232.26it/s]\n", + "Number of sentences in dev bn: 1000\n", + "Applying normalization and script conversion for test bn\n", + "100% 2390/2390 [00:00<00:00, 2874.03it/s]\n", + "100% 2390/2390 [00:00<00:00, 6727.65it/s]\n", + "Number of sentences in test bn: 2390\n", + "Applying normalization and script conversion for train 
hi\n", + "100% 266545/266545 [01:15<00:00, 3546.17it/s]\n", + "100% 266545/266545 [00:45<00:00, 5913.09it/s]\n", + "Number of sentences in train hi: 266545\n", + "Applying normalization and script conversion for dev hi\n", + "100% 1000/1000 [00:00<00:00, 1666.49it/s]\n", + "100% 1000/1000 [00:00<00:00, 5857.08it/s]\n", + "Number of sentences in dev hi: 1000\n", + "Applying normalization and script conversion for test hi\n", + "100% 2390/2390 [00:00<00:00, 2928.00it/s]\n", + "100% 2390/2390 [00:00<00:00, 6789.39it/s]\n", + "Number of sentences in test hi: 2390\n", + "Applying normalization and script conversion for train gu\n", + "100% 58264/58264 [00:15<00:00, 3688.72it/s]\n", + "100% 58264/58264 [00:09<00:00, 6391.97it/s]\n", + "Number of sentences in train gu: 58264\n", + "Applying normalization and script conversion for dev gu\n", + "100% 1000/1000 [00:00<00:00, 1670.01it/s]\n", + "100% 1000/1000 [00:00<00:00, 6530.46it/s]\n", + "Number of sentences in dev gu: 1000\n", + "Applying normalization and script conversion for test gu\n", + "100% 2390/2390 [00:00<00:00, 2884.69it/s]\n", + "100% 2390/2390 [00:00<00:00, 6099.24it/s]\n", + "Number of sentences in test gu: 2390\n", + "Applying normalization and script conversion for train ml\n", + "100% 43087/43087 [00:12<00:00, 3589.89it/s]\n", + "100% 43087/43087 [00:07<00:00, 5968.67it/s]\n", + "Number of sentences in train ml: 43087\n", + "Applying normalization and script conversion for dev ml\n", + "100% 1000/1000 [00:00<00:00, 1691.23it/s]\n", + "100% 1000/1000 [00:00<00:00, 6090.55it/s]\n", + "Number of sentences in dev ml: 1000\n", + "Applying normalization and script conversion for test ml\n", + "100% 2390/2390 [00:00<00:00, 2961.81it/s]\n", + "100% 2390/2390 [00:00<00:00, 6878.08it/s]\n", + "Number of sentences in test ml: 2390\n", + "Applying normalization and script conversion for train mr\n", + "100% 114220/114220 [00:30<00:00, 3773.79it/s]\n", + "100% 114220/114220 [00:17<00:00, 6513.13it/s]\n", + "Number of 
sentences in train mr: 114220\n", + "Applying normalization and script conversion for dev mr\n", + "100% 1000/1000 [00:00<00:00, 1671.69it/s]\n", + "100% 1000/1000 [00:00<00:00, 5737.54it/s]\n", + "Number of sentences in dev mr: 1000\n", + "Applying normalization and script conversion for test mr\n", + "100% 2390/2390 [00:00<00:00, 2959.82it/s]\n", + "100% 2390/2390 [00:00<00:00, 6393.52it/s]\n", + "Number of sentences in test mr: 2390\n", + "Applying normalization and script conversion for train or\n", + "100% 94494/94494 [00:24<00:00, 3912.66it/s]\n", + "100% 94494/94494 [00:13<00:00, 6919.45it/s]\n", + "Number of sentences in train or: 94494\n", + "Applying normalization and script conversion for dev or\n", + "100% 1000/1000 [00:00<00:00, 1680.80it/s]\n", + "100% 1000/1000 [00:00<00:00, 5797.35it/s]\n", + "Number of sentences in dev or: 1000\n", + "Applying normalization and script conversion for test or\n", + "100% 2390/2390 [00:00<00:00, 2978.67it/s]\n", + "100% 2390/2390 [00:00<00:00, 6787.01it/s]\n", + "Number of sentences in test or: 2390\n", + "Applying normalization and script conversion for train pa\n", + "100% 101092/101092 [00:26<00:00, 3826.32it/s]\n", + "100% 101092/101092 [00:15<00:00, 6425.22it/s]\n", + "Number of sentences in train pa: 101092\n", + "Applying normalization and script conversion for dev pa\n", + "100% 1000/1000 [00:00<00:00, 1667.88it/s]\n", + "100% 1000/1000 [00:00<00:00, 6182.50it/s]\n", + "Number of sentences in dev pa: 1000\n", + "Applying normalization and script conversion for test pa\n", + "100% 2390/2390 [00:00<00:00, 2993.56it/s]\n", + "100% 2390/2390 [00:00<00:00, 8002.74it/s]\n", + "Number of sentences in test pa: 2390\n", + "Applying normalization and script conversion for train ta\n", + "100% 115968/115968 [00:30<00:00, 3838.68it/s]\n", + "100% 115968/115968 [00:19<00:00, 5805.14it/s]\n", + "Number of sentences in train ta: 115968\n", + "Applying normalization and script conversion for dev ta\n", + "100% 1000/1000 
[00:00<00:00, 1659.50it/s]\n", + "100% 1000/1000 [00:00<00:00, 6223.34it/s]\n", + "Number of sentences in dev ta: 1000\n", + "Applying normalization and script conversion for test ta\n", + "100% 2390/2390 [00:00<00:00, 3046.92it/s]\n", + "100% 2390/2390 [00:00<00:00, 6047.32it/s]\n", + "Number of sentences in test ta: 2390\n", + "Applying normalization and script conversion for train te\n", + "100% 44720/44720 [00:12<00:00, 3524.75it/s]\n", + "100% 44720/44720 [00:07<00:00, 6016.25it/s]\n", + "Number of sentences in train te: 44720\n", + "Applying normalization and script conversion for dev te\n", + "100% 1000/1000 [00:00<00:00, 1673.03it/s]\n", + "100% 1000/1000 [00:00<00:00, 6102.16it/s]\n", + "Number of sentences in dev te: 1000\n", + "Applying normalization and script conversion for test te\n", + "100% 2390/2390 [00:00<00:00, 2960.42it/s]\n", + "100% 2390/2390 [00:00<00:00, 7440.37it/s]\n", + "Number of sentences in test te: 2390\n", + "\n", + "../dataset/data/train.SRC\n", + "../dataset/data/train.TGT\n", + " 0% 0/11 [00:00\n", + "2021-05-09 14:03:48 | INFO | fairseq_cli.preprocess | [SRC] Dictionary: 32104 types\n", + "2021-05-09 14:03:49 | INFO | fairseq_cli.preprocess | [SRC] ../dataset/final/dev.SRC: 9000 sents, 200619 tokens, 0.117% replaced by \n", + "2021-05-09 14:03:49 | INFO | fairseq_cli.preprocess | [SRC] Dictionary: 32104 types\n", + "2021-05-09 14:03:51 | INFO | fairseq_cli.preprocess | [SRC] ../dataset/final/test.SRC: 21510 sents, 471564 tokens, 0.155% replaced by \n", + "2021-05-09 14:03:51 | INFO | fairseq_cli.preprocess | [TGT] Dictionary: 35848 types\n", + "2021-05-09 14:07:06 | INFO | fairseq_cli.preprocess | [TGT] ../dataset/final/train.TGT: 930375 sents, 35902065 tokens, 0.318% replaced by \n", + "2021-05-09 14:07:06 | INFO | fairseq_cli.preprocess | [TGT] Dictionary: 35848 types\n", + "2021-05-09 14:07:07 | INFO | fairseq_cli.preprocess | [TGT] ../dataset/final/dev.TGT: 9000 sents, 224623 tokens, 0.631% replaced by \n", + "2021-05-09 
14:07:07 | INFO | fairseq_cli.preprocess | [TGT] Dictionary: 35848 types\n", + "2021-05-09 14:07:11 | INFO | fairseq_cli.preprocess | [TGT] ../dataset/final/test.TGT: 21510 sents, 526380 tokens, 0.57% replaced by \n", + "2021-05-09 14:07:11 | INFO | fairseq_cli.preprocess | Wrote preprocessed data to ../dataset/final_bin\n" + ] + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 9, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "# all the data preparation happens in this cell\n", + "%%shell\n", + "\n", + "exp_dir=../dataset\n", + "src_lang=en\n", + "tgt_lang=indic\n", + "\n", + "# change this to indic-en, if you have downloaded the indic-en dir or m2m if you have downloaded the indic2indic model\n", + "download_dir=../en-indic\n", + "\n", + "train_data_dir=$exp_dir/train\n", + "dev_data_dir=$exp_dir/dev\n", + "test_data_dir=$exp_dir/test\n", + "\n", + "\n", + "echo \"Running experiment ${exp_dir} on ${src_lang} to ${tgt_lang}\"\n", + "\n", + "\n", + "train_processed_dir=$exp_dir/data\n", + "devtest_processed_dir=$exp_dir/data\n", + "\n", + "out_data_dir=$exp_dir/final_bin\n", + "\n", + "mkdir -p $train_processed_dir\n", + "mkdir -p $devtest_processed_dir\n", + "mkdir -p $out_data_dir\n", + "\n", + "# indic languages.\n", + "# cvit-pib corpus does not have as (assamese) and kn (kannada), hence its not part of this list\n", + "langs=(bn hi gu ml mr or pa ta te)\n", + "\n", + "for lang in ${langs[@]};do\n", + "\tif [ $src_lang == en ]; then\n", + "\t\ttgt_lang=$lang\n", + "\telse\n", + "\t\tsrc_lang=$lang\n", + "\tfi\n", + "\n", + "\ttrain_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang\n", + "\tdevtest_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang\n", + "\tmkdir -p $train_norm_dir\n", + "\tmkdir -p $devtest_norm_dir\n", + "\n", + "\n", + " # preprocessing pretokenizes the input (we use moses tokenizer for en and indicnlp lib for indic languages)\n", + " # after pretokenization, we use indicnlp to transliterate all 
the indic data to Devanagari script\n", + "\n", + "\t# train preprocessing\n", + "\ttrain_infname_src=$train_data_dir/en-${lang}/train.$src_lang\n", + "\ttrain_infname_tgt=$train_data_dir/en-${lang}/train.$tgt_lang\n", + "\ttrain_outfname_src=$train_norm_dir/train.$src_lang\n", + "\ttrain_outfname_tgt=$train_norm_dir/train.$tgt_lang\n", + "\techo \"Applying normalization and script conversion for train $lang\"\n", + "\tinput_size=`python scripts/preprocess_translate.py $train_infname_src $train_outfname_src $src_lang true`\n", + "\tinput_size=`python scripts/preprocess_translate.py $train_infname_tgt $train_outfname_tgt $tgt_lang true`\n", + "\techo \"Number of sentences in train $lang: $input_size\"\n", + "\n", + "\t# dev preprocessing\n", + "\tdev_infname_src=$dev_data_dir/dev.$src_lang\n", + "\tdev_infname_tgt=$dev_data_dir/dev.$tgt_lang\n", + "\tdev_outfname_src=$devtest_norm_dir/dev.$src_lang\n", + "\tdev_outfname_tgt=$devtest_norm_dir/dev.$tgt_lang\n", + "\techo \"Applying normalization and script conversion for dev $lang\"\n", + "\tinput_size=`python scripts/preprocess_translate.py $dev_infname_src $dev_outfname_src $src_lang true`\n", + "\tinput_size=`python scripts/preprocess_translate.py $dev_infname_tgt $dev_outfname_tgt $tgt_lang true`\n", + "\techo \"Number of sentences in dev $lang: $input_size\"\n", + "\n", + "\t# test preprocessing\n", + "\ttest_infname_src=$test_data_dir/test.$src_lang\n", + "\ttest_infname_tgt=$test_data_dir/test.$tgt_lang\n", + "\ttest_outfname_src=$devtest_norm_dir/test.$src_lang\n", + "\ttest_outfname_tgt=$devtest_norm_dir/test.$tgt_lang\n", + "\techo \"Applying normalization and script conversion for test $lang\"\n", + "\tinput_size=`python scripts/preprocess_translate.py $test_infname_src $test_outfname_src $src_lang true`\n", + "\tinput_size=`python scripts/preprocess_translate.py $test_infname_tgt $test_outfname_tgt $tgt_lang true`\n", + "\techo \"Number of sentences in test $lang: $input_size\"\n", + "done\n", + 
"\n", + "\n", + "\n", + "# Now that we have preprocessed all the data, we can now merge these different text files into one\n", + "# ie. for en-as, we have train.en and corresponding train.as, similarly for en-bn, we have train.en and corresponding train.bn\n", + "# now we will concatenate all this into en-X where train.SRC will have all the en (src) training data and train.TGT will have all the concatenated indic lang data\n", + "\n", + "python scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data $src_lang $tgt_lang 'train'\n", + "python scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data $src_lang $tgt_lang 'dev'\n", + "python scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data $src_lang $tgt_lang 'test'\n", + "\n", + "# use the vocab from downloaded dir\n", + "cp -r $download_dir/vocab $exp_dir\n", + "\n", + "\n", + "echo \"Applying bpe to the new finetuning data\"\n", + "bash apply_single_bpe_traindevtest_notag.sh $exp_dir\n", + "\n", + "mkdir -p $exp_dir/final\n", + "\n", + "# We also add special tags to indicate the source and target language in the inputs\n", + "# Eg: to translate a sentence from english to hindi , the input would be __src__en__ __tgt__hi__ \n", + "\n", + "echo \"Adding language tags\"\n", + "python scripts/add_joint_tags_translate.py $exp_dir 'train'\n", + "python scripts/add_joint_tags_translate.py $exp_dir 'dev'\n", + "python scripts/add_joint_tags_translate.py $exp_dir 'test'\n", + "\n", + "\n", + "\n", + "data_dir=$exp_dir/final\n", + "out_data_dir=$exp_dir/final_bin\n", + "\n", + "rm -rf $out_data_dir\n", + "\n", + "# binarizing the new data (train, dev and test) using dictionary from the download dir\n", + "\n", + " num_workers=`python -c \"import multiprocessing; print(multiprocessing.cpu_count())\"`\n", + "\n", + "data_dir=$exp_dir/final\n", + "out_data_dir=$exp_dir/final_bin\n", + "\n", + "# rm -rf $out_data_dir\n", + "\n", + "echo \"Binarizing data. 
This will take some time depending on the size of finetuning data\"\n", + "fairseq-preprocess --source-lang SRC --target-lang TGT \\\n", + " --trainpref $data_dir/train --validpref $data_dir/dev --testpref $data_dir/test \\\n", + " --destdir $out_data_dir --workers $num_workers \\\n", + " --srcdict $download_dir/final_bin/dict.SRC.txt --tgtdict $download_dir/final_bin/dict.TGT.txt --thresholdtgt 5 --thresholdsrc 5 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iz6tzbe2tcs7", + "outputId": "6705e2d6-b5cb-4810-c833-6a1370d3fce4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021-05-09 14:29:11 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'tensorboard_logdir': '../dataset/tensorboard-wandb', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'user_dir': 'model_configs', 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 1, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': None, 'distributed_port': -1, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 
25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_algorithm': 'LocalSGD', 'localsgd_frequency': 3, 'nprocs_per_node': 1, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'distributed_num_procs': 1}, 'dataset': {'_name': None, 'num_workers': 1, 'skip_invalid_size_inputs_valid_test': True, 'max_tokens': 256, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'validate_interval': 1, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 256, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 1000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [2], 'lr': [3e-05], 'stop_min_lr': -1.0, 'use_bmuf': False}, 'checkpoint': {'_name': None, 'save_dir': '../dataset/model', 'restore_file': '../en-indic/model/checkpoint_best.pt', 'finetune_from_model': None, 'reset_dataloader': True, 'reset_lr_scheduler': True, 'reset_meters': True, 'reset_optimizer': True, 'optimizer_overrides': '{}', 'save_interval': 1, 'save_interval_updates': 0, 'keep_interval_updates': -1, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': 5, 'keep_best_checkpoints': -1, 'no_save': False, 
'no_epoch_checkpoints': False, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': 5, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 1}, 'generation': {'_name': None, 'beam': 5, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(_name='transformer_4x', activation_dropout=0.0, activation_fn='relu', adam_betas='(0.9, 0.98)', adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, all_gather_list_size=16384, arch='transformer_4x', attention_dropout=0.0, azureml_logging=False, batch_size=None, 
batch_size_valid=None, best_checkpoint_metric='loss', bf16=False, bpe=None, broadcast_buffers=False, bucket_cap_mb=25, checkpoint_activations=False, checkpoint_shard_count=1, checkpoint_suffix='', clip_norm=1.0, cpu=False, cpu_offload=False, criterion='label_smoothed_cross_entropy', cross_self_attention=False, curriculum=0, data='../dataset/final_bin', data_buffer_size=10, dataset_impl=None, ddp_backend='pytorch_ddp', ddp_comm_hook='none', decoder_attention_heads=16, decoder_embed_dim=1536, decoder_embed_path=None, decoder_ffn_embed_dim=4096, decoder_input_dim=1536, decoder_layerdrop=0, decoder_layers=6, decoder_layers_to_keep=None, decoder_learned_pos=False, decoder_normalize_before=False, decoder_output_dim=1536, device_id=0, disable_validation=False, distributed_backend='nccl', distributed_init_method=None, distributed_no_spawn=False, distributed_port=-1, distributed_rank=0, distributed_world_size=1, dropout=0.2, empty_cache_freq=0, encoder_attention_heads=16, encoder_embed_dim=1536, encoder_embed_path=None, encoder_ffn_embed_dim=4096, encoder_layerdrop=0, encoder_layers=6, encoder_layers_to_keep=None, encoder_learned_pos=False, encoder_normalize_before=False, eos=2, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_bleu_print_samples=False, eval_bleu_remove_bpe=None, eval_tokenized_bleu=False, fast_stat_sync=False, find_unused_parameters=False, finetune_from_model=None, fix_batches_to_gpus=False, fixed_validation_seed=None, fp16=True, fp16_init_scale=128, fp16_no_flatten_grads=False, fp16_scale_tolerance=0.0, fp16_scale_window=None, fp32_reduce_scatter=False, gen_subset='test', heartbeat_timeout=-1, ignore_prefix_size=0, keep_best_checkpoints=-1, keep_interval_updates=-1, keep_interval_updates_pattern=-1, keep_last_epochs=5, label_smoothing=0.1, layernorm_embedding=False, left_pad_source=True, left_pad_target=False, load_alignments=False, load_checkpoint_on_all_dp_ranks=False, localsgd_frequency=3, log_file=None, 
log_format=None, log_interval=100, lr=[3e-05], lr_scheduler='inverse_sqrt', max_epoch=0, max_source_positions=210, max_target_positions=210, max_tokens=256, max_tokens_valid=256, max_update=1000, max_valid_steps=None, maximize_best_checkpoint_metric=False, memory_efficient_bf16=False, memory_efficient_fp16=False, min_loss_scale=0.0001, min_params_to_wrap=100000000, model_parallel_size=1, no_cross_attention=False, no_epoch_checkpoints=False, no_last_checkpoints=False, no_progress_bar=False, no_reshard_after_forward=False, no_save=False, no_save_optimizer_state=False, no_scale_embedding=False, no_seed_provided=False, no_token_positional_embeddings=False, nprocs_per_node=1, num_batch_buckets=0, num_shards=1, num_workers=1, offload_activations=False, optimizer='adam', optimizer_overrides='{}', pad=1, patience=5, pipeline_balance=None, pipeline_checkpoint='never', pipeline_chunks=0, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_devices=None, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_model_parallel=False, plasma_path='/tmp/plasma', profile=False, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, quantization_config_path=None, report_accuracy=False, required_batch_size_multiple=8, required_seq_len_multiple=1, reset_dataloader=True, reset_logging=False, reset_lr_scheduler=True, reset_meters=True, reset_optimizer=True, restore_file='../en-indic/model/checkpoint_best.pt', save_dir='../dataset/model', save_interval=1, save_interval_updates=0, scoring='bleu', seed=1, sentence_avg=False, shard_id=0, share_all_embeddings=False, share_decoder_input_output_embed=False, skip_invalid_size_inputs_valid_test=True, slowmo_algorithm='LocalSGD', slowmo_momentum=None, source_lang='SRC', stop_min_lr=-1.0, stop_time_hours=0, suppress_crashes=False, target_lang='TGT', task='translation', tensorboard_logdir='../dataset/tensorboard-wandb', threshold_loss_scale=None, tie_adaptive_weights=False, tokenizer=None, tpu=False, 
train_subset='train', truncate_source=False, unk=3, update_freq=[2], upsample_primary=-1, use_bmuf=False, use_old_adam=False, use_plasma_view=False, user_dir='model_configs', valid_subset='valid', validate_after_updates=0, validate_interval=1, validate_interval_updates=0, wandb_project=None, warmup_init_lr=1e-07, warmup_updates=4000, weight_decay=0.0, write_checkpoints_asynchronously=False, zero_sharding='none'), 'task': {'_name': 'translation', 'data': '../dataset/final_bin', 'source_lang': 'SRC', 'target_lang': 'TGT', 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 210, 'max_target_positions': 210, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'tpu': False, 'lr': [3e-05]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': 1e-07, 'lr': [3e-05]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None}\n", + "2021-05-09 14:29:11 | INFO | fairseq.tasks.translation | [SRC] dictionary: 32104 types\n", + "2021-05-09 14:29:11 | INFO | fairseq.tasks.translation | [TGT] dictionary: 35848 types\n", + "2021-05-09 14:29:19 | INFO | fairseq_cli.train | TransformerModel(\n", + " (encoder): TransformerEncoder(\n", + " (dropout_module): FairseqDropout()\n", + " (embed_tokens): Embedding(32104, 1536, padding_idx=1)\n", + " (embed_positions): SinusoidalPositionalEmbedding()\n", + " (layers): 
ModuleList(\n", + " (0): TransformerEncoderLayer(\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (dropout_module): FairseqDropout()\n", + " (activation_dropout_module): FairseqDropout()\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (1): TransformerEncoderLayer(\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (dropout_module): FairseqDropout()\n", + " (activation_dropout_module): FairseqDropout()\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (2): TransformerEncoderLayer(\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): 
Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (dropout_module): FairseqDropout()\n", + " (activation_dropout_module): FairseqDropout()\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (3): TransformerEncoderLayer(\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (dropout_module): FairseqDropout()\n", + " (activation_dropout_module): FairseqDropout()\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (4): TransformerEncoderLayer(\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (dropout_module): FairseqDropout()\n", + " (activation_dropout_module): 
FairseqDropout()\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (5): TransformerEncoderLayer(\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (dropout_module): FairseqDropout()\n", + " (activation_dropout_module): FairseqDropout()\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " )\n", + " )\n", + " (decoder): TransformerDecoder(\n", + " (dropout_module): FairseqDropout()\n", + " (embed_tokens): Embedding(35848, 1536, padding_idx=1)\n", + " (embed_positions): SinusoidalPositionalEmbedding()\n", + " (layers): ModuleList(\n", + " (0): TransformerDecoderLayer(\n", + " (dropout_module): FairseqDropout()\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (activation_dropout_module): FairseqDropout()\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (encoder_attn): 
MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (encoder_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (1): TransformerDecoderLayer(\n", + " (dropout_module): FairseqDropout()\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (activation_dropout_module): FairseqDropout()\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (encoder_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (encoder_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, 
elementwise_affine=True)\n", + " )\n", + " (2): TransformerDecoderLayer(\n", + " (dropout_module): FairseqDropout()\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (activation_dropout_module): FairseqDropout()\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (encoder_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (encoder_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (3): TransformerDecoderLayer(\n", + " (dropout_module): FairseqDropout()\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (activation_dropout_module): FairseqDropout()\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (encoder_attn): 
MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (encoder_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (4): TransformerDecoderLayer(\n", + " (dropout_module): FairseqDropout()\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (activation_dropout_module): FairseqDropout()\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (encoder_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (encoder_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, 
elementwise_affine=True)\n", + " )\n", + " (5): TransformerDecoderLayer(\n", + " (dropout_module): FairseqDropout()\n", + " (self_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (activation_dropout_module): FairseqDropout()\n", + " (self_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (encoder_attn): MultiheadAttention(\n", + " (dropout_module): FairseqDropout()\n", + " (k_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (v_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (q_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " (out_proj): Linear(in_features=1536, out_features=1536, bias=True)\n", + " )\n", + " (encoder_attn_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " (fc1): Linear(in_features=1536, out_features=4096, bias=True)\n", + " (fc2): Linear(in_features=4096, out_features=1536, bias=True)\n", + " (final_layer_norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " )\n", + " (output_projection): Linear(in_features=1536, out_features=35848, bias=False)\n", + " )\n", + ")\n", + "2021-05-09 14:29:19 | INFO | fairseq_cli.train | task: TranslationTask\n", + "2021-05-09 14:29:19 | INFO | fairseq_cli.train | model: TransformerModel\n", + "2021-05-09 14:29:19 | INFO | fairseq_cli.train | criterion: LabelSmoothedCrossEntropyCriterion\n", + "2021-05-09 14:29:19 | INFO | fairseq_cli.train | num. shared model params: 480,571,392 (num. trained: 480,571,392)\n", + "2021-05-09 14:29:19 | INFO | fairseq_cli.train | num. expert model params: 0 (num. 
trained: 0)\n", + "2021-05-09 14:29:19 | INFO | fairseq.data.data_utils | loaded 9,000 examples from: ../dataset/final_bin/valid.SRC-TGT.SRC\n", + "2021-05-09 14:29:19 | INFO | fairseq.data.data_utils | loaded 9,000 examples from: ../dataset/final_bin/valid.SRC-TGT.TGT\n", + "2021-05-09 14:29:19 | INFO | fairseq.tasks.translation | ../dataset/final_bin valid SRC-TGT 9000 examples\n", + "2021-05-09 14:29:21 | INFO | fairseq.utils | ***********************CUDA enviroments for all 1 workers***********************\n", + "2021-05-09 14:29:21 | INFO | fairseq.utils | rank 0: capabilities = 3.7 ; total memory = 11.173 GB ; name = Tesla K80 \n", + "2021-05-09 14:29:21 | INFO | fairseq.utils | ***********************CUDA enviroments for all 1 workers***********************\n", + "2021-05-09 14:29:21 | INFO | fairseq_cli.train | training on 1 devices (GPUs/TPUs)\n", + "2021-05-09 14:29:21 | INFO | fairseq_cli.train | max tokens per device = 256 and max sentences per device = None\n", + "2021-05-09 14:29:21 | INFO | fairseq.trainer | Preparing to load checkpoint ../en-indic/model/checkpoint_best.pt\n", + "tcmalloc: large alloc 1922285568 bytes == 0x55e01c93a000 @ 0x7f8579074b6b 0x7f8579094379 0x7f851797e25e 0x7f851797f9d2 0x7f85559a8e7d 0x7f85665a3120 0x7f85661e1bd9 0x55df57c868a8 0x55df57cf9fd5 0x55df57cf47ad 0x55df57c873ea 0x55df57cf53b5 0x55df57cf47ad 0x55df57c87003 0x55df57c86b09 0x55df57dce28d 0x55df57d3d1db 0x55df57c85bb1 0x55df57d76fed 0x55df57cf9988 0x55df57cf47ad 0x55df57bc6e2c 0x55df57cf6bb5 0x55df57cf44ae 0x55df57c873ea 0x55df57cf632a 0x55df57cf44ae 0x55df57c873ea 0x55df57cf632a 0x55df57cf44ae 0x55df57c873ea\n", + "tcmalloc: large alloc 1922285568 bytes == 0x55e08f276000 @ 0x7f8579074b6b 0x7f8579094379 0x7f851797e25e 0x7f851797f9d2 0x7f85559a8e7d 0x7f85665a3120 0x7f85661e1bd9 0x55df57c868a8 0x55df57cf9fd5 0x55df57cf47ad 0x55df57c873ea 0x55df57cf53b5 0x55df57cf47ad 0x55df57c87003 0x55df57c86b09 0x55df57dce28d 0x55df57d3d1db 0x55df57c85bb1 0x55df57d76fed 
0x55df57cf9988 0x55df57cf47ad 0x55df57bc6e2c 0x55df57cf6bb5 0x55df57cf44ae 0x55df57c873ea 0x55df57cf632a 0x55df57cf44ae 0x55df57c873ea 0x55df57cf632a 0x55df57cf44ae 0x55df57c873ea\n", + "2021-05-09 14:32:01 | INFO | fairseq.trainer | NOTE: your device does NOT support faster training with --fp16, please switch to FP32 which is likely to be faster\n", + "2021-05-09 14:32:01 | INFO | fairseq.trainer | Loaded checkpoint ../en-indic/model/checkpoint_best.pt (epoch 20 @ 0 updates)\n", + "2021-05-09 14:32:01 | INFO | fairseq.trainer | loading train data for epoch 1\n", + "2021-05-09 14:32:01 | INFO | fairseq.data.data_utils | loaded 930,375 examples from: ../dataset/final_bin/train.SRC-TGT.SRC\n", + "2021-05-09 14:32:01 | INFO | fairseq.data.data_utils | loaded 930,375 examples from: ../dataset/final_bin/train.SRC-TGT.TGT\n", + "2021-05-09 14:32:01 | INFO | fairseq.tasks.translation | ../dataset/final_bin train SRC-TGT 930375 examples\n", + "2021-05-09 14:32:01 | WARNING | fairseq.tasks.fairseq_task | 1,647 samples have invalid sizes and will be skipped, max_positions=(210, 210), first few sample ids=[865604, 927195, 465934, 204968, 865293, 859052, 1713, 672173, 858328, 286278]\n", + "epoch 001: 0% 0/86283 [00:00\n", + " sys.exit(load_entry_point('fairseq', 'console_scripts', 'fairseq-train')())\n", + " File \"/content/finetuning/fairseq/fairseq_cli/train.py\", line 496, in cli_main\n", + " distributed_utils.call_main(cfg, main)\n", + " File \"/content/finetuning/fairseq/fairseq/distributed/utils.py\", line 369, in call_main\n", + " main(cfg, **kwargs)\n", + " File \"/content/finetuning/fairseq/fairseq_cli/train.py\", line 173, in main\n", + " valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)\n", + " File \"/usr/lib/python3.7/contextlib.py\", line 74, in inner\n", + " return func(*args, **kwds)\n", + " File \"/content/finetuning/fairseq/fairseq_cli/train.py\", line 284, in train\n", + " log_output = trainer.train_step(samples)\n", + " File 
\"/usr/lib/python3.7/contextlib.py\", line 74, in inner\n", + " return func(*args, **kwds)\n", + " File \"/content/finetuning/fairseq/fairseq/trainer.py\", line 810, in train_step\n", + " raise e\n", + " File \"/content/finetuning/fairseq/fairseq/trainer.py\", line 782, in train_step\n", + " self.optimizer, model=self.model, update_num=self.get_num_updates()\n", + " File \"/content/finetuning/fairseq/fairseq/tasks/fairseq_task.py\", line 489, in optimizer_step\n", + " optimizer.step()\n", + " File \"/content/finetuning/fairseq/fairseq/optim/fp16_optimizer.py\", line 213, in step\n", + " self.fp32_optimizer.step(closure, groups=groups)\n", + " File \"/content/finetuning/fairseq/fairseq/optim/fairseq_optimizer.py\", line 127, in step\n", + " self.optimizer.step(closure)\n", + " File \"/usr/local/lib/python3.7/dist-packages/torch/optim/optimizer.py\", line 89, in wrapper\n", + " return func(*args, **kwargs)\n", + " File \"/content/finetuning/fairseq/fairseq/optim/adam.py\", line 210, in step\n", + " denom = exp_avg_sq.sqrt().add_(group[\"eps\"])\n", + "RuntimeError: CUDA out of memory. Tried to allocate 1.79 GiB (GPU 0; 11.17 GiB total capacity; 8.96 GiB already allocated; 1.66 GiB free; 9.08 GiB reserved in total by PyTorch)\n" + ] + } + ], + "source": [ + "# Finetuning the model\n", + "\n", + "# pls refer to fairseq documentaion to know more about each of these options (https://fairseq.readthedocs.io/en/latest/command_line_tools.html)\n", + "\n", + "\n", + "# some notable args:\n", + "# --max-update=1000 -> for this example, to demonstrate how to finetune we are only training for 1000 steps. 
You should increase this when finetuning\n", + "# --arch=transformer_4x -> we use a custom transformer model and name it transformer_4x (4 times the parameter size of transformer base)\n", + "# --user-dir -> we define the custom transformer arch in the model_configs folder and pass it as an argument to --user-dir for fairseq to register this architecture\n", + "# --lr -> learning rate. From our limited experiments, we find that lower learning rates like 3e-5 work best for finetuning.\n", + "# --restore-file -> reload the pretrained checkpoint and start training from here (change this path for indic-en. Currently it is set to en-indic)\n", + "# --reset-* -> reset and do not reuse the lr scheduler, dataloader, optimizer etc. of the older checkpoint\n", + "# --max-tokens -> this is max tokens per batch\n", + "\n", + "\n", + "!( fairseq-train ../dataset/final_bin \\\n", + "--max-source-positions=210 \\\n", + "--max-target-positions=210 \\\n", + "--max-update=1000 \\\n", + "--save-interval=1 \\\n", + "--arch=transformer_4x \\\n", + "--criterion=label_smoothed_cross_entropy \\\n", + "--source-lang=SRC \\\n", + "--lr-scheduler=inverse_sqrt \\\n", + "--target-lang=TGT \\\n", + "--label-smoothing=0.1 \\\n", + "--optimizer adam \\\n", + "--adam-betas \"(0.9, 0.98)\" \\\n", + "--clip-norm 1.0 \\\n", + "--warmup-init-lr 1e-07 \\\n", + "--warmup-updates 4000 \\\n", + "--dropout 0.2 \\\n", + "--tensorboard-logdir ../dataset/tensorboard-wandb \\\n", + "--save-dir ../dataset/model \\\n", + "--keep-last-epochs 5 \\\n", + "--patience 5 \\\n", + "--skip-invalid-size-inputs-valid-test \\\n", + "--fp16 \\\n", + "--user-dir model_configs \\\n", + "--update-freq=2 \\\n", + "--distributed-world-size 1 \\\n", + "--max-tokens 256 \\\n", + "--lr 3e-5 \\\n", + "--restore-file ../en-indic/model/checkpoint_best.pt \\\n", + "--reset-lr-scheduler \\\n", + "--reset-meters \\\n", + "--reset-dataloader \\\n", + "--reset-optimizer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { +
"id": "tpPsT1e7vuO9" + }, + "outputs": [], + "source": [ + "# To test the models after training, you can use joint_translate.sh\n", + "\n", + "\n", + "\n", + "# joint_translate takes src_file, output_fname, src_lang, tgt_lang, model_dir as inputs\n", + "# src_file -> input text file to be translated\n", + "# output_fname -> name of the output file (will get created) containing the model predictions\n", + "# src_lang -> source lang code of the input text ( in this case we are using en-indic model and hence src_lang would be 'en')\n", + "# tgt_lang -> target lang code of the output text ( tgt lang for en-indic model would be any of the 11 indic langs we trained on:\n", + "# as, bn, hi, gu, kn, ml, mr, or, pa, ta, te)\n", + "# supported languages are:\n", + "# as - assamese, bn - bengali, gu - gujarati, hi - hindi, kn - kannada, \n", + "# ml - malayalam, mr - marathi, or - oriya, pa - punjabi, ta - tamil, te - telugu\n", + "\n", + "# model_dir -> the directory containing the model and the vocab files\n", + "\n", + "# Note: if the translation is taking a lot of time, please tune the buffer_size and batch_size parameters for fairseq-interactive defined inside this joint_translate script\n", + "\n", + "\n", + "# here we are translating the english sentences to hindi\n", + "!bash joint_translate.sh $exp_dir/test/test.en en_hi_outputs.txt 'en' 'hi' $exp_dir" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bPqneByPxilN" + }, + "outputs": [], + "source": [ + "# to compute bleu scores for the predictions with a reference file, use the following command\n", + "# arguments:\n", + "# pred_fname: file that contains model predictions\n", + "# ref_fname: file that contains references\n", + "# src_lang and tgt_lang : the source and target language\n", + "\n", + "bash compute_bleu.sh en_hi_outputs.txt $exp_dir/test/test.hi 'en' 'hi'\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], +
"include_colab_link": true, + "name": "indicTrans_Finetuning.ipynb", + "provenance": [] + }, + "interpreter": { + "hash": "3c7d4130300118f0c7487d576c6841c0dbbdeec039e1e658ac9b107412a09af0" + }, + "kernelspec": { + "display_name": "Python 3.7.7 64-bit", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/indicTrans_python_interface.ipynb b/indicTrans_python_interface.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7215f7e953c650ca83d506638763a8309e4a581c --- /dev/null +++ b/indicTrans_python_interface.ipynb @@ -0,0 +1,462 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CjfzxXZLHed_", + "outputId": "69a66b95-41b2-4413-82d1-0caacbddb3f3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloning into 'indicTrans-1'...\n", + "remote: Enumerating objects: 486, done.\u001b[K\n", + "remote: Counting objects: 100% (189/189), done.\u001b[K\n", + "remote: Compressing objects: 100% (67/67), done.\u001b[K\n", + "remote: Total 486 (delta 154), reused 134 (delta 121), pack-reused 297\u001b[K\n", + "Receiving objects: 100% (486/486), 1.48 MiB | 17.61 MiB/s, done.\n", + "Resolving deltas: 100% (281/281), done.\n", + "/content/indicTrans\n", + "Cloning into 'indic_nlp_library'...\n", + "remote: Enumerating objects: 1325, done.\u001b[K\n", + "remote: Counting objects: 100% (147/147), done.\u001b[K\n", + "remote: Compressing objects: 100% (103/103), done.\u001b[K\n", + "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n", + "Receiving objects: 100% (1325/1325), 9.57 MiB | 13.55 MiB/s, done.\n", + "Resolving deltas: 100% (688/688), done.\n", + "Cloning into 
'indic_nlp_resources'...\n", + "remote: Enumerating objects: 133, done.\u001b[K\n", + "remote: Counting objects: 100% (7/7), done.\u001b[K\n", + "remote: Compressing objects: 100% (7/7), done.\u001b[K\n", + "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n", + "Receiving objects: 100% (133/133), 149.77 MiB | 33.48 MiB/s, done.\n", + "Resolving deltas: 100% (51/51), done.\n", + "Checking out files: 100% (28/28), done.\n", + "Cloning into 'subword-nmt'...\n", + "remote: Enumerating objects: 580, done.\u001b[K\n", + "remote: Counting objects: 100% (4/4), done.\u001b[K\n", + "remote: Compressing objects: 100% (4/4), done.\u001b[K\n", + "remote: Total 580 (delta 0), reused 1 (delta 0), pack-reused 576\u001b[K\n", + "Receiving objects: 100% (580/580), 237.41 KiB | 18.26 MiB/s, done.\n", + "Resolving deltas: 100% (349/349), done.\n", + "/content\n" + ] + } + ], + "source": [ + "# clone the repo for running evaluation\n", + "!git clone https://github.com/AI4Bharat/indicTrans.git\n", + "%cd indicTrans\n", + "# clone requirements repositories\n", + "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n", + "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n", + "!git clone https://github.com/rsennrich/subword-nmt.git\n", + "%cd .." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IeYW2BJhlJvx", + "outputId": "3357bc85-44d8-43b0-8c64-eef9f18be716" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting sacremoses\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n", + "\r\u001b[K |▍ | 10kB 14.0MB/s eta 0:00:01\r\u001b[K |▊ | 20kB 18.8MB/s eta 0:00:01\r\u001b[K |█ | 30kB 22.5MB/s eta 0:00:01\r\u001b[K |█▌ | 40kB 25.7MB/s eta 0:00:01\r\u001b[K |█▉ | 51kB 27.6MB/s eta 0:00:01\r\u001b[K |██▏ | 61kB 29.2MB/s eta 0:00:01\r\u001b[K |██▋ | 71kB 27.3MB/s eta 0:00:01\r\u001b[K |███ | 81kB 27.7MB/s eta 0:00:01\r\u001b[K |███▎ | 92kB 28.8MB/s eta 0:00:01\r\u001b[K |███▋ | 102kB 29.9MB/s eta 0:00:01\r\u001b[K |████ | 112kB 29.9MB/s eta 0:00:01\r\u001b[K |████▍ | 122kB 29.9MB/s eta 0:00:01\r\u001b[K |████▊ | 133kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▏ | 143kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▌ | 153kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▉ | 163kB 29.9MB/s eta 0:00:01\r\u001b[K |██████▎ | 174kB 29.9MB/s eta 0:00:01\r\u001b[K |██████▋ | 184kB 29.9MB/s eta 0:00:01\r\u001b[K |███████ | 194kB 29.9MB/s eta 0:00:01\r\u001b[K |███████▎ | 204kB 29.9MB/s eta 0:00:01\r\u001b[K |███████▊ | 215kB 29.9MB/s eta 0:00:01\r\u001b[K |████████ | 225kB 29.9MB/s eta 0:00:01\r\u001b[K |████████▍ | 235kB 29.9MB/s eta 0:00:01\r\u001b[K |████████▉ | 245kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▏ | 256kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▌ | 266kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▉ | 276kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████▎ | 286kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████▋ | 296kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████ | 307kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████▍ | 317kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████▊ | 327kB 
29.9MB/s eta 0:00:01\r\u001b[K |████████████ | 337kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████▌ | 348kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████▉ | 358kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████▏ | 368kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████▌ | 378kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████ | 389kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████▎ | 399kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 409kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████ | 419kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████▍ | 430kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████▊ | 440kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████ | 450kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████▌ | 460kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████▉ | 471kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████▏ | 481kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████▋ | 491kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████ | 501kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████▎ | 512kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████▊ | 522kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████ | 532kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████▍ | 542kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████▊ | 552kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▏ | 563kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▌ | 573kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▉ | 583kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████▎ | 593kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████▋ | 604kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 614kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████▎ | 624kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████▊ | 634kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████ | 645kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▍ | 655kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▉ | 665kB 29.9MB/s eta 0:00:01\r\u001b[K 
|████████████████████████▏ | 675kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████▌ | 686kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 696kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▎ | 706kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 716kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████ | 727kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████▍ | 737kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████▊ | 747kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████ | 757kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████▌ | 768kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████▉ | 778kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████▏ | 788kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████▌ | 798kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████ | 808kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 819kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▋ | 829kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████ | 839kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▍ | 849kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▊ | 860kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▏| 870kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▌| 880kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▉| 890kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 901kB 29.9MB/s \n", + "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", + "Collecting mock\n", + " Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n", + "Collecting sacrebleu\n", + "\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n", + "\u001b[K |████████████████████████████████| 61kB 7.5MB/s \n", + "\u001b[?25hCollecting tensorboardX\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/42/36/2b147652c40c3a858efa0afbf7b8236fae968e88ff530511a4cfa299a506/tensorboardX-2.3-py2.py3-none-any.whl (124kB)\n", + "\u001b[K |████████████████████████████████| 133kB 47.5MB/s \n", + "\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n", + "Collecting indic-nlp-library\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n", + "\u001b[K |████████████████████████████████| 40kB 5.2MB/s \n", + "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n", + "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n", + "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n", + "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n", + "Collecting portalocker==2.0.0\n", + " Downloading 
https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n", + "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) (3.12.4)\n", + "Collecting sphinx-rtd-theme\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n", + "\u001b[K |████████████████████████████████| 9.2MB 42.0MB/s \n", + "\u001b[?25hCollecting morfessor\n", + " Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n", + "Collecting sphinx-argparse\n", + " Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n", + "Requirement already satisfied: sphinx in /usr/local/lib/python3.7/dist-packages (from sphinx-rtd-theme->indic-nlp-library) (1.8.5)\n", + "Collecting docutils<0.17\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n", + "\u001b[K |████████████████████████████████| 552kB 31.5MB/s \n", + "\u001b[?25hRequirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.4)\n", + "Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.1.0)\n", + "Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.11.3)\n", + 
"Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (20.9)\n", + "Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (0.7.12)\n", + "Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.0)\n", + "Requirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.6.1)\n", + "Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.23.0)\n", + "Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.9.1)\n", + "Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.1.5)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.0.1)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.4.7)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.10)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.24.3)\n", + 
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2021.5.30)\n", + "Building wheels for collected packages: sphinx-argparse\n", + " Building wheel for sphinx-argparse (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=d8cbdca000085e2e2c122c305bb21aa76a9600012ded8e06c300e03d1c4d1e32\n", + " Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n", + "Successfully built sphinx-argparse\n", + "\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n", + "Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, docutils, sphinx-rtd-theme, morfessor, sphinx-argparse, indic-nlp-library\n", + " Found existing installation: docutils 0.17.1\n", + " Uninstalling docutils-0.17.1:\n", + " Successfully uninstalled docutils-0.17.1\n", + "Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.3\n", + "Collecting mosestokenizer\n", + " Downloading https://files.pythonhosted.org/packages/4b/b3/c0af235b16c4f44a2828ef017f7947d1262b2646e440f85c6a2ff26a8c6f/mosestokenizer-1.1.0.tar.gz\n", + "Collecting subword-nmt\n", + " Downloading https://files.pythonhosted.org/packages/74/60/6600a7bc09e7ab38bc53a48a20d8cae49b837f93f5842a41fe513a694912/subword_nmt-0.3.7-py2.py3-none-any.whl\n", + "Requirement already satisfied: docopt in /usr/local/lib/python3.7/dist-packages (from mosestokenizer) (0.6.2)\n", + "Collecting openfile\n", + " Downloading https://files.pythonhosted.org/packages/93/e6/805db6867faacb488b44ba8e0829ef4de151dd0499f3c5da5f4ad11698a7/openfile-0.0.7-py3-none-any.whl\n", + 
"Collecting uctools\n", + " Downloading https://files.pythonhosted.org/packages/04/cb/70ed842d9a43460eedaa11f7503b4ab6537b43b63f0d854d59d8e150fac1/uctools-1.3.0.tar.gz\n", + "Collecting toolwrapper\n", + " Downloading https://files.pythonhosted.org/packages/41/7b/34bf8fb69426d8a18bcc61081e9d126f4fcd41c3c832072bef39af1602cd/toolwrapper-2.1.0.tar.gz\n", + "Building wheels for collected packages: mosestokenizer, uctools, toolwrapper\n", + " Building wheel for mosestokenizer (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for mosestokenizer: filename=mosestokenizer-1.1.0-cp37-none-any.whl size=49120 sha256=4fc04046040e73bd5d13c606ebbfc65ac38c7d073f7fc0b0e4cc1d4215b595f3\n", + " Stored in directory: /root/.cache/pip/wheels/a2/e7/48/48d5e0f9c0cd5def2dfd7cb8543945f906448ed1313de24a29\n", + " Building wheel for uctools (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for uctools: filename=uctools-1.3.0-cp37-none-any.whl size=6163 sha256=c5a865107c59f98c4da5d18ddc754fa141ab494574187281de1502561c6a004e\n", + " Stored in directory: /root/.cache/pip/wheels/06/b6/8f/935d5bf5bca85d47c6f5ec31641879bba057d336ab36b1e773\n", + " Building wheel for toolwrapper (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for toolwrapper: filename=toolwrapper-2.1.0-cp37-none-any.whl size=3356 sha256=41a3e12078d5681e8467701735208d880ba588b0f5dbfb3b99c4e04bc643eccc\n", + " Stored in directory: /root/.cache/pip/wheels/84/ea/29/e02f3b855bf19344972092873a1091b329309bbc3d3d0cbaef\n", + "Successfully built mosestokenizer uctools toolwrapper\n", + "Installing collected packages: openfile, uctools, toolwrapper, mosestokenizer, subword-nmt\n", + "Successfully installed mosestokenizer-1.1.0 openfile-0.0.7 subword-nmt-0.3.7 toolwrapper-2.1.0 uctools-1.3.0\n", + "Cloning into 'fairseq'...\n", + "remote: Enumerating objects: 28410, done.\u001b[K\n", + "remote: Counting objects: 100% (229/229), done.\u001b[K\n", + "remote: Compressing objects: 100% (127/127), done.\u001b[K\n", + "remote: Total 28410 (delta 114), reused 187 (delta 99), pack-reused 28181\u001b[K\n", + "Receiving objects: 100% (28410/28410), 11.96 MiB | 24.16 MiB/s, done.\n", + "Resolving deltas: 100% (21310/21310), done.\n", + "/content/fairseq\n", + "Obtaining file:///content/fairseq\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Preparing wheel metadata ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.14.5)\n", + "Collecting hydra-core<1.1\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n", + "\u001b[K |████████████████████████████████| 133kB 11.6MB/s \n", + "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (4.41.1)\n", + "Collecting omegaconf<2.1\n", + " Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n", + "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (2019.12.20)\n", + "Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.19.5)\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.9.0+cu102)\n", + "Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.5.1)\n", + "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (0.29.23)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+f887152) (2.20)\n", + "Collecting antlr4-python3-runtime==4.8\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n", + "\u001b[K |████████████████████████████████| 112kB 33.5MB/s \n", + "\u001b[?25hRequirement already satisfied: importlib-resources; python_version < \"3.9\" in 
/usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+f887152) (5.1.4)\n", + "Collecting PyYAML>=5.1.*\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n", + "\u001b[K |████████████████████████████████| 645kB 30.2MB/s \n", + "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+f887152) (3.7.4.3)\n", + "Requirement already satisfied: portalocker==2.0.0 in /usr/local/lib/python3.7/dist-packages (from sacrebleu>=1.4.12->fairseq==1.0.0a0+f887152) (2.0.0)\n", + "Requirement already satisfied: zipp>=3.1.0; python_version < \"3.10\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+f887152) (3.4.1)\n", + "Building wheels for collected packages: antlr4-python3-runtime\n", + " Building wheel for antlr4-python3-runtime (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=69960f774a6fdb385fed1a63fb02ae50b57299408cfd6fb33be60d686be878b7\n", + " Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n", + "Successfully built antlr4-python3-runtime\n", + "Installing collected packages: antlr4-python3-runtime, PyYAML, omegaconf, hydra-core, fairseq\n", + " Found existing installation: PyYAML 3.13\n", + " Uninstalling PyYAML-3.13:\n", + " Successfully uninstalled PyYAML-3.13\n", + " Running setup.py develop for fairseq\n", + "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n", + "/content\n" + ] + } + ], + "source": [ + "# Install the necessary libraries\n", + "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n", + "! pip install mosestokenizer subword-nmt\n", + "# Install fairseq from source\n", + "!git clone https://github.com/pytorch/fairseq.git\n", + "%cd fairseq\n", + "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n", + "!pip install --editable ./\n", + "\n", + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "TktUu9NW_PLq" + }, + "outputs": [], + "source": [ + "# this step is only required if you are running the code on colab\n", + "# restart the runtime after running prev cell (to update). 
See this -> https://stackoverflow.com/questions/57838013/modulenotfounderror-after-successful-pip-install-in-google-colaboratory\n", + "\n", + "# this import will not work without restarting runtime\n", + "from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E_4JxNdRlPQB", + "outputId": "82ab5e2f-d560-4f4e-bf3f-f1ca0a8d31b8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2021-06-27 12:43:16-- https://storage.googleapis.com/samanantar-public/V0.2/models/indic-en.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.13.240, 172.217.15.80, 142.251.33.208, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.13.240|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 4551079075 (4.2G) [application/zip]\n", + "Saving to: ‘indic-en.zip’\n", + "\n", + "indic-en.zip 100%[===================>] 4.24G 28.8MB/s in 83s \n", + "\n", + "2021-06-27 12:44:39 (52.1 MB/s) - ‘indic-en.zip’ saved [4551079075/4551079075]\n", + "\n", + "Archive: indic-en.zip\n", + " creating: indic-en/\n", + " creating: indic-en/vocab/\n", + " inflating: indic-en/vocab/bpe_codes.32k.SRC \n", + " inflating: indic-en/vocab/vocab.SRC \n", + " inflating: indic-en/vocab/vocab.TGT \n", + " inflating: indic-en/vocab/bpe_codes.32k.TGT \n", + " creating: indic-en/final_bin/\n", + " inflating: indic-en/final_bin/dict.TGT.txt \n", + " inflating: indic-en/final_bin/dict.SRC.txt \n", + " creating: indic-en/model/\n", + " inflating: indic-en/model/checkpoint_best.pt \n", + "/content/indicTrans\n" + ] + } + ], + "source": [ + "# download the indictrans model\n", + "\n", + "\n", + "# downloading the indic-en model\n", + "!wget https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip\n", + "!unzip 
indic-en.zip\n", + "\n", + "# downloading the en-indic model\n", + "# !wget https://storage.googleapis.com/samanantar-public/V0.3/models/en-indic.zip\n", + "# !unzip en-indic.zip\n", + "\n", + "# # downloading the indic-indic model\n", + "# !wget https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip\n", + "# !unzip m2m.zip\n", + "\n", + "%cd indicTrans" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yTnWbHqY01-B", + "outputId": "0d075f51-097b-46ad-aade-407a4437aa62" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initializing vocab and bpe\n", + "Initializing model for translation\n" + ] + } + ], + "source": [ + "from indicTrans.inference.engine import Model\n", + "\n", + "indic2en_model = Model(expdir='../indic-en')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QTp2NOgQ__sB", + "outputId": "e015a71e-8206-4e1d-cb3e-11ecb4d44f76" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3/3 [00:00<00:00, 1225.21it/s]\n", + "/usr/local/lib/python3.7/dist-packages/torch/_tensor.py:575: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.\n", + "To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). 
(Triggered internally at /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)\n", + " return torch.floor_divide(self, other)\n" + ] + }, + { + "data": { + "text/plain": [ + "['He seems to know us.',\n", + " 'I couldnt find it anywhere.',\n", + " 'If someone in your neighbourhood develops these symptoms, staying at home can help prevent the spread of the coronavirus infection.']" + ] + }, + "execution_count": 11, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "ta_sents = ['அவனுக்கு நம்மைப் தெரியும் என்று தோன்றுகிறது',\n", + " \"இது எங்கே இருக்கு என்று என்னால் கண்டுபிடிக்க முடியவில்லை.\",\n", + " 'உங்களுக்கு உங்கள் அருகில் இருக்கும் ஒருவருக்கோ இத்தகைய அறிகுறிகள் தென்பட்டால், வீட்டிலேயே இருப்பது, கொரோனா வைரஸ் தொற்று பிறருக்கு வராமல் தடுக்க உதவும்.']\n", + "\n", + "\n", + "indic2en_model.batch_translate(ta_sents, 'ta', 'en')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + }, + "id": "VFXrCNZGEN7Z", + "outputId": "f72aad17-1cc0-4774-a7ee-5b3a5d954de3" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 4/4 [00:00<00:00, 1496.76it/s]\n" + ] + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'The pandemic has resulted in worldwide social and economic disruption. The world is facing the worst recession since the global financial crisis. This led to the postponement or cancellation of sporting, religious, political and cultural events. 
Due to the fear, there was shortage of supply as more people purchased items like masks, sanitizers etc.'" + ] + }, + "execution_count": 13, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "ta_paragraph = \"\"\"இத்தொற்றுநோய் உலகளாவிய சமூக மற்றும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது.இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது. இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது.\n", + "அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது.\"\"\"\n", + "\n", + "indic2en_model.translate_paragraph(ta_paragraph, 'ta', 'en')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Hi_D7s_VIjis" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "authorship_tag": "ABX9TyM3t8oQYMhBUuq4/Pyhcr0+", + "collapsed_sections": [], + "include_colab_link": true, + "name": "indicTrans_python_interface.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/indic_nlp_library/LICENSE b/indic_nlp_library/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..ca34be1f41fef5f3a3fcd86197fe5932f7f3eed9 --- /dev/null +++ b/indic_nlp_library/LICENSE @@ -0,0 +1,9 @@ +MIT License + +Copyright (c) 2013-present Anoop Kunchukuttan + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the 
following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/indic_nlp_library/README.md b/indic_nlp_library/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0b7f8a82798e3ee874f8f838a635f89290d3e47e --- /dev/null +++ b/indic_nlp_library/README.md @@ -0,0 +1,142 @@ +# Indic NLP Library + +The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. Indian languages share a lot of similarity in terms of script, phonology, language syntax, etc. and this library is an attempt to provide a general solution to very commonly required toolsets for Indian language text. + +The library provides the following functionalities: + +- Text Normalization +- Script Information +- Word Tokenization and Detokenization +- Sentence Splitting +- Word Segmentation +- Syllabification +- Script Conversion +- Romanization +- Indicization +- Transliteration +- Translation + +The data resources required by the Indic NLP Library are hosted in a different repository. These resources are required for some modules. You can download from the [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) project. 
+ +**If you are interested in Indian language NLP resources, you should check the [Indic NLP Catalog](https://github.com/indicnlpweb/indicnlp_catalog) for pointers.** + +## Pre-requisites + +- Python 3.x + - (For Python 2.x version check the tag `PYTHON_2.7_FINAL_JAN_2019`. Not actively supporting Python 2.x anymore, but will try to maintain as much compatibility as possible) +- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) +- [Urduhack](https://github.com/urduhack/urduhack): Needed only if Urdu normalization is required. It has other dependencies like Tensorflow. +- Other dependencies are listed in setup.py + + +## Configuration + +- Installation from pip: + + `pip install indic-nlp-library` + +- If you want to use the project from the github repo, add the project to the Python Path: + + - Clone this repository + - Install dependencies: `pip install -r requirements.txt` + - Run: `export PYTHONPATH=$PYTHONPATH:<project base directory>` + +- In either case, export the path to the _Indic NLP Resources_ directory + + Run: `export INDIC_RESOURCES_PATH=<path to Indic NLP Resources>` + +## Usage + +You can use the Python API to access all the features of the library. Many of the most common operations are also accessible via a unified commandline API. + +### Getting Started + +Check [this IPython Notebook](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples.ipynb) for examples to use the Python API. + - You can find the Python 2.x Notebook [here](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples_2_7.ipynb) + +### Documentation + +You can find detailed documentation [HERE](https://indic-nlp-library.readthedocs.io/en/latest) + +This documents the Python API as well as the commandline reference. 
+ +## Citing + +If you use this library, please include the following citation: + +``` +@misc{kunchukuttan2020indicnlp, +author = "Anoop Kunchukuttan", +title = "{The IndicNLP Library}", +year = "2020", +howpublished={\url{https://github.com/anoopkunchukuttan/indic_nlp_library/blob/master/docs/indicnlp.pdf}} +} +``` +You can find the document [HERE](docs/indicnlp.pdf) + +## Website + +`http://anoopkunchukuttan.github.io/indic_nlp_library` + +## Author +Anoop Kunchukuttan ([anoop.kunchukuttan@gmail.com](anoop.kunchukuttan@gmail.com)) + +## Companies, Organizations, Projects using IndicNLP Library + +- [AI4Bharat-IndicNLPSuite](https://indicnlp.ai4bharat.org) +- [The Classical Language Toolkit](http://cltk.org) +- [Microsoft NLP Recipes](https://github.com/microsoft/nlp-recipes) +- [Facebook M2M-100](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100) + +## Revision Log + + +0.81 : 26 May 2021 + + - Bug fix in version number extraction + +0.80 : 24 May 2021 + + - Improved sentence splitting + - Bug fixes + - Support for Urdu Normalizer + +0.71 : 03 Sep 2020 + + - Improved documentation + - Bug fixes + +0.7 : 02 Apr 2020: + + - Unified commandline + - Improved documentation + - Added setup.py + +0.6 : 16 Dec 2019: + + - New romanizer and indicizer + - Script Unifiers + - Improved script normalizers + - Added contrib directory for sample uses + - changed to MIT license + +0.5 : 03 Jun 2019: + + - Improved word tokenizer to handle dates and numbers. + - Added sentence splitter that can handle common prefixes/honorofics and uses some heuristics. + - Added detokenizer + - Added acronym transliterator that can convert English acronyms to Brahmi-derived scripts + +0.4 : 28 Jan 2019: Ported to Python 3, and lots of feature additions since last release; primarily around script information, script similarity and syllabification. 
+ +0.3 : 21 Oct 2014: Supports morph-analysis between Indian languages + +0.2 : 13 Jun 2014: Supports transliteration between Indian languages and tokenization of Indian languages + +0.1 : 12 Mar 2014: Initial version. Supports text normalization. + +## LICENSE + +Indic NLP Library is released under the MIT license + + diff --git a/indic_nlp_library/contrib/README.md b/indic_nlp_library/contrib/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0a99b9ddd9e9bcc72bae930fc8a778f3094fea50 --- /dev/null +++ b/indic_nlp_library/contrib/README.md @@ -0,0 +1,7 @@ +# Contrib + +Contains additional utilities and applications using Indic NLP library core + +- `indic_scraper_project_sample.ipynb`: A simple pipeline for building monolingual corpora for Indian languages from crawled web content, Wikipedia, etc. An extensible framework which allows incorporation of website specific extractors, whereas generic NLP tasks like tokenization, sentence splitting, normalization, etc. are handled by the framework. +- `correct_moses_tokenizer.py`: This script corrects the incorrect tokenization done by Moses tokenizer. The Moses tokenizer splits on nukta and halant characters. +- `hindi_to_kannada_transliterator.py`: This script transliterates Hindi to Kannada. It removes/remaps characters only found in Hindi. It also adds halanta to words ending with consonant - as is the convention in Kannada. diff --git a/indic_nlp_library/contrib/correct_moses_tokenizer.py b/indic_nlp_library/contrib/correct_moses_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..9c656d4d69fd16638dbfa4a4435920bea50a6fe5 --- /dev/null +++ b/indic_nlp_library/contrib/correct_moses_tokenizer.py @@ -0,0 +1,29 @@ +import sys +from indicnlp import langinfo +from indicnlp import loader + +if __name__ == '__main__': + """ + This script corrects the incorrect tokenization done by Moses tokenizer. 
+ The Moses tokenizer splits on nukta and halant characters + Usage: python correct_moses_tokenizer.py <infname> <outfname> <langcode> + """ + + loader.load() + + infname=sys.argv[1] + outfname=sys.argv[2] + lang=sys.argv[3] + + halant_char=langinfo.offset_to_char(langinfo.HALANTA_OFFSET,lang) + nukta_char=langinfo.offset_to_char(langinfo.NUKTA_OFFSET,lang) + + with open(infname,'r',encoding='utf-8') as infile, \ + open(outfname,'w',encoding='utf-8') as outfile: + for line in infile: + outfile.write( + line.replace( + ' {} '.format(halant_char), halant_char).replace( + ' {} '.format(nukta_char), nukta_char).replace( + ' {}{}'.format(nukta_char,halant_char),'{}{}'.format(nukta_char,halant_char)) + ) diff --git a/indic_nlp_library/contrib/hindi_to_kannada_transliterator.py b/indic_nlp_library/contrib/hindi_to_kannada_transliterator.py new file mode 100644 index 0000000000000000000000000000000000000000..a88f7d42120a0ae6eedaea91080c8d2a75539ee8 --- /dev/null +++ b/indic_nlp_library/contrib/hindi_to_kannada_transliterator.py @@ -0,0 +1,62 @@ +import sys +from indicnlp import common +common.set_resources_path(INDIC_NLP_RESOURCES) + +from indicnlp import loader +from indicnlp.normalize import indic_normalize +from indicnlp.transliterate import unicode_transliterate + +if __name__ == '__main__': + """ + This script transliterates Hindi to Kannada. It removes/remaps + characters only found in Hindi. It also adds halanta to words ending + with consonant - as is the convention in Kannada + """ + + infname=sys.argv[1] # one sentence/word per line. 
Sentences should be space-tokenized + outfname=sys.argv[2] + loader.load() + + normalizer_factory=indic_normalize.IndicNormalizerFactory() + normalizer=normalizer_factory.get_normalizer('hi') + + with open(infname,'r',encoding='utf-8') as infile, \ + open(outfname,'w',encoding='utf-8') as outfile: + for line in infile: + line=line.strip() + line=normalizer.normalize(line) + + ## replace chandrabindus with anusvara + line=line.replace('\u0900','\u0902') + line=line.replace('\u0901','\u0902') + + ### replace chandra e and o diacritics with e and o respectively + #line=line.replace('\u0945','\u0947') + #line=line.replace('\u0949','\u094b') + + ### replace chandra e and o diacritics with a diacritic + ## this seems to be general usage + line=line.replace('\u0945','\u093e') + line=line.replace('\u0949','\u093e') + + ## remove nukta + line=line.replace('\u093c','') + + ## add halant if word ends with consonant (empty tokens from blank lines or repeated spaces are passed through unchanged) + ## NOTE(review): `isc` is never imported in this script; presumably it is + ## indicnlp.script.indic_scripts -- confirm and add the import at the top + words=line.split(' ') + outwords=[] + for word in words: + if word and isc.is_consonant(isc.get_phonetic_feature_vector(word[-1],'hi')): + word=word+'\u094d' + outwords.append(word) + line=' '.join(outwords) + + + ## script conversion + line=unicode_transliterate.UnicodeIndicTransliterator.transliterate(line,'hi','kn') + + outfile.write(line+'\n') + + diff --git a/indic_nlp_library/contrib/indic_scraper_project_sample.ipynb b/indic_nlp_library/contrib/indic_scraper_project_sample.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..5ada2c984ecd5dd0b88c5eb91591d02b803c0fed --- /dev/null +++ b/indic_nlp_library/contrib/indic_scraper_project_sample.ipynb @@ -0,0 +1,569 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pre-requisites\n", + "\n", + "- Python 3.5+\n", + "- Python packages: \n", + " - `pip install bs4 pandas mmh3`\n", + "- [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library)\n", 
+ "- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Initialize the Indic NLP Library\n", + "\n", + "Run the cell below to initialize the Indic NLP Library" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The path to the local git repo for Indic NLP Library\n", + "INDIC_NLP_LIB_HOME=\"/disk1/src/indic_nlp_library\"\n", + "\n", + "# The path to the local git repo for Indic NLP Resources\n", + "INDIC_NLP_RESOURCES=\"/disk1/src/indic_nlp_resources\"\n", + "\n", + "import sys\n", + "sys.path.append('{}/src'.format(INDIC_NLP_LIB_HOME))\n", + "\n", + "from indicnlp import common\n", + "common.set_resources_path(INDIC_NLP_RESOURCES)\n", + "\n", + "from indicnlp import loader\n", + "loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup\n", + "import os\n", + "import string\n", + "import indicnlp\n", + "from indicnlp.tokenize import indic_tokenize\n", + "from indicnlp.normalize import indic_normalize\n", + "from indicnlp.transliterate import unicode_transliterate\n", + "from indicnlp.tokenize import sentence_tokenize\n", + "import re\n", + "import collections\n", + "import random\n", + "import mmh3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Common Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess_sent(text,lang,normalizer):\n", + " \"\"\"\n", + " Pre-process text (normalization and tokenization)\n", + " \n", + " text: text string to preprocess\n", + " lang: language code (2-letter ISO code)\n", + " normalizer: normalizer object for language\n", + " \n", + " returns the processed text string\n", + " \"\"\"\n", + " return ' 
'.join(indic_tokenize.trivial_tokenize(normalizer.normalize(text.replace('\\n',' ')),lang)) \n", + "\n", + "def sent_split(text,lang):\n", + " \"\"\"\n", + " Sentence splitter\n", + " \n", + " text: text to sentence split \n", + " lang: language\n", + " \n", + " returns list of sentences \n", + " \"\"\"\n", + " return sentence_tokenize.sentence_split(text,lang)\n", + "\n", + "def extract_all_content(indir,lang,\n", + " article_extract_fn,\n", + " preprocess_fn=preprocess_sent,\n", + " narticles=-1,\n", + " start_artid=0):\n", + " \"\"\"\n", + " This method reads all files from the input directory, extracts text content from each file,\n", + " and pre-processes the text. This method is a generator. \n", + " For each sentence, the method yields a tuple of the format: \n", + " \n", + " (artid, fname, paraid, sentid, processed_text)\n", + " \n", + " indir: path to input directory containing files to be parsed \n", + " \n", + " lang: language of the files in the input directory\n", + " \n", + " article_extract_fn: the function to extract text content from each file. \n", + " Signature of the function: get_article_contents(fname,lang,encoding) \n", + " `fname` is name of the file, `lang` is langcode, \n", + " `encoding` is text-encoding (default=utf-8). \n", + " The function yields a tuple (paraid, sentid, extracted_text) \n", + " for each sentence.\n", + " \n", + " preprocess_fn: pre-processing function to apply to the extracted text. \n", + " It is called as preprocess_fn(text, lang, normalizer) and returns the processed string.\n", + " \n", + " narticles: extract and process the first `narticles` from input directory. 
\n", + " if narticles=-1 (default), all files are extracted\n", + " \n", + " start_artid: the start of the article id to assign to extracted articles (default=0)\n", + " \n", + " \"\"\"\n", + "\n", + " fnames = os.listdir(indir)\n", + " if narticles>0:\n", + " fnames=fnames[:narticles]\n", + " nsent=0\n", + "\n", + " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n", + " normalizer=normalizer_factory.get_normalizer(lang)\n", + " \n", + " print('Number of articles: {}'.format(len(fnames)))\n", + " for artid, fname in enumerate(fnames,start_artid):\n", + "# print(fname)\n", + " if artid%100 == 0:\n", + " print('({}|{})'.format(artid,nsent),end=' ... ')\n", + " \n", + " try:\n", + " fpath=os.sep.join([indir,fname])\n", + " for paraid, sentid, sent in article_extract_fn(fpath,lang):\n", + " nsent+=1\n", + " yield( ( artid, fname, paraid, sentid, preprocess_fn(sent,lang,normalizer) ) )\n", + " except Exception: # not a bare except: GeneratorExit/KeyboardInterrupt raised at the yield must propagate\n", + " print('Cannot parse {}'.format(fname))\n", + " \n", + "def write_corpus(corpus_iterator,content_fname,article_mapping_fname,delimiter=' ||| ', encoding='utf-8'):\n", + " \"\"\"\n", + " Writes the extracted corpus to a file. The extracted data is organized in terms of articles, paragraphs \n", + " and sentences. The following is the format of the output file: \n", + " - one line per sentence\n", + " - format of line: article_id, para_id, sent_id, sentence\n", + " In addition to the content file mention, a metadata file which maps the article id to the filename is also written. \n", + " \n", + " corpus_iterator: iterator over the corpus, yielding tuple (artid, fname, paraid, sentid, processed_text). \n", + " The function `extract_all_content` yields a generator in this format. \n", + " content_fname: output content file to write the extracted data to in the format mentioned above\n", + " article_mapping_fname: output metadata file to write article id to filename mapping.\n", + " delimiter=' ||| ': delimiter for the content file. 
The default delimiter is the same \n", + " as used in the Moses phrase table\n", + " encoding: text encoding default - 'utf-8'\n", + " \n", + " \"\"\"\n", + " \n", + " artid_name_mapping={}\n", + " with open(content_fname,'w',encoding=encoding) as contentfile:\n", + " for artid, fname, paraid, sentid, text in corpus_iterator:\n", + " contentfile.write(delimiter.join([str(artid), str(paraid), str(sentid), text]) + '\\n')\n", + " artid_name_mapping[artid]=fname\n", + "\n", + " with open(article_mapping_fname,'w',encoding=encoding) as artmappingfile:\n", + " for artid, name in sorted(artid_name_mapping.items(),key=lambda x: x[0]):\n", + " artmappingfile.write('{} {} {}\\n'.format(artid,delimiter,name))\n", + "\n", + "def convert_txt_to_csv_format(infname, outfname, encoding='utf-8'):\n", + " \"\"\"\n", + " convert txt file to csv format. This method is used when the text file is directly available.\n", + " The input file has one sentence per line. Assumed to be preprocessed (tokenized, normalized)\n", + " \n", + " \"\"\"\n", + " with open(infname,'r',encoding=encoding) as infile, \\\n", + " open(outfname,'w',encoding=encoding) as outfile: \n", + " for i, line in enumerate(infile):\n", + " outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,line.strip()))\n", + " \n", + "def preprocess_convert_txt_to_csv_format(infname, outfname, lang, encoding='utf-8'):\n", + " \"\"\"\n", + " Convert raw text file to csv format\n", + " \"\"\"\n", + " \n", + " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n", + " normalizer=normalizer_factory.get_normalizer(lang)\n", + " \n", + " with open(infname,'r',encoding=encoding) as infile, \\\n", + " open(outfname,'w',encoding=encoding) as outfile: \n", + " i=0\n", + " for line in infile:\n", + " sents = sent_split(line.strip(),lang)\n", + " for sent in sents:\n", + " outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,\n", + " preprocess_sent(sent.strip(), lang, normalizer)) )\n", + " i=i+1\n", + "\n", + "def print_txt(infnames, 
outfname, encoding='utf-8'):\n", + " \"\"\"\n", + " Extract only the text from the content csv file. The output file has one sentence per file.\n", + " \"\"\"\n", + " with open(outfname,'w',encoding=encoding) as outfile: \n", + " for infname in filter(lambda x: os.path.isfile(x),infnames):\n", + " with open(infname,'r',encoding=encoding) as infile:\n", + " for i, line in enumerate(infile):\n", + " fields=line.strip().split('|||')\n", + " if len(fields) >=4:\n", + " outfile.write('{}\\n'.format(fields[3].strip()))\n", + " \n", + "# def dedup_and_print_txt(infnames, outfname, encoding='utf-8'):\n", + " \n", + "# total=0\n", + "# unique=0\n", + "# hash_codes=set()\n", + " \n", + "# with open(outfname,'w',encoding=encoding) as outfile: \n", + "# for infname in filter(lambda x: os.path.isfile(x),infnames):\n", + "# with open(infname,'r',encoding=encoding) as infile:\n", + "# for i, line in enumerate(infile):\n", + "# fields=line.strip().split('|||')\n", + "# if len(fields) >=4:\n", + "# sent=fields[3].strip()\n", + "# total+=1\n", + "# hs=hash(sent)\n", + "# if hs not in hash_codes:\n", + "# outfile.write('{}\\n'.format(sent))\n", + "# hash_codes.add(hs)\n", + "# unique+=1\n", + " \n", + "# print('Total: {}'.format(total))\n", + "# print('Unique: {}'.format(unique))\n", + "\n", + "def dedup_shuffle_and_print_txt(infnames, outfname, max_buf_size=100000,encoding='utf-8'):\n", + " \"\"\"\n", + " The method creates a sentence level corpora from multiple content csv files.\n", + " All sentences are extracted, they are de-duplicated using murmurhash and shuffled\n", + " before writing the entire corpus to the output file. 
The output file has one sentence per line.\n", + "\n", + " \"\"\"\n", + " \n", + " total=0\n", + " unique=0\n", + " hash_codes=set()\n", + " sent_buffer=[]\n", + " \n", + " with open(outfname,'w',encoding=encoding) as outfile: \n", + " for infname in filter(lambda x: os.path.isfile(x),infnames):\n", + " print('Processing: {}'.format(infname))\n", + " with open(infname,'r',encoding=encoding) as infile:\n", + " for i, line in enumerate(infile):\n", + " fields=line.strip().split('|||')\n", + " if len(fields) >=4:\n", + " sent=fields[3].strip()\n", + " total+=1\n", + "# hs=hash(sent)\n", + " hs=mmh3.hash128(sent)\n", + " if hs not in hash_codes:\n", + "# outfile.write('{}\\n'.format(sent))\n", + " sent_buffer.append(sent)\n", + " hash_codes.add(hs)\n", + " unique+=1\n", + " if len(sent_buffer)>=max_buf_size:\n", + " random.shuffle(sent_buffer)\n", + " for sent in sent_buffer: \n", + " outfile.write('{}\\n'.format(sent))\n", + " sent_buffer.clear()\n", + " \n", + " if len(sent_buffer)>0:\n", + " random.shuffle(sent_buffer)\n", + " for sent in sent_buffer: \n", + " outfile.write('{}\\n'.format(sent))\n", + " sent_buffer.clear() \n", + " \n", + " print('Total: {}'.format(total))\n", + " print('Unique: {}'.format(unique))\n", + "\n", + "def extract_wikiextractor_file(infname, outfname, lang, \n", + " encoding='utf-8', delimiter=' ||| ', preprocess_fn=preprocess_sent):\n", + " \"\"\"\n", + " Extract text content into a content csv file from wikipedia article page. 
\n", + " The wikipedia article page is the output from `wikiextractor` [https://github.com/attardi/wikiextractor] \n", + " \n", + " \"\"\"\n", + " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n", + " normalizer=normalizer_factory.get_normalizer(lang)\n", + " \n", + " with open(infname,'r',encoding=encoding) as infile, \\\n", + " open(outfname,'w',encoding=encoding) as outfile: \n", + " artid=-1\n", + " paraid=0\n", + " for line in infile:\n", + " if line.find('<doc')==0:\n", + " artid+=1\n", + " paraid=0\n", + " continue\n", + " if line.find('</doc')==0:\n", + " continue\n", + " if len(line.strip())>0:\n", + " for sentid, sent in enumerate(sent_split(line.strip(),lang)):\n", + " sent=sent.strip()\n", + " if sent!='':\n", + " sent = preprocess_fn(sent,lang,normalizer)\n", + " outfile.write(delimiter.join([str(artid), str(paraid), str(sentid), sent]) + '\\n')\n", + " paraid+=1\n", + "\n", + " \n", + "def extract_leipzig_corpus(infname,outfname,lang,encoding='utf-8'):\n", + " \"\"\"\n", + " Extractor for files from the Leipzig corpus\n", + " [http://wortschatz.uni-leipzig.de/en/download/]\n", + " \n", + " \"\"\"\n", + " normalizer_factory=indic_normalize.IndicNormalizerFactory()\n", + " normalizer=normalizer_factory.get_normalizer(lang) \n", + "\n", + " with open(infname,'r',encoding=encoding) as infile, \\\n", + " open(outfname,'w',encoding=encoding) as outfile: \n", + " for i, line in enumerate(infile):\n", + " outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,preprocess_sent(line,lang,normalizer))) \n", + " \n", + "def dataset_stats(fname):\n", + " \"\"\"\n", + " Extracts dataset statistics from the final extracted file. This input file contains\n", + " one sentence per line. 
The sentences are tokenized.\n", + " \"\"\"\n", + "\n", + " all_puncs=set(string.punctuation+'\\u0964\\u0965')\n", + " \n", + " sent_count=0\n", + " token_cnt=0\n", + " true_token_cnt=0\n", + " tokens=set()\n", + " \n", + " with open(fname,'r',encoding='utf-8') as infile:\n", + " for line in infile:\n", + " sent_count+=1\n", + " a=line.strip().split(' ')\n", + " token_cnt+=len(a)\n", + " b=list(filter(lambda x: x not in all_puncs,a))\n", + " true_token_cnt+=len(b)\n", + " tokens.update(b)\n", + " \n", + " print('== Stats ==')\n", + " print('Sent count: {}'.format(sent_count))\n", + " print('Token count: {}'.format(token_cnt))\n", + " print('True Token count: {}'.format(true_token_cnt))\n", + " print('Unique Token count: {}'.format(len(tokens)))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Marathi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wikipedia" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wikipedia extraction commands using wikiextractor\n", + "\n", + "```\n", + "### This uses WikiExtractor (https://github.com/attardi/wikiextractor)\n", + "\n", + "x=/disk1/crawl_project/ta/wikipedia\n", + "mkdir $x\n", + "cd $x\n", + "wget https://dumps.wikimedia.org/tawiki/20190501/tawiki-20190501-pages-articles-multistream.xml.bz2\n", + "cd /disk1/src/wikiextractor\n", + "python3 WikiExtractor.py -cb 250k -o $x/extracted $x/tawiki-20190501-pages-articles-multistream.xml.bz2\n", + "cd -\n", + "find extracted -name '*bz2' -exec bunzip2 -c {} \\; > text.xml\n", + "rm text.xml\n", + "rm tawiki-20190501-pages-articles-multistream.xml.bz2\n", + "rm -rf extracted\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "mrwiki-20190401-pages-articles-multistream.xml.bz2\n", + "\n", + "INFO: Finished 1-process extraction of 53715 articles in 123.6s (434.7 art/s)\n", + "\n", + "INFO: total of page: 102025, total of articl page: 53715; total of used 
articl page: 53715" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Post-processing output generated by wikiextractor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## text.xml is extracted as shown in commands above\n", + "extract_wikiextractor_file('text.xml',\n", + " 'content_fname1.csv',\n", + " 'mr')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loksatta" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Extractor function for Marathi Loksatta page**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_article_contents_mr_loksatta(fname,lang,encoding='utf-8'):\n", + " with open(fname,'r',encoding=encoding) as infile: \n", + " soup = BeautifulSoup(infile)\n", + " for elem in soup.find_all('div'):\n", + " if 'itemprop' in elem.attrs and 'articleBody' in elem['itemprop']:\n", + " filtered_paras=list(filter(lambda x: x.name=='p' and len(x.attrs)==0,elem.children))\n", + " paraid=0\n", + " for blockid, block in enumerate(filtered_paras):\n", + "# print('Para: {}'.format(blockid))\n", + "# print(list(block.strings))\n", + " text=' '.join(block.strings)\n", + " if blockid==0 and text.find(':')>=0 and text.find(':')<20:\n", + " text=':'.join(text.split(':')[1:])\n", + " for para_text in text.split('\\n'): \n", + " for sentid, sent in enumerate(sent_split(para_text,lang)):\n", + " sent=sent.strip()\n", + " if sent!='':\n", + " # print('{}: {}'.format(sentid, sent))\n", + " yield((paraid,sentid,sent))\n", + " # yield((paraid,sentid,preprocess_sent(sent,'ml',normalizer)))\n", + " # print() \n", + " paraid+=1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Extracting data from crawled HTML files**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"lang='mr'\n", + "posts_dir='directory_containing_crawled_html_pages'\n", + "content_fname='content_fname2.csv'\n", + "article_mapping_fname='article_mapping_fname'\n", + "get_article_contents=get_article_contents_mr_loksatta\n", + "narticles=-1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "write_corpus(\n", + " extract_all_content(posts_dir, lang, article_extract_fn=get_article_contents,narticles=narticles),\n", + " content_fname,\n", + " article_mapping_fname\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Aggregating all crawled data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### aggregating, de-duplicating and shuffling all the data \n", + "dedup_shuffle_and_print_txt([ 'content_fname1.csv', 'content_fname2.csv' ], 'output_fname.txt' )\n", + "### extract dataset statistics\n", + "dataset_stats('output_fname.txt')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + }, + "toc": { + "base_numbering": 1, + "nav_menu": { + "height": "703px", + "width": "326px" + }, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/indic_nlp_library/docs/Makefile b/indic_nlp_library/docs/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..faf86259fdbcb0dff091c22d980623b622f2bbd4 --- /dev/null +++ 
b/indic_nlp_library/docs/Makefile @@ -0,0 +1,153 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info changes linkcheck doctest gettext + +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
+ +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/IndicNLPLibrary.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/IndicNLPLibrary.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/IndicNLPLibrary" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/IndicNLPLibrary" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." 
+ +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + $(MAKE) -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." 
diff --git a/indic_nlp_library/docs/cmd.rst b/indic_nlp_library/docs/cmd.rst new file mode 100644 index 0000000000000000000000000000000000000000..86303fb6edc5f7e9921ba96be3735bd8f6f3ec67 --- /dev/null +++ b/indic_nlp_library/docs/cmd.rst @@ -0,0 +1,8 @@ +Commandline +=========== + +.. argparse:: + :module: indicnlp.cli.cliparser + :func: get_parser + :prog: cliparser.py + diff --git a/indic_nlp_library/docs/code.rst b/indic_nlp_library/docs/code.rst new file mode 100644 index 0000000000000000000000000000000000000000..282754cc88a41df2aec511a337b84e1dcafff202 --- /dev/null +++ b/indic_nlp_library/docs/code.rst @@ -0,0 +1,5 @@ +Auto Generated Documentation +============================ + +.. automodule:: indicnlp.langinfo indicnlp.common + :members: diff --git a/indic_nlp_library/docs/conf.py b/indic_nlp_library/docs/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..d3132f5a15fad47556894165ddc6175b6c8adbab --- /dev/null +++ b/indic_nlp_library/docs/conf.py @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- +# +# Indic NLP Library documentation build configuration file, created by +# sphinx-quickstart on Tue Nov 3 01:50:37 2015. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.abspath('..')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. 
They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', 'sphinxarg.ext'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Indic NLP Library' +copyright = '2015, Anoop Kunchukuttan' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.2' +# The full version, including alpha/beta/rc tags. +release = '0.2' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. 
+pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# "<project> v<release> documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. 
+#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'IndicNLPLibrarydoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'IndicNLPLibrary.tex', 'Indic NLP Library Documentation', + 'Anoop Kunchukuttan', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. 
+#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'indicnlplibrary', 'Indic NLP Library Documentation', + ['Anoop Kunchukuttan'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'IndicNLPLibrary', 'Indic NLP Library Documentation', + 'Anoop Kunchukuttan', 'IndicNLPLibrary', 'NLP library for Indian languages', + 'NLP'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' diff --git a/indic_nlp_library/docs/index.rst b/indic_nlp_library/docs/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..a9b097b1b620dd136c5026cd401ea85a71de2a55 --- /dev/null +++ b/indic_nlp_library/docs/index.rst @@ -0,0 +1,22 @@ +.. Indic NLP Library documentation master file, created by + sphinx-quickstart on Tue Nov 3 01:50:37 2015. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +:github_url: https://github.com/anoopkunchukuttan/indic_nlp_library + +.. toctree:: + :maxdepth: 2 + :caption: Packages + + indicnlp + +.. 
toctree:: + :maxdepth: 2 + :caption: Commandline + + cmd + + + + diff --git a/indic_nlp_library/docs/indicnlp.MD b/indic_nlp_library/docs/indicnlp.MD new file mode 100644 index 0000000000000000000000000000000000000000..061775179a9782f2935b45977423111e61f3d85b --- /dev/null +++ b/indic_nlp_library/docs/indicnlp.MD @@ -0,0 +1,122 @@ +# Indic NLP Library +## A unified approach to NLP for Indian languages + +### Anoop Kunchukuttan (`anoop.kunchukuttan@gmail.com`) + +The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. Indian languages share a lot of similarity in terms of script, phonology, language syntax, etc. and this library is an attempt to provide a general solution to very commonly required toolsets for Indian language text. + +The library provides the following functionalities: + +- Text Normalization +- Script Information +- Word Tokenization and Detokenization +- Sentence Splitting +- Word Segmentation +- Syllabification +- Script Conversion +- Romanization +- Indicization +- Transliteration +- Translation + +The data resources required by the Indic NLP Library are hosted in a different repository. These resources are required for some modules. You can download from the [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) project. + +**If you are interested in Indian language NLP resources, you should check the [Indic NLP Catalog](https://github.com/anoopkunchukuttan/indic_nlp_library) for pointers.** + +## Pre-requisites + +- Python 3.x + - (For Python 2.x version check the tag `PYTHON_2.7_FINAL_JAN_2019`. 
Not actively supporting Python 2.x anymore, but will try to maintain as much compatibility as possible) +- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) +- Other dependencies are listed in setup.py + + +## Configuration + +- Installation from pip: + + `pip install indic-nlp-library` + +- If you want to use the project from the github repo, add the project to the Python Path: + + - Clone this repository + - Install dependencies: `pip install -r requirements.txt` + - Run: `export PYTHONPATH=$PYTHONPATH:<project base directory>` + +- In either case, export the path to the _Indic NLP Resources_ directory + + Run: `export INDIC_RESOURCES_PATH=<path to Indic NLP Resources directory>` + +## Usage + +You can use the Python API to access all the features of the library. Many of the most common operations are also accessible via a unified commandline API. + +### Getting Started + +Check [this IPython Notebook](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples.ipynb) for examples to use the Python API. + - You can find the Python 2.x Notebook [here](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples_2_7.ipynb) + +### Documentation + +You can find detailed documentation [HERE](https://indic-nlp-library.readthedocs.io/en/latest) + +This documents the Python API as well as the commandline reference. 
+ +## Citing + +If you use this library, please include the following citation: + +``` +@unpublished{kunchukuttan2020indicnlp, +author = "Anoop Kunchukuttan", +title = "The IndicNLP Library", +year = "2020", +} +``` +You can find the document [HERE](docs/indicnlp.pdf) + +## Website + +`http://anoopkunchukuttan.github.io/indic_nlp_library` + +## Author +Anoop Kunchukuttan ([anoop.kunchukuttan@gmail.com](mailto:anoop.kunchukuttan@gmail.com)) + +## Version: 0.7 + +## Revision Log + +0.7 : 02 Apr 2020: + + - Unified commandline + - Improved documentation + - Added setup.py + +0.6 : 16 Dec 2019: + + - New romanizer and indicizer + - Script Unifiers + - Improved script normalizers + - Added contrib directory for sample uses + - Changed to MIT license + +0.5 : 03 Jun 2019: + + - Improved word tokenizer to handle dates and numbers. + - Added sentence splitter that can handle common prefixes/honorifics and uses some heuristics. + - Added detokenizer + - Added acronym transliterator that can convert English acronyms to Brahmi-derived scripts + +0.4 : 28 Jan 2019: Ported to Python 3, and lots of feature additions since last release; primarily around script information, script similarity and syllabification. + +0.3 : 21 Oct 2014: Supports morph-analysis between Indian languages + +0.2 : 13 Jun 2014: Supports transliteration between Indian languages and tokenization of Indian languages + +0.1 : 12 Mar 2014: Initial version. Supports text normalization. + +## LICENSE + +Indic NLP Library is released under the MIT license + + diff --git a/indic_nlp_library/docs/indicnlp.cli.rst b/indic_nlp_library/docs/indicnlp.cli.rst new file mode 100644 index 0000000000000000000000000000000000000000..586ece7892d4be9a644d39dafff864f0655e9a41 --- /dev/null +++ b/indic_nlp_library/docs/indicnlp.cli.rst @@ -0,0 +1,11 @@ +cli Package +============= + +:mod:`cliparser` Module +-------------------------------- + +.. 
automodule:: indicnlp.cli.cliparser + :members: + :undoc-members: + :show-inheritance: + diff --git a/indic_nlp_library/docs/indicnlp.morph.rst b/indic_nlp_library/docs/indicnlp.morph.rst new file mode 100644 index 0000000000000000000000000000000000000000..70074a16683ea38a6cc4aec968d56b85f3e1c536 --- /dev/null +++ b/indic_nlp_library/docs/indicnlp.morph.rst @@ -0,0 +1,11 @@ +morph Package +============= + +:mod:`unsupervised_morph` Module +-------------------------------- + +.. automodule:: indicnlp.morph.unsupervised_morph + :members: + :undoc-members: + :show-inheritance: + diff --git a/indic_nlp_library/docs/indicnlp.normalize.rst b/indic_nlp_library/docs/indicnlp.normalize.rst new file mode 100644 index 0000000000000000000000000000000000000000..afd7e7dad942eeb7128d1394017596998cb5589c --- /dev/null +++ b/indic_nlp_library/docs/indicnlp.normalize.rst @@ -0,0 +1,15 @@ +normalize Package +================= + +:mod:`indic_normalize` Module +----------------------------- + +.. automodule:: indicnlp.normalize.indic_normalize + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: indicnlp.normalize.indic_normalize. + :members: + :undoc-members: + :show-inheritance: diff --git a/indic_nlp_library/docs/indicnlp.pdf b/indic_nlp_library/docs/indicnlp.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1a29abce2b95d74d55b73e43840772faf8b927e3 Binary files /dev/null and b/indic_nlp_library/docs/indicnlp.pdf differ diff --git a/indic_nlp_library/docs/indicnlp.rst b/indic_nlp_library/docs/indicnlp.rst new file mode 100644 index 0000000000000000000000000000000000000000..a14f3d280339cdbb668615982fb548d76a2ea96c --- /dev/null +++ b/indic_nlp_library/docs/indicnlp.rst @@ -0,0 +1,47 @@ +indicnlp Package +================ + +:mod:`common` Module +-------------------- + +.. automodule:: indicnlp.common + :members: + :undoc-members: + :show-inheritance: + +:mod:`langinfo` Module +---------------------- + +.. 
automodule:: indicnlp.langinfo + :members: + :undoc-members: + :show-inheritance: + +:mod:`loader` Module +-------------------- + +.. automodule:: indicnlp.loader + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + indicnlp.cli + indicnlp.morph + indicnlp.normalize + indicnlp.script + indicnlp.syllable + indicnlp.tokenize + indicnlp.transliterate + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/indic_nlp_library/docs/indicnlp.script.rst b/indic_nlp_library/docs/indicnlp.script.rst new file mode 100644 index 0000000000000000000000000000000000000000..987ccc911840336bf1eaaf1959584537f2bd1804 --- /dev/null +++ b/indic_nlp_library/docs/indicnlp.script.rst @@ -0,0 +1,26 @@ +script Package +============== + +:mod:`indic_scripts` Module +--------------------------- + +.. automodule:: indicnlp.script.indic_scripts + :members: + :undoc-members: + :show-inheritance: + +:mod:`english_script` Module +---------------------------- + +.. automodule:: indicnlp.script.english_script + :members: + :undoc-members: + :show-inheritance: + +:mod:`phonetic_sim` Module +--------------------------- + +.. automodule:: indicnlp.script.phonetic_sim + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/indic_nlp_library/docs/indicnlp.syllable.rst b/indic_nlp_library/docs/indicnlp.syllable.rst new file mode 100644 index 0000000000000000000000000000000000000000..ea7a2517f2a6eacd9afc639cff1d48067dc34b6a --- /dev/null +++ b/indic_nlp_library/docs/indicnlp.syllable.rst @@ -0,0 +1,11 @@ +syllable Package +================ + +:mod:`syllabifier` Module +--------------------------- + +.. 
automodule:: indicnlp.syllable.syllabifier + :members: + :undoc-members: + :show-inheritance: + diff --git a/indic_nlp_library/docs/indicnlp.tokenize.rst b/indic_nlp_library/docs/indicnlp.tokenize.rst new file mode 100644 index 0000000000000000000000000000000000000000..032c29876f5cc73a3bc9fbd0e73c680b129e99a1 --- /dev/null +++ b/indic_nlp_library/docs/indicnlp.tokenize.rst @@ -0,0 +1,26 @@ +tokenize Package +================ + +:mod:`indic_tokenize` Module +---------------------------- + +.. automodule:: indicnlp.tokenize.indic_tokenize + :members: + :undoc-members: + :show-inheritance: + +:mod:`indic_detokenize` Module +------------------------------ + +.. automodule:: indicnlp.tokenize.indic_detokenize + :members: + :undoc-members: + :show-inheritance: + +:mod:`sentence_tokenize` Module +------------------------------- + +.. automodule:: indicnlp.tokenize.sentence_tokenize + :members: + :undoc-members: + :show-inheritance: diff --git a/indic_nlp_library/docs/indicnlp.transliterate.rst b/indic_nlp_library/docs/indicnlp.transliterate.rst new file mode 100644 index 0000000000000000000000000000000000000000..3d9059306f919c25ccaeb19d86a0d6e9ed498966 --- /dev/null +++ b/indic_nlp_library/docs/indicnlp.transliterate.rst @@ -0,0 +1,34 @@ +transliterate Package +===================== + +:mod:`sinhala_transliterator` Module +------------------------------------ + +.. automodule:: indicnlp.transliterate.sinhala_transliterator + :members: + :undoc-members: + :show-inheritance: + +:mod:`unicode_transliterate` Module +----------------------------------- + +.. automodule:: indicnlp.transliterate.unicode_transliterate + :members: + :undoc-members: + :show-inheritance: + +:mod:`acronym_transliterator` Module +------------------------------------ + +.. automodule:: indicnlp.transliterate.acronym_transliterator + :members: + :undoc-members: + :show-inheritance: + +:mod:`script_unifier` Module +----------------------------------- + +.. 
automodule:: indicnlp.transliterate.script_unifier + :members: + :undoc-members: + :show-inheritance: diff --git a/indic_nlp_library/docs/make.bat b/indic_nlp_library/docs/make.bat new file mode 100644 index 0000000000000000000000000000000000000000..2119f51099bf37e4fdb6071dce9f451ea44c62dd --- /dev/null +++ b/indic_nlp_library/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/indic_nlp_library/docs/modules.rst b/indic_nlp_library/docs/modules.rst new file mode 100644 index 0000000000000000000000000000000000000000..024709f38f94328dad6abef320c57066b7e25683 --- /dev/null +++ b/indic_nlp_library/docs/modules.rst @@ -0,0 +1,7 @@ +indicnlp +======== + +.. 
toctree:: + :maxdepth: 4 + + indicnlp diff --git a/indic_nlp_library/indicnlp/__init__.py b/indic_nlp_library/indicnlp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ad075593152cf94d30a903d8add28d8200badbb --- /dev/null +++ b/indic_nlp_library/indicnlp/__init__.py @@ -0,0 +1,10 @@ +import os +import sys + +try: + from .version import __version__ # noqa +except ImportError: + version_txt = os.path.join(os.path.dirname(__file__), "version.txt") + with open(version_txt) as f: + __version__ = f.read().strip() + diff --git a/indic_nlp_library/indicnlp/cli/__init__.py b/indic_nlp_library/indicnlp/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/indic_nlp_library/indicnlp/cli/cliparser.py b/indic_nlp_library/indicnlp/cli/cliparser.py new file mode 100644 index 0000000000000000000000000000000000000000..eb8c8d712668e0814c0f25c162d7a73b329a4da4 --- /dev/null +++ b/indic_nlp_library/indicnlp/cli/cliparser.py @@ -0,0 +1,266 @@ +import argparse +import sys + +from indicnlp import loader +from indicnlp.tokenize import indic_tokenize +from indicnlp.tokenize import indic_detokenize +from indicnlp.normalize import indic_normalize +from indicnlp.morph import unsupervised_morph +from indicnlp.tokenize import sentence_tokenize +from indicnlp.syllable import syllabifier +from indicnlp.transliterate import unicode_transliterate +from indicnlp.transliterate import script_unifier + +DEFAULT_ENCODING='utf-8' + +def run_detokenize(args): + for line in args.infile: + args.outfile.write(indic_detokenize.trivial_detokenize(line,args.lang)) + +def run_tokenize(args): + for line in args.infile: + args.outfile.write(' '.join( + indic_tokenize.trivial_tokenize(line,args.lang))) + +def run_sentence_split(args): + text=' '.join([ l.replace('\n','').replace('\r','') for l in args.infile]) + outlines=sentence_tokenize.sentence_split(text,args.lang) + for line in outlines: + 
args.outfile.write(line+'\n') + +def run_normalize(args): + + # TODO: add more options to cli + remove_nuktas=False + normalize_nasals='do_nothing' + + # create normalizer + factory=indic_normalize.IndicNormalizerFactory() + normalizer=factory.get_normalizer(args.lang, + remove_nuktas=remove_nuktas, + nasals_mode=normalize_nasals) + + # DO normalization + for line in args.infile: + normalized_line=normalizer.normalize(line) + args.outfile.write(normalized_line) + +def run_morph(args): + + add_marker=False + analyzer=unsupervised_morph.UnsupervisedMorphAnalyzer(args.lang,add_marker) + for line in args.infile: + morph_tokens=analyzer.morph_analyze_document(line.strip().split(' ')) + args.outfile.write(' '.join(morph_tokens) + '\n') + +def run_syllabify(args): + for line in args.infile: + new_line = ' '.join( + [ ' '.join(syllabifier.orthographic_syllabify(w,args.lang)) + for w in line.strip().split(' ') ] + ) + args.outfile.write(new_line+'\n') + +def run_wc(args): + # if args.l==False and args.w==False and args.c==False: + # args.l, args.w, args.c= True, True, True + + nl=0 + nw=0 + nc=0 + + for line in args.infile: + nl+=1 + nw+=len(line.strip(' ').split(' ')) + nc+=len(line) + + print('{} {} {}'.format(nl,nw,nc)) + +def run_indic2roman(args): + for line in args.infile: + transliterated_line=unicode_transliterate.ItransTransliterator.to_itrans( + line,args.lang) + args.outfile.write(transliterated_line) + +def run_roman2indic(args): + for line in args.infile: + transliterated_line=unicode_transliterate.ItransTransliterator.from_itrans( + line,args.lang) + args.outfile.write(transliterated_line) + +def run_script_unify(args): + + unifier=None + + if args.mode=='aggressive': + unifier=script_unifier.AggressiveScriptUnifier(nasals_mode='to_anusvaara_relaxed', common_lang=args.common_lang) + + elif args.mode=='basic': + unifier=script_unifier.BasicScriptUnifier(nasals_mode='do_nothing', + common_lang=args.common_lang) + + elif args.mode=='naive': + 
unifier=script_unifier.NaiveScriptUnifier(common_lang=args.common_lang) + + assert(unifier is not None) + + for line in args.infile: + transliterated_line=unifier.transform(line,args.lang) + args.outfile.write(transliterated_line) + +def run_script_convert(args): + for line in args.infile: + transliterated_line=unicode_transliterate.UnicodeIndicTransliterator.transliterate( + line,args.srclang,args.tgtlang) + args.outfile.write(transliterated_line) + +def add_common_monolingual_args(task_parser): + task_parser.add_argument('infile', + type=argparse.FileType('r',encoding=DEFAULT_ENCODING), + nargs='?', + default=sys.stdin, + help='Input File path', + ) + task_parser.add_argument('outfile', + type=argparse.FileType('w',encoding=DEFAULT_ENCODING), + nargs='?', + default=sys.stdout, + help='Output File path', + ) + task_parser.add_argument('-l', '--lang', + help='Language', + ) + +def add_common_bilingual_args(task_parser): + task_parser.add_argument('infile', + type=argparse.FileType('r',encoding=DEFAULT_ENCODING), + nargs='?', + default=sys.stdin, + help='Input File path', + ) + task_parser.add_argument('outfile', + type=argparse.FileType('w',encoding=DEFAULT_ENCODING), + nargs='?', + default=sys.stdout, + help='Output File path', + ) + task_parser.add_argument('-s', '--srclang', + help='Source Language', + ) + + task_parser.add_argument('-t', '--tgtlang', + help='Target Language', + ) + +def add_tokenize_parser(subparsers): + task_parser=subparsers.add_parser('tokenize', + help='tokenizer help') + add_common_monolingual_args(task_parser) + task_parser.set_defaults(func=run_tokenize) + +def add_detokenize_parser(subparsers): + task_parser=subparsers.add_parser('detokenize', + help='de-tokenizer help') + add_common_monolingual_args(task_parser) + task_parser.set_defaults(func=run_detokenize) + +def add_sentence_split_parser(subparsers): + task_parser=subparsers.add_parser('sentence_split', help='sentence split help') + add_common_monolingual_args(task_parser) + 
def add_roman2indic_parser(subparsers):
    """
    Register the 'roman2indic' subcommand.

    The subcommand takes the common monolingual arguments and dispatches to
    run_roman2indic().
    """
    task_parser=subparsers.add_parser('roman2indic', help='roman2indic help')
    add_common_monolingual_args(task_parser)
    # previously run_indic2roman (copy-paste from add_indic2roman_parser), which
    # made 'roman2indic' transliterate in the wrong direction
    task_parser.set_defaults(func=run_roman2indic)
import os

# Path to the Indic NLP Resources directory. Stays empty until it is set via
# set_resources_path() or picked up from the environment by init().
INDIC_RESOURCES_PATH=''

def init():
    """
    Initialize the module. The following actions are performed:

    - Checks if the INDIC_RESOURCES_PATH variable is set. If not, tries to initialize it from
      the INDIC_RESOURCES_PATH environment variable. If that fails (the environment variable
      is missing or empty), an IndicNlpException is raised.
    """
    global INDIC_RESOURCES_PATH
    if INDIC_RESOURCES_PATH=='':
        # a missing variable is the only expected failure, so look it up with a
        # default instead of wrapping the access in a blanket `except Exception`
        INDIC_RESOURCES_PATH=os.environ.get('INDIC_RESOURCES_PATH','')

    if INDIC_RESOURCES_PATH=='':
        raise IndicNlpException('INDIC_RESOURCES_PATH not set')



def get_resources_path():
    """
        Get the path to the Indic NLP Resources directory
    """
    return INDIC_RESOURCES_PATH

def set_resources_path(resources_path):
    """
        Set the path to the Indic NLP Resources directory
    """
    global INDIC_RESOURCES_PATH
    INDIC_RESOURCES_PATH=resources_path

class IndicNlpException(Exception):
    """
        Exceptions thrown by Indic NLP Library components are instances of this class.
        'msg' attribute contains exception details.
    """
    def __init__(self, msg):
        super().__init__(msg)
        self.msg = msg

    def __str__(self):
        # kept as repr() so the printed form (quoted message) is unchanged
        return repr(self.msg)
## language codes
LC_TA='ta'

# first and last code point of the Unicode block used by each language
SCRIPT_RANGES={
                'pa':[0x0a00,0x0a7f] ,
                'gu':[0x0a80,0x0aff] ,
                'or':[0x0b00,0x0b7f] ,
                'ta':[0x0b80,0x0bff] ,
                'te':[0x0c00,0x0c7f] ,
                'kn':[0x0c80,0x0cff] ,
                'ml':[0x0d00,0x0d7f] ,
                'si':[0x0d80,0x0dff] ,
                'hi':[0x0900,0x097f] ,
                'mr':[0x0900,0x097f] ,
                'kK':[0x0900,0x097f] ,
                'sa':[0x0900,0x097f] ,
                'ne':[0x0900,0x097f] ,
                'sd':[0x0900,0x097f] ,
                'bn':[0x0980,0x09ff] ,
                'as':[0x0980,0x09ff] ,
              }

DRAVIDIAN_LANGUAGES=['ta', 'te', 'kn', 'ml',]
IE_LANGUAGES=['hi', 'mr', 'kK', 'sa', 'ne', 'sd', 'bn', 'as', 'pa', 'gu', 'or', 'si', ]
DANDA_DELIM_LANGUAGES=['as','bn','hi','ne','or','pa','sa','sd']

URDU_RANGES=[
                [0x0600,0x06ff],
                [0x0750,0x077f],
                [0xfb50,0xfdff],
                [0xfe70,0xfeff],
            ]

# All offsets below are relative to the first code point of a script block
COORDINATED_RANGE_START_INCLUSIVE=0
COORDINATED_RANGE_END_INCLUSIVE=0x6f

NUMERIC_OFFSET_START=0x66
NUMERIC_OFFSET_END=0x6f

HALANTA_OFFSET=0x4d
AUM_OFFSET=0x50
NUKTA_OFFSET=0x3c

RUPEE_SIGN=0x20b9

DANDA=0x0964
DOUBLE_DANDA=0x0965

#TODO: add missing fricatives and approximants
VELAR_RANGE=[0x15,0x19]
PALATAL_RANGE=[0x1a,0x1e]
RETROFLEX_RANGE=[0x1f,0x23]
DENTAL_RANGE=[0x24,0x29]
LABIAL_RANGE=[0x2a,0x2e]

# verify
VOICED_LIST=[0x17,0x18,0x1c,0x1d,0x21,0x22,0x26,0x27,0x2c,0x2d]
UNVOICED_LIST=[0x15,0x16,0x1a,0x1b,0x1f,0x20,0x24,0x25,0x2a,0x2b] #TODO: add sibilants/sonorants
ASPIRATED_LIST=[0x16,0x18,0x1b,0x1d,0x20,0x22,0x25,0x27,0x2b,0x2d]
UNASPIRATED_LIST=[0x15,0x17,0x1a,0x1c,0x1f,0x21,0x24,0x26,0x2a,0x2c]
# the labial nasal is 0x2e (last entry of LABIAL_RANGE); 0x2d, listed here
# before, is the voiced aspirated labial already in VOICED_LIST/ASPIRATED_LIST
NASAL_LIST=[0x19,0x1e,0x23,0x28,0x29,0x2e]
FRICATIVE_LIST=[0x36,0x37,0x38]
APPROXIMANT_LIST=[0x2f,0x30,0x31,0x32,0x33,0x34,0x35]

#TODO: ha has to be properly categorized

def is_danda_delim(lang):
    """
    Returns True if danda/double danda is a possible delimiter for the language
    """
    return lang in DANDA_DELIM_LANGUAGES

def get_offset(c,lang):
    """
    Offset of character c from the start of the script block of lang.

    Applicable to Brahmi derived Indic scripts. Raises KeyError if lang is
    not a key of SCRIPT_RANGES.
    """
    return ord(c)-SCRIPT_RANGES[lang][0]
def is_indiclang_char(c,lang):
    """
    Is the character within the 128 code point script block of lang,
    or a danda / double danda.

    Applicable to Brahmi derived Indic scripts
    """
    offset=get_offset(c,lang)
    if 0<=offset<=0x7f:
        return True
    return ord(c) in (DANDA,DOUBLE_DANDA)
def is_nukta(c,lang):
    """
    Is the character the nukta character
    """
    return get_offset(c,lang)==NUKTA_OFFSET

def is_aum(c,lang):
    """
    Is the character the aum character
    """
    return get_offset(c,lang)==AUM_OFFSET
def is_number(c,lang):
    """
    Is the character a number

    The offset of the character within the script block of lang must lie
    between NUMERIC_OFFSET_START and NUMERIC_OFFSET_END (both inclusive).
    """
    o=get_offset(c,lang)
    # named constants instead of repeating the literals 0x66 / 0x6f
    return NUMERIC_OFFSET_START<=o<=NUMERIC_OFFSET_END
def is_nukta_offset(c_offset):
    """
    Is the offset the nukta offset
    """
    return c_offset==NUKTA_OFFSET

def is_aum_offset(c_offset):
    """
    Is the offset the aum offset
    """
    return c_offset==AUM_OFFSET
def load():
    """
        Initializes the Indic NLP library. Clients should call this method before using the library.

        Any module requiring initialization should have an init() method, to which a call must be made from this method
    """

    ### Order of initialization may matter

    # Common has to be loaded first to get access to resources
    common.init()

    ## Initialization of Indic scripts module
    indic_scripts.init()

    ## Initialization of English scripts module
    english_script.init()

    ## Initialization of unicode_transliterate module
    unicode_transliterate.init()
class MorphAnalyzerI(object):
    """
    Interface for Morph Analyzer

    Subclasses override both methods. The methods take `self` so that they
    can be called on instances; without it any instance call failed with a
    TypeError about the number of positional arguments.
    """

    def morph_analyze(self,word):
        """
        Morphanalyze a single word.

        @param word: string input word
        """
        raise NotImplementedError

    def morph_analyze_document(self,tokens):
        """
        Morphanalyze a document, represented as a sequence of tokens.

        @param tokens: string sequence of words
        """
        raise NotImplementedError
if __name__ == '__main__':

    # argv layout: script, infile, outfile, language, resources path and an
    # optional add_marker flag. sys.argv[4] is read below, so at least five
    # entries are required (the old check `<4` let a four-entry argv through
    # and crashed with IndexError).
    if len(sys.argv)<5:
        print("Usage: python unsupervised_morph.py <infile> <outfile> <language> <indic_resources_path> [<add_marker>]")
        sys.exit(1)

    language=sys.argv[3]
    common.INDIC_RESOURCES_PATH=sys.argv[4]

    add_marker=False

    if len(sys.argv)==6:
        # only the literal string 'True' switches markers on
        add_marker= sys.argv[5] == 'True'

    print('Loading morph analyser for ' + language)
    analyzer=UnsupervisedMorphAnalyzer(language,add_marker)
    print('Loaded morph analyser for ' + language)

    with codecs.open(sys.argv[1],'r','utf-8') as ifile:
        with codecs.open(sys.argv[2],'w','utf-8') as ofile:
            # stream the input instead of loading every line with readlines()
            for line in ifile:
                line=line.strip()
                tokens=indic_tokenize.trivial_tokenize(line)
                morph_tokens=analyzer.morph_analyze_document(tokens)
                ofile.write(' '.join(morph_tokens))
                ofile.write('\n')
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/indic_nlp_library/indicnlp/normalize/indic_normalize.py b/indic_nlp_library/indicnlp/normalize/indic_normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..fcd2f4cddc17e5967a4992afb3ec56488c489e1d --- /dev/null +++ b/indic_nlp_library/indicnlp/normalize/indic_normalize.py @@ -0,0 +1,984 @@ +# -*- coding: utf-8 -*- + +# +# Copyright (c) 2013-present, Anoop Kunchukuttan +# All rights reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# + +#Program for normalization of text written in Unicode. This is mainly geared towards Indic scripts +# +# @author Anoop Kunchukuttan +# + +import sys, codecs, string, itertools, re +from indicnlp import langinfo + + +class NormalizerI(object): + """ + The normalizer classes do the following: + + * Some characters have multiple Unicode codepoints. The normalizer chooses a single standard representation + * Some control characters are deleted + * While typing using the Latin keyboard, certain typical mistakes occur which are corrected by the module + + Base class for normalizer. Performs some common normalization, which includes: + + * Byte order mark, word joiner, etc. removal + * ZERO_WIDTH_NON_JOINER and ZERO_WIDTH_JOINER removal + * ZERO_WIDTH_SPACE and NO_BREAK_SPACE replaced by spaces + + Script specific normalizers should derive from this class and override the normalize() method. + They can call the super class 'normalize() method to avail of the common normalization + + """ + + BYTE_ORDER_MARK='\uFEFF' + BYTE_ORDER_MARK_2='\uFFFE' + WORD_JOINER='\u2060' + SOFT_HYPHEN='\u00AD' + + ZERO_WIDTH_SPACE='\u200B' + NO_BREAK_SPACE='\u00A0' + + ZERO_WIDTH_NON_JOINER='\u200C' + ZERO_WIDTH_JOINER='\u200D' + + def _normalize_punctuations(self, text): + """ + Normalize punctuations. 
def _init_normalize_chandras(self):
    """
    Build self.chandra_substitutions, the list of (character, replacement)
    pairs applied by _normalize_chandras().

    Each pair is given as offsets within the script block and converted to
    characters of self.lang with langinfo.offset_to_char().
    """

    substitution_offsets =\
        [
            [0x0d , 0x0f], # chandra e, independent
            [0x11 , 0x13], # chandra o, independent
            [0x45 , 0x47], # chandra e, dependent
            [0x49 , 0x4b], # chandra o, dependent
            # [0x72 , 0x0f], # mr: chandra e, independent

            [0x00 , 0x02], # chandrabindu
            [0x01 , 0x02], # chandrabindu
        ]

    self.chandra_substitutions = [
            (langinfo.offset_to_char(x[0],self.lang), langinfo.offset_to_char(x[1],self.lang))
                for x in substitution_offsets ]
def _init_to_anusvaara_relaxed(self):
    """
    Prepare the (pattern, replacement) pair used by _to_anusvaara_relaxed().

    The pattern matches any one of the nasal consonants followed by a halant,
    regardless of what comes next; the replacement is the anusvaara.
    For a script block starting at U+0900 the pattern is equivalent to
    `re.compile(r'[\\u0919\\u091e\\u0923\\u0928\\u0929\\u092e]\\u094d')`
    """

    nasals_list=[0x19,0x1e,0x23,0x28,0x29,0x2e]
    # joined without a separator: a ',' inside the character class would make
    # the pattern also match a literal comma followed by a halant
    nasals_list_str=''.join([langinfo.offset_to_char(x,self.lang) for x in nasals_list])

    halant_offset=0x4d
    anusvaara_offset=0x02

    pat=re.compile(r'[{nasals_list_str}]{halant}'.format(
        nasals_list_str=nasals_list_str,
        halant=langinfo.offset_to_char(halant_offset,self.lang),
    ))

    repl_string='{anusvaara}'.format(anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang))

    self.pats_repls = (pat,repl_string)
def _normalize_word_vowel_ending_dravidian(self,word):
    """
    Vowel ending normalization of a single word for Dravidian languages
    - consonant ending: append the vowel sign at offset 0x3e of the script block
    - any other ending, or an empty word: no change
    """
    if not word:
        return word
    if langinfo.is_consonant(word[-1],self.lang):
        return word+langinfo.offset_to_char(0x3e,self.lang)
    return word
def correct_visarga(self,text,visarga_char,char_range):
    """
    Replace a colon that directly follows a character in U+0900-U+097F by the
    visarga U+0903 and return the corrected text.

    The substitution result used to be discarded, so the method always
    returned None.

    NOTE(review): visarga_char and char_range are accepted but not used; the
    character range and the visarga are hard-coded. Their intended format is
    not visible here -- confirm before generalizing.

    @param text: string to correct
    @return the corrected string
    """
    return re.sub(r'([\u0900-\u097f]):','\\1\u0903',text)
def normalize(self,text):
    """
    Normalize Devanagari text.

    Steps, in order: common normalization of the super class, replacement of
    U+0972 by U+090F, decomposition of the nukta based composite characters
    into base character + nukta, optional removal of all nuktas
    (self.remove_nuktas), pipe character to poorna virama, and colon to
    visarga when the colon follows a character in U+0900-U+097F.
    """

    # common normalization for Indic scripts
    normalized=super(DevanagariNormalizer,self).normalize(text)

    # chandra a replacement for Marathi
    normalized=normalized.replace('\u0972','\u090f')

    # decomposing Nukta based composite characters: (composite, base character)
    nukta_composites=(
        ('\u0929','\u0928'),
        ('\u0931','\u0930'),
        ('\u0934','\u0933'),
        ('\u0958','\u0915'),
        ('\u0959','\u0916'),
        ('\u095A','\u0917'),
        ('\u095B','\u091C'),
        ('\u095C','\u0921'),
        ('\u095D','\u0922'),
        ('\u095E','\u092B'),
        ('\u095F','\u092F'),
    )
    for composite,base in nukta_composites:
        normalized=normalized.replace(composite,base+DevanagariNormalizer.NUKTA)

    if self.remove_nuktas:
        normalized=normalized.replace(DevanagariNormalizer.NUKTA,'')

    # replace pipe character for poorna virama
    normalized=normalized.replace('\u007c','\u0964')

    # correct visarga
    return re.sub(r'([\u0900-\u097f]):','\\1\u0903',normalized)
class GurmukhiNormalizer(BaseNormalizer):
    """
    Normalizer for the Gurmukhi script. On top of the normalization done by
    the super class it:

    * optionally canonicalizes addak and tippi
    * rewrites vowel-carrier + vowel-sign sequences as independent vowels
    * decomposes precomposed nukta characters into base character + nukta
    * maps the script-specific poorna virama code points to the generic
      Indic ones
    * replaces the pipe character '|' by poorna virama
    * replaces a colon ':' by visarga when it follows a character of this script
    """

    NUKTA='\u0A3C'

    VOWEL_NORM_MAPS={
        ## http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
        ## Table 12-16
        '\u0a05\u0a3e': '\u0a06',
        '\u0a72\u0a3f': '\u0a07',
        '\u0a72\u0a40': '\u0a08',
        '\u0a73\u0a41': '\u0a09',
        '\u0a73\u0a42': '\u0a0a',
        '\u0a72\u0a47': '\u0a0f',
        '\u0a05\u0a48': '\u0a10',
        '\u0a73\u0a4b': '\u0a13',
        '\u0a05\u0a4c': '\u0a14',
    }

    def __init__(self,lang='pa',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                do_normalize_vowel_ending=False,
                do_canonicalize_addak=False,
                do_canonicalize_tippi=False,
                do_replace_vowel_bases=False):
        super().__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
        # Gurmukhi-specific switches consulted by normalize()/_normalize_vowels()
        self.do_canonicalize_addak=do_canonicalize_addak
        self.do_canonicalize_tippi=do_canonicalize_tippi
        self.do_replace_vowel_bases=do_replace_vowel_bases

    def _normalize_vowels(self,text):
        """
        Apply the vowel replacements listed in VOWEL_NORM_MAPS and, when
        do_replace_vowel_bases is set, map the bare carriers U+0A72 / U+0A73
        to the independent vowels U+0A07 / U+0A09.
        """
        for sequence,vowel in GurmukhiNormalizer.VOWEL_NORM_MAPS.items():
            text=text.replace(sequence,vowel)

        if not self.do_replace_vowel_bases:
            return text

        # carriers that occur without a diacritic: use the closest vowel
        return text.replace('\u0a72','\u0a07').replace('\u0a73','\u0a09')

    def normalize(self,text):
        """
        Return the normalized form of `text` (see the class description for
        the list of steps).
        """
        # addak+consonant -> consonant+halant+consonant
        if self.do_canonicalize_addak:
            text=re.sub(r'\u0a71(.)','\\1\u0a4d\\1',text)

        # tippi -> anusvaar
        if self.do_canonicalize_tippi:
            text=text.replace('\u0a70','\u0a02')

        # independent vowels can be written in several ways because of the
        # carrier characters 'iri' and 'ura'
        text=self._normalize_vowels(text)

        # script-independent normalization
        text=super().normalize(text)

        # precomposed nukta characters -> base character + nukta
        nukta=GurmukhiNormalizer.NUKTA
        for composite,base in (
                ('\u0a33','\u0a32'),
                ('\u0a36','\u0a38'),
                ('\u0a59','\u0a16'),
                ('\u0a5a','\u0a17'),
                ('\u0a5b','\u0a1c'),
                ('\u0a5e','\u0a2b'),
            ):
            text=text.replace(composite,base+nukta)

        if self.remove_nuktas:
            text=text.replace(nukta,'')

        # script-specific danda codes and the pipe character -> generic
        # Indic poorna virama / double danda
        for specific,generic in (
                ('\u0a64','\u0964'),
                ('\u0a65','\u0965'),
                ('\u007c','\u0964'),
            ):
            text=text.replace(specific,generic)

        # colon after a Gurmukhi character is really a visarga
        return re.sub(r'([\u0a00-\u0a7f]):','\\1\u0a03',text)
class OriyaNormalizer(BaseNormalizer):
    """
    Normalizer for the Oriya script. In addition to basic normalization by the super class,

    * Replaces the composite characters containing nuktas by their decomposed form
    * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
    * Canonicalize two part dependent vowels
    * Replace 'va' with 'ba'
    * replace pipe character '|' by poorna virama character
    * replace colon ':' by visarga if the colon follows a character in this script
    """

    NUKTA='\u0B3C'

    VOWEL_NORM_MAPS={
        ## See Table 12-22 in http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
        '\u0b05\u0b3e': '\u0b06',
        '\u0b0f\u0b57': '\u0b10',
        '\u0b13\u0b57': '\u0b14',
    }

    def __init__(self,lang='or',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
                do_normalize_vowel_ending=False,
                do_remap_wa=False):
        super(OriyaNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
        # when set, U+0B71 (wa) is rewritten to U+0B2C (ba) in normalize()
        self.do_remap_wa=do_remap_wa

    def normalize(self,text):
        """
        Return the normalized form of `text`: the super class normalization
        followed by the Oriya-specific steps listed in the class docstring.
        """

        # common normalization for Indic scripts
        text=super(OriyaNormalizer,self).normalize(text)

        ## standard vowel replacements as per suggestions in Unicode documents
        for k,v in OriyaNormalizer.VOWEL_NORM_MAPS.items():
            text=text.replace(k,v)

        # decomposing Nukta based composite characters
        text=text.replace('\u0b5c','\u0b21'+OriyaNormalizer.NUKTA)
        text=text.replace('\u0b5d','\u0b22'+OriyaNormalizer.NUKTA)

        if self.remove_nuktas:
            text=text.replace(OriyaNormalizer.NUKTA,'')

        # replace the poorna virama codes specific to script
        # with generic Indic script codes
        text=text.replace('\u0b64','\u0964')
        text=text.replace('\u0b65','\u0965')

        # replace pipe character for poorna virama
        # (was '\u0b7c', which is not the pipe character; every other script
        # normalizer in this module uses U+007C here)
        text=text.replace('\u007c','\u0964')

        # replace wa with ba
        if self.do_remap_wa:
            text=text.replace('\u0b71','\u0b2c')

        # replace va with ba (applied unconditionally)
        # NOTE: documentation (chapter on Indic scripts) and codepoint chart seem contradictory
        # (this applies to the wa to ba rule above as well)
        text=text.replace('\u0b35','\u0b2c')

        # AI dependent vowel sign: E (U+0B47) + AI length mark (U+0B56)
        # composes to VOWEL SIGN AI, which is U+0B48 (was the unassigned U+0B58)
        text=text.replace('\u0b47\u0b56','\u0b48')

        # two part dependent vowels
        text=text.replace('\u0b47\u0b3e','\u0b4b')
        text=text.replace('\u0b47\u0b57','\u0b4c')

        # additional consonant - not clear how to handle this
        # ignore

        # correct visarga
        text=re.sub(r'([\u0b00-\u0b7f]):','\\1\u0b03',text)

        return text
class TamilNormalizer(BaseNormalizer):
    """
    Normalizer for the Tamil script. On top of the normalization done by the
    super class it:

    * maps the script-specific poorna virama code points to the generic
      Indic ones
    * canonicalizes two-part dependent vowel signs
    * replaces a colon ':' by visarga when it follows a character of this script
    """

    def __init__(self,lang='ta',remove_nuktas=False,nasals_mode='do_nothing',
            do_normalize_chandras=False,do_normalize_vowel_ending=False):
        super().__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """
        Return the normalized form of `text`.
        """
        # script-independent normalization first
        normalized=super().normalize(text)

        replacements=(
            # script-specific danda codes -> generic Indic codes
            ('\u0be4','\u0964'),
            ('\u0be5','\u0965'),
            # two part dependent vowels
            ('\u0b92\u0bd7','\u0b94'),
            ('\u0bc6\u0bbe','\u0bca'),
            ('\u0bc7\u0bbe','\u0bcb'),
            ('\u0bc6\u0bd7','\u0bcc'),
        )
        for old,new in replacements:
            normalized=normalized.replace(old,new)

        # colon after a Tamil character -> visarga
        return re.sub(r'([\u0b80-\u0bff]):','\\1\u0b83',normalized)
class KannadaNormalizer(BaseNormalizer):
    """
    Normalizer for the Kannada script. On top of the normalization done by
    the super class it:

    * maps the script-specific poorna virama code points to the generic
      Indic ones
    * canonicalizes two-part dependent vowel signs
    * replaces a colon ':' by visarga when it follows a character of this script
    """

    def __init__(self,lang='kn',remove_nuktas=False,nasals_mode='do_nothing',
            do_normalize_chandras=False,do_normalize_vowel_ending=False):
        super().__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)

    def normalize(self,text):
        """
        Return the normalized form of `text`.
        """
        # script-independent normalization first
        normalized=super().normalize(text)

        # Applied strictly in this order: the U+0CCA produced by the fourth
        # vowel rule is an input of the fifth one.
        replacements=(
            # script-specific danda codes -> generic Indic codes
            ('\u0ce4','\u0964'),
            ('\u0ce5','\u0965'),
            # dependent vowels
            ('\u0cbf\u0cd5','\u0cc0'),
            ('\u0cc6\u0cd5','\u0cc7'),
            ('\u0cc6\u0cd6','\u0cc8'),
            ('\u0cc6\u0cc2','\u0cca'),
            ('\u0cca\u0cd5','\u0ccb'),
        )
        for old,new in replacements:
            normalized=normalized.replace(old,new)

        # colon after a Kannada character -> visarga
        return re.sub(r'([\u0c80-\u0cff]):','\\1\u0c83',normalized)
In addition to basic normalization by the super class, + + * Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama + * canonicalize two-part dependent vowel signs + * Change from old encoding of chillus (till Unicode 5.0) to new encoding + * replace colon ':' by visarga if the colon follows a charcter in this script + """ + + CHILLU_CHAR_MAP= { + '\u0d7a': '\u0d23', + '\u0d7b': '\u0d28', + '\u0d7c': '\u0d30', + '\u0d7d': '\u0d32', + '\u0d7e': '\u0d33', + '\u0d7f': '\u0d15', + } + + def _canonicalize_chillus(self,text): + for chillu, char in MalayalamNormalizer.CHILLU_CHAR_MAP.items(): + text=text.replace(chillu,'{}\u0d4d'.format(char)) + return text + + def _correct_geminated_T(self,text): + return text.replace('\u0d31\u0d4d\u0d31','\u0d1f\u0d4d\u0d1f') + + def __init__(self,lang='ml',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False, + do_normalize_vowel_ending=False, + do_canonicalize_chillus=False, do_correct_geminated_T=False): + super(MalayalamNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending) + self.do_canonicalize_chillus=do_canonicalize_chillus + self.do_correct_geminated_T=do_correct_geminated_T + + def normalize(self,text): + + # Change from old encoding of chillus (till Unicode 5.0) to new encoding + text=text.replace('\u0d23\u0d4d\u200d','\u0d7a') + text=text.replace('\u0d28\u0d4d\u200d','\u0d7b') + text=text.replace('\u0d30\u0d4d\u200d','\u0d7c') + text=text.replace('\u0d32\u0d4d\u200d','\u0d7d') + text=text.replace('\u0d33\u0d4d\u200d','\u0d7e') + text=text.replace('\u0d15\u0d4d\u200d','\u0d7f') + + # Normalize chillus + if self.do_canonicalize_chillus: + text=self._canonicalize_chillus(text) + + # common normalization for Indic scripts + text=super(MalayalamNormalizer,self).normalize(text) + + # replace the poorna virama codes specific to script + # with generic Indic script codes + 
class UrduNormalizer(NormalizerI):
    '''Uses UrduHack library.
    https://docs.urduhack.com/en/stable/_modules/urduhack/normalization/character.html#normalize
    '''

    def __init__(self, lang, remove_nuktas=True, **kwargs):
        '''
        lang: language code
        remove_nuktas: when true, normalize() also calls remove_diacritics
        kwargs: options meant for the other script normalizers (for example
            nasals_mode, which the command line entry point of this module
            always forwards through the factory). They are accepted and
            ignored so that requesting an Urdu normalizer with those options
            no longer raises TypeError.
        '''
        self.lang = lang
        self.remove_nuktas = remove_nuktas

    # These imports run in the class body, so the imported functions become
    # class attributes; normalize() calls them through the class
    # (UrduNormalizer.<name>) as plain functions.
    from urduhack.normalization import (
        remove_diacritics,
        normalize_characters,
        normalize_combine_characters
    ) # TODO: Use only required normalizers
    from urduhack.preprocessing import (
        normalize_whitespace,
        digits_space,
        all_punctuations_space,
        english_characters_space
    )

    def normalize(self, text):
        '''
        Return `text` after punctuation normalization followed by the
        UrduHack normalization and spacing steps, applied in the order below.
        '''
        # _normalize_punctuations is not defined in this class; presumably
        # inherited from NormalizerI - confirm.
        text = self._normalize_punctuations(text)
        text = UrduNormalizer.normalize_whitespace(text)
        if self.remove_nuktas:
            text = UrduNormalizer.remove_diacritics(text)
        text = UrduNormalizer.normalize_characters(text)
        text = UrduNormalizer.normalize_combine_characters(text)
        text = UrduNormalizer.digits_space(text)
        text = UrduNormalizer.all_punctuations_space(text)
        text = UrduNormalizer.english_characters_space(text)
        return text
def get_normalizer(self,language,**kwargs):
    """
    Create the normalizer for a language.

    Parameters:
    |language: language code
    |kwargs: options passed on to the normalizer constructor,
    |        e.g. remove_nuktas (boolean, should the normalizer remove nukta characters)

    Returns the script-specific normalizer for the language, or a
    BaseNormalizer when the language code has no dedicated normalizer.
    """
    # (language codes, normalizer class), checked in this order
    dispatch=(
        (['hi','mr','sa','kK','ne','sd'],DevanagariNormalizer),
        (['ur'],UrduNormalizer),
        (['pa'],GurmukhiNormalizer),
        (['gu'],GujaratiNormalizer),
        (['bn'],BengaliNormalizer),
        (['as'],BengaliNormalizer),
        (['or'],OriyaNormalizer),
        (['ml'],MalayalamNormalizer),
        (['kn'],KannadaNormalizer),
        (['ta'],TamilNormalizer),
        (['te'],TeluguNormalizer),
    )

    for codes,normalizer_cls in dispatch:
        if language in codes:
            return normalizer_cls(lang=language, **kwargs)

    # no script-specific normalizer: fall back to the generic one
    return BaseNormalizer(lang=language, **kwargs)
if __name__ == '__main__':
    # Command line entry point: normalize a UTF-8 text file line by line.
    #   argv[1] input file, argv[2] output file, argv[3] language code,
    #   argv[4] (optional) remove nuktas flag, argv[5] (optional) nasals mode

    if len(sys.argv)<4:
        print("Usage: python normalize.py infile outfile language [remove_nuktas(True|False)] [nasals_mode]")
        sys.exit(1)

    language=sys.argv[3]
    remove_nuktas=False
    normalize_nasals='do_nothing'
    if len(sys.argv)>=5:
        # bool() of any non-empty string is True, so "False" used to switch
        # nukta removal ON. Interpret the text of the argument instead.
        remove_nuktas=sys.argv[4].strip().lower() in ('true','1','yes','y')
    if len(sys.argv)>=6:
        normalize_nasals=sys.argv[5]

    # create normalizer
    factory=IndicNormalizerFactory()
    normalizer=factory.get_normalizer(language,remove_nuktas=remove_nuktas,nasals_mode=normalize_nasals)

    # DO normalization
    with codecs.open(sys.argv[1],'r','utf-8') as ifile:
        with codecs.open(sys.argv[2],'w','utf-8') as ofile:
            # iterate the reader directly instead of loading every line
            # into memory with readlines()
            for line in ifile:
                ofile.write(normalizer.normalize(line))
def init():
    """
    Load the English phonetic data and the ARPABET phoneme list into the
    module-level tables.

    To be called by library loader, do not call it in your program
    """
    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET
    global ARPABET_ID_MAP, ID_ARPABET_MAP

    phonetic_data_path=common.get_resources_path()+'/script/english_script_phonetic_data.csv'
    ENGLISH_PHONETIC_DATA=pd.read_csv(phonetic_data_path,encoding='utf-8')

    # the phonetic feature columns start at PHONETIC_VECTOR_START_OFFSET
    ENGLISH_PHONETIC_VECTORS=ENGLISH_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values
    PHONETIC_VECTOR_LENGTH=ENGLISH_PHONETIC_VECTORS.shape[1]

    ### Load mapping from ARPABET representation of phoneme to internal ID
    ### (the internal ID is the 0-based line number in the list file)
    arpabet_path=common.get_resources_path()+'/script/english_arpabet_list.csv'
    with open(arpabet_path,'r',encoding='utf-8') as infile:
        for line_no,raw_line in enumerate(infile):
            phoneme=raw_line.strip()
            ARPABET_ID_MAP[phoneme]=line_no
            ID_ARPABET_MAP[line_no]=phoneme
def xor_vectors(v1,v2):
    """
    Element-wise XOR of two vectors: the result holds 1 where the paired
    elements differ and 0 where they are equal. Pairing stops at the end of
    the shorter input. Returns a numpy array.
    """
    bits=[]
    for left,right in zip(v1,v2):
        bits.append(1 if left!=right else 0)
    return np.array(bits)
def lcsr(srcw,tgtw,slang,tlang):
    """
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language

    Returns the tuple produced by lcsr_any / lcsr_indic:
    (LCSR, length of srcw, length of tgtw), all floats.
    """

    # Same language, or a language the script tables do not support:
    # compare the raw characters. Otherwise compare through the common
    # script offsets. The two calls previously had their argument lists
    # swapped (lcsr_any takes 2 arguments, lcsr_indic takes 4), so both
    # branches raised TypeError.
    if slang==tlang or not is_supported_language(slang) or not is_supported_language(tlang):
        return lcsr_any(srcw,tgtw)
    else:
        return lcsr_indic(srcw,tgtw,slang,tlang)
v1=get_phonetic_feature_vector(offset_to_char(offset1,slang),slang) + for offset2 in range(langinfo.COORDINATED_RANGE_START_INCLUSIVE, langinfo.COORDINATED_RANGE_END_INCLUSIVE+1): + v2=get_phonetic_feature_vector(offset_to_char(offset2,tlang),tlang) + sim_mat[offset1,offset2]=sim_func(v1,v2) + + if normalize: + sums=np.sum(sim_mat, axis=1) + sim_mat=(sim_mat.transpose()/sums).transpose() + + return sim_mat + diff --git a/indic_nlp_library/indicnlp/syllable/__init__.py b/indic_nlp_library/indicnlp/syllable/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/indic_nlp_library/indicnlp/syllable/syllabifier.py b/indic_nlp_library/indicnlp/syllable/syllabifier.py new file mode 100644 index 0000000000000000000000000000000000000000..de538452f3bd7ce424c5abfa5ac91ab5cfa0d818 --- /dev/null +++ b/indic_nlp_library/indicnlp/syllable/syllabifier.py @@ -0,0 +1,302 @@ +# +# Copyright (c) 2013-present, Anoop Kunchukuttan +# All rights reserved. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
def normalize_punjabi(word):
    """
    Normalize a Punjabi word for syllabification and build the mask used to
    undo the normalization.

    * tippi (U+0A70) becomes anusvaar (U+0A02); marked '2' in the mask
    * addak (U+0A71) + consonant becomes consonant + halant + consonant;
      marked '311' in the mask
    * every other position of the mask is '0'

    Returns (normalized word, mask).
    """
    addak_seq=r'\u0a71(.)'

    # mask: hide digits already in the word, mark the rewritten spots,
    # then zero out whatever is not a marker
    mask=re.sub(r'[0-9]','0',word)
    mask=mask.replace('\u0a70','2')
    mask=re.sub(addak_seq,'311',mask)
    mask=re.sub(r'[^0-9]','0',mask)

    # word: tippi first, then addak, the same order as in the mask
    normalized=word.replace('\u0a70','\u0a02')
    normalized=re.sub(addak_seq,'\\1\u0a4d\\1',normalized)

    return normalized, mask
syllables_final=syllables_list + else: + for s in syllables_list: + if s in vocab: + syllables_final.append(s) + else: + for x in s: + syllables_final.append(x) + + return syllables_final + + +def orthographic_syllabify_improved(word,lang,vocab=None): + + word_mask=['0']*len(word) + + if lang=='ml': + word, word_mask = normalize_malayalam(word) + word=word + elif lang=='pa': + word, word_mask = normalize_punjabi(word) + + p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word] + + syllables=[] + syllables_mask=[] + + for i in range(len(word)): + v=p_vectors[i] + + syllables.append(word[i]) + syllables_mask.append(word_mask[i]) + + ### simplified syllabification + #if i+1= 0: + print('Warning') + + if lang=='ml': + syllables = denormalize_malayalam(syllables,syllables_mask) + elif lang=='pa': + syllables = denormalize_punjabi(syllables,syllables_mask) + + syllables_list = syllables.strip().split(' ') + return(char_backoff(syllables_list,vocab)) + +def orthographic_syllabify(word,lang,vocab=None): + + p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word] + + syllables=[] + + for i in range(len(word)): + v=p_vectors[i] + + syllables.append(word[i]) + + ### simplified syllabification + #if i+1?\u0964\u0965' +pat_la=re.compile(r'[ ](['+left_attach+r'])') + +right_attach=r'#$(\[{<@' +pat_ra=re.compile(r'(['+right_attach+r'])[ ]') + +lr_attach=r'-/\\' +pat_lra=re.compile(r'[ ](['+lr_attach+r'])[ ]') + +#donknow=u'&*+=^_|~' + +## date, numbers, section/article numbering +## TODO: handle indic numbers +pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+') + +### e-mail address +#pat_num=re.compile(ur'[a-zA-Z]+[ ]? 
def trivial_detokenize_indic(text):
    """Detokenize a string written in a Brahmi-derived Indic script.

    A trivial detokenizer which:

        - decides whether punctuation attaches to left/right or both
        - handles number sequences
        - handles quotes smartly (deciding left or right attachment)

    Args:
        text (str): tokenized text to process

    Returns:
        str: detokenized string
    """
    # Numbers and dates: remove the spaces inside every span matched by the
    # module-level ``pat_num_seq`` (e.g. "1 , 000" -> "1,000"). Pieces are
    # collected and joined once; a match at offset 0 is handled like any
    # other match.
    pieces = []
    prev = 0
    for m in pat_num_seq.finditer(text):
        pieces.append(text[prev:m.start()])
        pieces.append(m.group(0).replace(' ', ''))
        prev = m.end()
    pieces.append(text[prev:])
    s = ''.join(pieces)

    # Punctuation attachment via the module-level patterns:
    # both sides first, then left-attaching, then right-attaching marks.
    s = pat_lra.sub('\\1', s)
    s = pat_la.sub('\\1', s)
    s = pat_ra.sub('\\1', s)

    # Quotes: assume they are well formed and alternate between an opening
    # quote (attaches to the token on its right) and a closing quote
    # (attaches to the token on its left). Each quote character is processed
    # in its own pass. The quotes are handled directly instead of through
    # textual placeholders, so input that happens to contain the text "@RA"
    # or "@LA" is left untouched.
    for punc in '\'"`':
        out = []
        is_opening = True
        drop_next_space = False
        for c in s:
            if c == punc:
                if is_opening:
                    # swallow a single space right after an opening quote
                    drop_next_space = True
                else:
                    # swallow a single space right before a closing quote
                    if out and out[-1] == ' ':
                        out.pop()
                    drop_next_space = False
                out.append(c)
                is_opening = not is_opening
            elif c == ' ' and drop_next_space:
                drop_next_space = False
            else:
                out.append(c)
                drop_next_space = False
        s = ''.join(out)

    return s


def trivial_detokenize(text,lang='hi'):
    """Detokenize a string for languages of the Indian subcontinent.

    A trivial detokenizer which:

        - decides whether punctuation attaches to left/right or both
        - handles number sequences
        - handles quotes smartly (deciding left or right attachment)

    Args:
        text (str): tokenized text to process
        lang (str): language code; every value except 'ur' is handled by
            `trivial_detokenize_indic`

    Returns:
        str: detokenized string

    Raises:
        IndicNlpException: If language is not supported (currently 'ur')
    """
    if lang == 'ur':
        raise IndicNlpException('No detokenizer available for Urdu')
    return trivial_detokenize_indic(text)

# if __name__ == '__main__':

#     if len(sys.argv)<4:
#         print("Usage: python indic_detokenize.py ")
#         sys.exit(1)
#     with open(sys.argv[1],'r', encoding='utf-8') as ifile:
#         with open(sys.argv[2],'w', encoding='utf-8') as ofile:
#             for line in ifile:
#                 detokenized_line=trivial_detokenize(line,sys.argv[3])
#                 ofile.write(detokenized_line)
# diff --git a/indic_nlp_library/indicnlp/tokenize/indic_tokenize.py b/indic_nlp_library/indicnlp/tokenize/indic_tokenize.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..0c3864776382c468ff863bb6d5ef8d2180cd782f
# --- /dev/null
# +++ b/indic_nlp_library/indicnlp/tokenize/indic_tokenize.py
# @@ -0,0 +1,111 @@
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

#Program for tokenizing Indian language input
#
# @author Anoop Kunchukuttan
#
"""
Tokenizer for Indian languages. Currently, simple punctuation-based tokenizers
are supported (see `trivial_tokenize`). Major Indian language punctuations are
handled.
"""
import string, re, sys

from indicnlp.common import IndicNlpException

### tokenizer patterns
triv_tokenizer_indic_pat=re.compile(r'(['+string.punctuation+r'\u0964\u0965'+r'])')
triv_tokenizer_urdu_pat=re.compile(r'(['+string.punctuation+r'\u0609\u060A\u060C\u061E\u066A\u066B\u066C\u066D\u06D4'+r'])')

## date, numbers, section/article numbering
pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+')

def trivial_tokenize_indic(text):
    """Tokenize a string written in a Brahmi-derived Indic script.

    A trivial tokenizer which just tokenizes on the punctuation boundaries.
    This also includes punctuations for the Indian language scripts (the
    purna virama and the deergha virama). This is a language independent
    tokenizer.

    Args:
        text (str): text to tokenize

    Returns:
        list: list of tokens
    """
    # Surround every punctuation character with spaces, then squeeze runs of
    # spaces so tokens are separated by exactly one space.
    tok_str = triv_tokenizer_indic_pat.sub(r' \1 ', text.replace('\t', ' '))
    s = re.sub(r'[ ]+', ' ', tok_str).strip(' ')

    # Do not tokenize numbers and dates: glue back every sequence matched by
    # pat_num_seq ("1 , 000" -> "1,000"), including one at offset 0.
    pieces = []
    prev = 0
    for m in pat_num_seq.finditer(s):
        pieces.append(s[prev:m.start()])
        pieces.append(m.group(0).replace(' ', ''))
        prev = m.end()
    pieces.append(s[prev:])

    return ''.join(pieces).split(' ')

def trivial_tokenize_urdu(text):
    """Tokenize an Urdu string.

    A trivial tokenizer which just tokenizes on the punctuation boundaries.
    This also includes punctuations for the Urdu script.
    These punctuation characters were identified from the Unicode database
    for Arabic script by looking for punctuation symbols.

    Args:
        text (str): text to tokenize

    Returns:
        list: list of tokens
    """
    tok_str = triv_tokenizer_urdu_pat.sub(r' \1 ', text.replace('\t', ' '))
    return re.sub(r'[ ]+', ' ', tok_str).strip(' ').split(' ')

def trivial_tokenize(text,lang='hi'):
    """Trivial tokenizer for Indian languages using Brahmi or Arabic scripts.

    A trivial tokenizer which just tokenizes on the punctuation boundaries.
    Major punctuations specific to Indian languages are handled.
    These punctuation characters were identified from the Unicode database.

    Args:
        text (str): text to tokenize
        lang (str): ISO 639-2 language code; 'ur' selects the Urdu tokenizer,
            every other value the Indic one

    Returns:
        list: list of tokens
    """
    if lang == 'ur':
        return trivial_tokenize_urdu(text)
    return trivial_tokenize_indic(text)

# if __name__ == '__main__':

#     if len(sys.argv)<4:
#         print("Usage: python indic_tokenize.py ")
#         sys.exit(1)

#     with open(sys.argv[1],'r', encoding='utf-8') as ifile:
#         with open(sys.argv[2],'w', encoding='utf-8') as ofile:
#             for line in ifile:
#                 tokenized_line=' '.join(trivial_tokenize(line,sys.argv[3]))
#                 ofile.write(tokenized_line)
# diff --git a/indic_nlp_library/indicnlp/tokenize/sentence_tokenize.py b/indic_nlp_library/indicnlp/tokenize/sentence_tokenize.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..7e02e1c7639152b685f27bd1d7599880bce44127
# --- /dev/null
# +++ b/indic_nlp_library/indicnlp/tokenize/sentence_tokenize.py
# @@ -0,0 +1,268 @@
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

#Program for sentence splitting of Indian language input
#
# @author Anoop Kunchukuttan
#
"""
Sentence splitter for Indian languages. Contains a rule-based
sentence splitter that can understand common non-breaking phrases
in many Indian languages.
"""

import re

from indicnlp.transliterate import unicode_transliterate
from indicnlp import langinfo


## for language which have danda as delimiter
## period is not part of the sentence delimiters
DELIM_PAT_DANDA=re.compile(r'[\?!\u0964\u0965]')

## for languages which don't have danda as delimiter
DELIM_PAT_NO_DANDA=re.compile(r'[\.\?!\u0964\u0965]')

## pattern to check for presence of danda in text
CONTAINS_DANDA=re.compile(r'[\u0964\u0965]')

## Non-breaking phrases, written in Devanagari. Built once at import time
## instead of on every call of `is_acronym_abbvr`.
_NON_BREAKING_PHRASES = frozenset({
    ## acronym for latin characters
    'ए', 'ऎ',
    'बी', 'बि',
    'सी', 'सि',
    'डी', 'डि',
    'ई', 'इ',
    'एफ', 'ऎफ',
    'जी', 'जि',
    'एच','ऎच',
    'आई', 'आइ','ऐ',
    'जे', 'जॆ',
    'के', 'कॆ',
    'एल', 'ऎल',
    'एम','ऎम',
    'एन','ऎन',
    'ओ', 'ऒ',
    'पी', 'पि',
    'क्यू', 'क्यु',
    'आर',
    'एस','ऎस',
    'टी', 'टि',
    'यू', 'यु',
    'वी', 'वि', 'व्ही', 'व्हि',
    'डब्ल्यू', 'डब्ल्यु',
    'एक्स','ऎक्स',
    'वाय',
    'जेड', 'ज़ेड',
    ## add halant to the previous English character mappings.
    'एफ्',
    'ऎफ्',
    'एच्',
    'ऎच्',
    'एल्',
    'ऎल्',
    'एम्',
    'ऎम्',
    'एन्',
    'ऎन्',
    'आर्',
    'एस्',
    'ऎस्',
    'एक्स्',
    'ऎक्स्',
    'वाय्',
    'जेड्', 'ज़ेड्',

    #Indic vowels
    'ऄ',
    'अ',
    'आ',
    'इ',
    'ई',
    'उ',
    'ऊ',
    'ऋ',
    'ऌ',
    'ऍ',
    'ऎ',
    'ए',
    'ऐ',
    'ऑ',
    'ऒ',
    'ओ',
    'औ',
    'ॠ',
    'ॡ',

    #Indic consonants
    'क',
    'ख',
    'ग',
    'घ',
    'ङ',
    'च',
    'छ',
    'ज',
    'झ',
    'ञ',
    'ट',
    'ठ',
    'ड',
    'ढ',
    'ण',
    'त',
    'थ',
    'द',
    'ध',
    'न',
    'ऩ',
    'प',
    'फ',
    'ब',
    'भ',
    'म',
    'य',
    'र',
    'ऱ',
    'ल',
    'ळ',
    'ऴ',
    'व',
    'श',
    'ष',
    'स',
    'ह',

    ## abbreviation
    'श्री',
    'डॉ',
    'कु',
    'चि',
    'सौ',
})

def is_acronym_abbvr(text,lang):
    """Is the text a non-breaking phrase

    The text is transliterated from `lang` to Devanagari ('hi') and looked up
    in a fixed set of letter names, single letters and abbreviations.

    Args:
        text (str): text to check for non-breaking phrase
        lang (str): ISO 639-2 language code

    Returns:
        boolean: true if `text` is a non-breaking phrase
    """
    devanagari_text = unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, 'hi')
    return devanagari_text in _NON_BREAKING_PHRASES

def sentence_split(text,lang,delim_pat='auto'): ## New signature
    """split the text into sentences

    A rule-based sentence splitter for Indian languages written in
    Brahmi-derived scripts. The text is split at sentence delimiter
    boundaries. The delimiters can be configured by passing appropriate
    parameters.

    The sentence splitter can identify non-breaking phrases like
    single letter, common abbreviations/honorifics for some Indian
    languages.

    Args:
        text (str): text to split into sentence
        lang (str): ISO 639-2 language code
        delim_pat (str or compiled pattern): regular expression to identify
            sentence delimiter characters. If set to 'auto', the delimiter
            pattern is chosen automatically based on the language and text.
            Any other string is compiled with `re.compile`; an already
            compiled pattern is used as is.

    Returns:
        list: list of sentences identified from the input text
    """

    if delim_pat == 'auto':
        if langinfo.is_danda_delim(lang):
            # in modern texts it is possible that period is used as delimiter
            # instead of DANDA. Hence, a check. Use danda delimiter pattern
            # only if text contains at least one danda
            if CONTAINS_DANDA.search(text) is None:
                delim_pat = DELIM_PAT_NO_DANDA
            else:
                delim_pat = DELIM_PAT_DANDA
        else:
            delim_pat = DELIM_PAT_NO_DANDA
    elif isinstance(delim_pat, str):
        # The documented contract allows a regular-expression string; compile
        # it so the pattern methods used below are available.
        delim_pat = re.compile(delim_pat)

    ## otherwise, assume the caller set the delimiter pattern

    ### Phase 1: break on sentence delimiters.
    cand_sentences = []
    begin = 0
    text = text.strip()
    for mo in delim_pat.finditer(text):
        p1 = mo.start()

        # a delimiter directly after a numeric character is not a boundary
        if p1 > 0 and text[p1-1].isnumeric():
            continue

        # the candidate sentence includes the delimiter character itself
        s = text[begin:p1+1].strip()
        if len(s) > 0:
            cand_sentences.append(s)
        begin = p1 + 1

    s = text[begin:].strip()
    if len(s) > 0:
        cand_sentences.append(s)

    if not delim_pat.search('.'):
        ## run phase 2 only if delimiter pattern contains period
        return cand_sentences

    ### Phase 2: Address the fact that '.' may not always be a sentence delimiter
    ### Method: If there is a run of lines containing only a word (optionally) and '.',
    ### merge these lines as well one sentence preceding and succeeding this run of lines.
    final_sentences = []
    sen_buffer = ''
    bad_state = False

    for sentence in cand_sentences:
        words = sentence.split(' ')
        if len(words) == 1 and sentence[-1] == '.':
            # a lone word ending in '.': keep accumulating
            bad_state = True
            sen_buffer = sen_buffer + ' ' + sentence
        elif sentence[-1] == '.' and is_acronym_abbvr(words[-1][:-1], lang):
            # candidate ends in a non-breaking phrase followed by '.'
            # NOTE(review): when already in the bad state the buffer is
            # replaced by `sentence` rather than extended - confirm intent.
            if len(sen_buffer) > 0 and not bad_state:
                final_sentences.append(sen_buffer)
            bad_state = True
            sen_buffer = sentence
        elif bad_state:
            # first regular candidate after a run: it closes the merged sentence
            sen_buffer = sen_buffer + ' ' + sentence
            if len(sen_buffer) > 0:
                final_sentences.append(sen_buffer)
            sen_buffer = ''
            bad_state = False
        else: ## good state
            if len(sen_buffer) > 0:
                final_sentences.append(sen_buffer)
            sen_buffer = sentence
            bad_state = False

    if len(sen_buffer) > 0:
        final_sentences.append(sen_buffer)

    return final_sentences
# diff --git a/indic_nlp_library/indicnlp/transliterate/__init__.py b/indic_nlp_library/indicnlp/transliterate/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
# diff --git a/indic_nlp_library/indicnlp/transliterate/acronym_transliterator.py b/indic_nlp_library/indicnlp/transliterate/acronym_transliterator.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..8b237bc8f7c0d6a7f3a47885765b65c625808585
# --- /dev/null
# +++ b/indic_nlp_library/indicnlp/transliterate/acronym_transliterator.py
# @@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

#Program to transliterate acronyms from one Latin script to Indic languages
#
# @author Anoop Kunchukuttan
#

from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
import string
import random

class LatinToIndicAcronymTransliterator(object):
    """Spell out Latin-script acronyms letter by letter in an Indic script.

    Each Latin letter is replaced by a Devanagari spelling of its name and
    the result is transliterated from Devanagari ('hi') to the target
    language.
    """

    # Latin letter -> Devanagari spelling of the letter's name
    LATIN_TO_DEVANAGARI_TRANSTABLE = str.maketrans({
        'a':'ए',
        'b':'बी',
        'c':'सी',
        'd':'डी',
        'e':'ई',
        'f':'एफ',
        'g':'जी',
        'h':'एच',
        'i':'आई',
        'j':'जे',
        'k':'के',
        'l':'एल',
        'm':'एम',
        'n':'एन',
        'o':'ओ',
        'p':'पी',
        'q':'क्यू',
        'r':'आर',
        's':'एस',
        't':'टी',
        'u':'यू',
        'v':'वी',
        'w':'डब्ल्यू',
        'x':'एक्स',
        'y':'वाय',
        'z':'जेड',
    })

    # lower-case Latin alphabet, 'a' .. 'z'
    LATIN_ALPHABET = list(string.ascii_lowercase)

    @staticmethod
    def get_transtable():
        """Return the Latin-to-Devanagari translation table."""
        return LatinToIndicAcronymTransliterator.LATIN_TO_DEVANAGARI_TRANSTABLE

    @staticmethod
    def transliterate(w,lang):
        """Transliterate the Latin acronym `w` into the script of `lang`.

        `w` is lower-cased, mapped letter by letter to Devanagari and then
        transliterated from 'hi' to `lang`. Characters without an entry in
        the translation table are passed on unchanged.
        """
        devanagari = w.lower().translate(LatinToIndicAcronymTransliterator.LATIN_TO_DEVANAGARI_TRANSTABLE)
        return UnicodeIndicTransliterator.transliterate(devanagari,'hi',lang)

    @staticmethod
    def generate_latin_acronyms(num_acronyms, min_len=2, max_len=6, strategy='random'):
        """
        generate Latin acronyms in lower case

        Args:
            num_acronyms (int): number of acronyms to generate
            min_len (int): minimum acronym length (inclusive)
            max_len (int): maximum acronym length (inclusive)
            strategy (str): sampling strategy; only 'random' is implemented
                (length and letters are drawn with the `random` module)

        Returns:
            list: `num_acronyms` lower-case strings

        Raises:
            ValueError: if `strategy` is not a supported strategy
        """
        if strategy != 'random':
            # An unknown strategy used to yield a list of None values.
            raise ValueError('Unsupported acronym sampling strategy: {}'.format(strategy))

        alphabet = LatinToIndicAcronymTransliterator.LATIN_ALPHABET
        # the length is drawn first, then the letters
        return [
            ''.join(random.choices(alphabet, k=random.randint(min_len, max_len)))
            for _ in range(num_acronyms)
        ]

# \ No newline at end of file
# diff --git a/indic_nlp_library/indicnlp/transliterate/script_unifier.py b/indic_nlp_library/indicnlp/transliterate/script_unifier.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..0ed2c6b86444be53238c79c73ecce8f9b746f2ad
# --- /dev/null
# +++ b/indic_nlp_library/indicnlp/transliterate/script_unifier.py
# @@ -0,0 +1,157 @@
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

#Program for normalization of text written in Unicode. This is mainly geared towards Indic scripts
#
# @author Anoop Kunchukuttan
#

import sys
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate
from indicnlp import loader

class AggressiveScriptUnifier():
    """Normalize text aggressively, then transliterate it to a common script.

    Every normalizer is created with chandra normalization, vowel-ending
    normalization and nukta removal switched on; Punjabi, Odia, Assamese and
    Malayalam additionally get their language-specific options.
    """

    def __init__(self,common_lang='hi',nasals_mode='to_nasal_consonants'):
        """
        Args:
            common_lang (str): language code whose script is the target of `transform`
            nasals_mode (str): forwarded to every normalizer as `nasals_mode`
        """
        self.common_lang=common_lang
        self.nasals_mode=nasals_mode
        self.do_normalize_chandras=True
        self.do_normalize_vowel_ending=True
        self.remove_nuktas=True
        self.normalizer_map={}
        self._init_normalizers()

    def _init_normalizers(self):
        """Create one normalizer per supported language in `self.normalizer_map`."""
        normalizer_factory=indic_normalize.IndicNormalizerFactory()

        # options shared by every language
        common_opts=dict(
            nasals_mode=self.nasals_mode,
            do_normalize_chandras=self.do_normalize_chandras,
            remove_nuktas=self.remove_nuktas,
            do_normalize_vowel_ending=self.do_normalize_vowel_ending,
        )

        ## for languages with common parameters
        # NOTE(review): 'kK' is handed to the factory exactly as written;
        # presumably a code for Konkani - confirm against the factory.
        for lang in ['hi','mr','sa','kK','ne','sd','bn','gu','ta','te','kn']:
            self.normalizer_map[lang]=normalizer_factory.get_normalizer(lang, **common_opts)

        ## for languages with language specific parameters
        self.normalizer_map['pa']=normalizer_factory.get_normalizer('pa',
                        do_canonicalize_addak=True, do_canonicalize_tippi=True,
                        do_replace_vowel_bases=True, **common_opts)
        self.normalizer_map['or']=normalizer_factory.get_normalizer('or',
                        do_remap_wa=True, **common_opts)
        self.normalizer_map['as']=normalizer_factory.get_normalizer('as',
                        do_remap_assamese_chars=True, **common_opts)
        self.normalizer_map['ml']=normalizer_factory.get_normalizer('ml',
                        do_canonicalize_chillus=True, do_correct_geminated_T=True,
                        **common_opts)

    def transform(self,text,lang):
        """Normalize `text` (if `lang` has a normalizer) and transliterate it
        from `lang` to `self.common_lang`.

        A language without a normalizer is only transliterated, which matches
        `BasicScriptUnifier.transform`.
        """
        if lang in self.normalizer_map:
            text=self.normalizer_map[lang].normalize(text)
        text=unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, self.common_lang)
        return text

class BasicScriptUnifier():
    """Normalize text with default normalizer options, then transliterate it
    to a common script."""

    def __init__(self,common_lang='hi',nasals_mode='do_nothing'):
        """
        Args:
            common_lang (str): language code whose script is the target of `transform`
            nasals_mode (str): forwarded to every normalizer as `nasals_mode`
        """
        self.common_lang=common_lang
        self.nasals_mode=nasals_mode
        self.normalizer_map={}
        self._init_normalizers()

    def _init_normalizers(self):
        """Create one normalizer per supported language in `self.normalizer_map`."""
        normalizer_factory=indic_normalize.IndicNormalizerFactory()

        for lang in ['hi','mr','sa','kK','ne','sd','bn','gu','ta','te','kn','pa','or','as','ml']:
            self.normalizer_map[lang]=normalizer_factory.get_normalizer(lang, nasals_mode=self.nasals_mode)

    def transform(self,text,lang):
        """Normalize `text` (if `lang` has a normalizer) and transliterate it
        from `lang` to `self.common_lang`."""

        if lang in self.normalizer_map:
            text=self.normalizer_map[lang].normalize(text)

        text=unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, self.common_lang)
        return text

class NaiveScriptUnifier():
    """Transliterate text to a common script without any normalization."""

    def __init__(self,common_lang='hi'):
        """
        Args:
            common_lang (str): language code whose script is the target of `transform`
        """
        self.common_lang=common_lang

    def transform(self,text,lang):
        """Transliterate `text` from `lang` to `self.common_lang`."""

        text=unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, lang, self.common_lang)
        return text

if __name__ == '__main__':

    loader.load()

    USAGE="Usage: python script_unifier <aggressive|moderate|basic|naive> <infile> <outfile> <language>"

    if len(sys.argv)<=4:
        print(USAGE)
        sys.exit(1)

    # command name -> factory for the unifier it selects
    unifier_factories={
        'aggressive': lambda: AggressiveScriptUnifier(nasals_mode='to_nasal_consonants'),
        'moderate': lambda: AggressiveScriptUnifier(nasals_mode='do_nothing'),
        'basic': BasicScriptUnifier,
        'naive': NaiveScriptUnifier,
    }

    make_unifier=unifier_factories.get(sys.argv[1])
    if make_unifier is None:
        # An unknown command used to fall through silently without output.
        print(USAGE)
        sys.exit(1)

    language=sys.argv[4]
    unifier=make_unifier()

    with open(sys.argv[2],'r',encoding='utf-8') as ifile:
        with open(sys.argv[3],'w',encoding='utf-8') as ofile:
            # stream the input instead of loading all lines up front
            for line in ifile:
                transliterated_line=unifier.transform(line.strip(),language)
                ofile.write(transliterated_line+'\n')
# diff --git a/indic_nlp_library/indicnlp/transliterate/sinhala_transliterator.py b/indic_nlp_library/indicnlp/transliterate/sinhala_transliterator.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..5d9d3530b1ea3cb1ad5d860352040cdbdb48ec46
# --- /dev/null
# +++ b/indic_nlp_library/indicnlp/transliterate/sinhala_transliterator.py
# @@ -0,0 +1,171 @@
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

class SinhalaDevanagariTransliterator(object):
    """
    A transliterator between Devanagari and Sinhala (both directions) based
    on explicit Unicode mappings.

    Characters without an entry in the relevant map are copied unchanged.
    """

    # Sinhala character -> Devanagari character
    sinhala_devnag_map={
        '\u0d82':'\u0902',
        '\u0d83':'\u0903',
        '\u0d84':'\u0904',
        '\u0d85':'\u0905',
        '\u0d86':'\u0906',
        '\u0d87':'\u090d',
        '\u0d88':'\u090d',
        '\u0d89':'\u0907',
        '\u0d8a':'\u0908',
        '\u0d8b':'\u0909',
        '\u0d8c':'\u090a',
        '\u0d8d':'\u090b',
        '\u0d8f':'\u090c',
        '\u0d91':'\u090e',
        '\u0d92':'\u090f',
        '\u0d93':'\u0910',
        '\u0d94':'\u0912',
        '\u0d95':'\u0913',
        '\u0d96':'\u0914',
        '\u0d9a':'\u0915',
        '\u0d9b':'\u0916',
        '\u0d9c':'\u0917',
        '\u0d9d':'\u0918',
        '\u0d9e':'\u0919',
        '\u0d9f':'\u0919',
        '\u0da0':'\u091a',
        '\u0da1':'\u091b',
        '\u0da2':'\u091c',
        '\u0da3':'\u091d',
        '\u0da4':'\u091e',
        '\u0da5':'\u091e',
        '\u0da6':'\u091e',
        '\u0da7':'\u091f',
        '\u0da8':'\u0920',
        '\u0da9':'\u0921',
        '\u0daa':'\u0922',
        '\u0dab':'\u0923',
        '\u0dac':'\u0923',
        '\u0dad':'\u0924',
        '\u0dae':'\u0925',
        '\u0daf':'\u0926',
        '\u0db0':'\u0927',
        '\u0db1':'\u0928',
        '\u0db2':'\u0928',
        '\u0db3':'\u0928',
        '\u0db4':'\u092a',
        '\u0db5':'\u092b',
        '\u0db6':'\u092c',
        '\u0db7':'\u092d',
        '\u0db8':'\u092e',
        '\u0dba':'\u092f',
        '\u0dbb':'\u0930',
        '\u0dbd':'\u0932',
        '\u0dc5':'\u0933',
        '\u0dc0':'\u0935',
        '\u0dc1':'\u0936',
        '\u0dc2':'\u0937',
        '\u0dc3':'\u0938',
        '\u0dc4':'\u0939',
        '\u0dcf':'\u093e',
        '\u0dd0':'\u0949',
        '\u0dd1':'\u0949',
        '\u0dd2':'\u093f',
        '\u0dd3':'\u0940',
        '\u0dd4':'\u0941',
        '\u0dd6':'\u0942',
        '\u0dd8':'\u0943',
        '\u0dd9':'\u0946',
        '\u0dda':'\u0947',
        '\u0ddb':'\u0948',
        '\u0ddc':'\u094a',
        '\u0ddd':'\u094b',
        '\u0dde':'\u094c',
        '\u0dca':'\u094d',
    }

    # Devanagari character -> Sinhala character
    devnag_sinhala_map={
        '\u0900':'\u0d82',
        '\u0901':'\u0d82',
        '\u0902':'\u0d82',
        '\u0903':'\u0d83',
        '\u0904':'\u0d84',
        '\u0905':'\u0d85',
        '\u0906':'\u0d86',
        '\u0907':'\u0d89',
        '\u0908':'\u0d8a',
        '\u0909':'\u0d8b',
        '\u090a':'\u0d8c',
        '\u090b':'\u0d8d',
        '\u090c':'\u0d8f',
        '\u090d':'\u0d88',
        '\u090e':'\u0d91',
        '\u090f':'\u0d92',
        '\u0910':'\u0d93',
        '\u0912':'\u0d94',
        '\u0913':'\u0d95',
        '\u0914':'\u0d96',
        '\u0915':'\u0d9a',
        '\u0916':'\u0d9b',
        '\u0917':'\u0d9c',
        '\u0918':'\u0d9d',
        '\u0919':'\u0d9e',
        '\u091a':'\u0da0',
        '\u091b':'\u0da1',
        '\u091c':'\u0da2',
        '\u091d':'\u0da3',
        '\u091e':'\u0da4',
        '\u091f':'\u0da7',
        '\u0920':'\u0da8',
        '\u0921':'\u0da9',
        '\u0922':'\u0daa',
        '\u0923':'\u0dab',
        '\u0924':'\u0dad',
        '\u0925':'\u0dae',
        '\u0926':'\u0daf',
        '\u0927':'\u0db0',
        '\u0928':'\u0db1',
        '\u0929':'\u0db1',
        '\u092a':'\u0db4',
        '\u092b':'\u0db5',
        '\u092c':'\u0db6',
        '\u092d':'\u0db7',
        '\u092e':'\u0db8',
        '\u092f':'\u0dba',
        '\u0930':'\u0dbb',
        '\u0932':'\u0dbd',
        '\u0933':'\u0dc5',
        '\u0935':'\u0dc0',
        '\u0936':'\u0dc1',
        '\u0937':'\u0dc2',
        '\u0938':'\u0dc3',
        '\u0939':'\u0dc4',
        '\u093e':'\u0dcf',
        '\u0949':'\u0dd1',
        '\u093f':'\u0dd2',
        '\u0940':'\u0dd3',
        '\u0941':'\u0dd4',
        '\u0942':'\u0dd6',
        '\u0943':'\u0dd8',
        '\u0946':'\u0dd9',
        '\u0947':'\u0dda',
        '\u0948':'\u0ddb',
        '\u094a':'\u0ddc',
        '\u094b':'\u0ddd',
        '\u094c':'\u0dde',
        '\u094d':'\u0dca',

    }

    # Translation tables derived once from the maps above so a whole string
    # is converted in a single str.translate pass. They are snapshots: later
    # changes to the dictionaries are not reflected in them.
    _SINHALA_TO_DEVNAG_TABLE=str.maketrans(sinhala_devnag_map)
    _DEVNAG_TO_SINHALA_TABLE=str.maketrans(devnag_sinhala_map)

    @staticmethod
    def devanagari_to_sinhala(text):
        """Replace every mapped Devanagari character in `text` by its Sinhala
        counterpart; all other characters are kept as they are."""
        return text.translate(SinhalaDevanagariTransliterator._DEVNAG_TO_SINHALA_TABLE)

    @staticmethod
    def sinhala_to_devanagari(text):
        """Replace every mapped Sinhala character in `text` by its Devanagari
        counterpart; all other characters are kept as they are."""
        return text.translate(SinhalaDevanagariTransliterator._SINHALA_TO_DEVNAG_TABLE)

# diff --git a/indic_nlp_library/indicnlp/transliterate/unicode_transliterate.py b/indic_nlp_library/indicnlp/transliterate/unicode_transliterate.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..9754b40821b519aeee669973156d970b18ef6f3b
# --- /dev/null
# +++ b/indic_nlp_library/indicnlp/transliterate/unicode_transliterate.py
# @@ -0,0 +1,347 @@
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# + +#Program for text written in one Indic script to another based on Unicode mappings. +# +# @author Anoop Kunchukuttan +# + +import sys, string, itertools, re, os +from collections import defaultdict + +from indicnlp import common +from indicnlp import langinfo +from indicnlp.script import indic_scripts as isc +from indicnlp.transliterate.sinhala_transliterator import SinhalaDevanagariTransliterator as sdt +import pandas as pd + +OFFSET_TO_ITRANS={} +ITRANS_TO_OFFSET=defaultdict(list) + +DUPLICATE_ITRANS_REPRESENTATIONS={} + + +def init(): + """ + To be called by library loader, do not call it in your program + """ + + ### Load the ITRANS-script offset map. The map was initially generated using the snippet below (uses the old itrans transliterator) + ### The map is modified as needed to accomodate extensions and corrections to the mappings + # + # base=0x900 + # l=[] + # for i in range(0,0x80): + # c=chr(base+i) + # itrans=ItransTransliterator.to_itrans(c,'hi') + # l.append((hex(i),c,itrans)) + # print(l) + # + # pd.DataFrame(l,columns=['offset_hex','devnag_char','itrans']).to_csv('offset_itrans_map.csv',index=False,encoding='utf-8') + + itrans_map_fname=os.path.join(common.get_resources_path(),'transliterate','offset_itrans_map.csv') + #itrans_map_fname=r'D:\src\python_sandbox\src\offset_itrans_map.csv' + itrans_df=pd.read_csv(itrans_map_fname,encoding='utf-8') + + global OFFSET_TO_ITRANS, ITRANS_TO_OFFSET, DUPLICATE_ITRANS_REPRESENTATIONS + + for r in itrans_df.iterrows(): + itrans=r[1]['itrans'] + o=int(r[1]['offset_hex'],base=16) + + OFFSET_TO_ITRANS[o]=itrans + + if langinfo.is_consonant_offset(o): + ### for consonants, strip the schwa - add halant offset + ITRANS_TO_OFFSET[itrans[:-1]].extend([o,0x4d]) + else: + ### the append assumes that the maatra always comes after independent vowel in the df + 
ITRANS_TO_OFFSET[itrans].append(o) + + + DUPLICATE_ITRANS_REPRESENTATIONS = { + 'A': 'aa', + 'I': 'ii', + 'U': 'uu', + 'RRi': 'R^i', + 'RRI': 'R^I', + 'LLi': 'L^i', + 'LLI': 'L^I', + 'L': 'ld', + 'w': 'v', + 'x': 'kSh', + 'gj': 'j~n', + 'dny': 'j~n', + '.n': '.m', + 'M': '.m', + 'OM': 'AUM' + } + +class UnicodeIndicTransliterator(object): + """ + Base class for rule-based transliteration among Indian languages. + + Script pair specific transliterators should derive from this class and override the transliterate() method. + They can call the super class 'transliterate()' method to avail of the common transliteration + """ + + @staticmethod + def _correct_tamil_mapping(offset): + # handle missing unaspirated and voiced plosives in Tamil script + # replace by unvoiced, unaspirated plosives + + # for first 4 consonant rows of varnamala + # exception: ja has a mapping in Tamil + if offset>=0x15 and offset<=0x28 and \ + offset!=0x1c and \ + not ( (offset-0x15)%5==0 or (offset-0x15)%5==4 ) : + subst_char=(offset-0x15)//5 + offset=0x15+5*subst_char + + # for 5th consonant row of varnamala + if offset in [ 0x2b, 0x2c, 0x2d]: + offset=0x2a + + # 'sh' becomes 'Sh' + if offset==0x36: + offset=0x37 + + return offset + + @staticmethod + def transliterate(text,lang1_code,lang2_code): + """ + convert the source language script (lang1) to target language script (lang2) + + text: text to transliterate + lang1_code: language 1 code + lang1_code: language 2 code + """ + if lang1_code in langinfo.SCRIPT_RANGES and lang2_code in langinfo.SCRIPT_RANGES: + + # if Sinhala is source, do a mapping to Devanagari first + if lang1_code=='si': + text=sdt.sinhala_to_devanagari(text) + lang1_code='hi' + + # if Sinhala is target, make Devanagiri the intermediate target + org_lang2_code='' + if lang2_code=='si': + lang2_code='hi' + org_lang2_code='si' + + trans_lit_text=[] + for c in text: + newc=c + offset=ord(c)-langinfo.SCRIPT_RANGES[lang1_code][0] + if offset 
>=langinfo.COORDINATED_RANGE_START_INCLUSIVE and offset <= langinfo.COORDINATED_RANGE_END_INCLUSIVE and c!='\u0964' and c!='\u0965': + if lang2_code=='ta': + # tamil exceptions + offset=UnicodeIndicTransliterator._correct_tamil_mapping(offset) + newc=chr(langinfo.SCRIPT_RANGES[lang2_code][0]+offset) + + trans_lit_text.append(newc) + + # if Sinhala is source, do a mapping to Devanagari first + if org_lang2_code=='si': + return sdt.devanagari_to_sinhala(''.join(trans_lit_text)) + + return ''.join(trans_lit_text) + else: + return text + +class ItransTransliterator(object): + """ + Transliterator between Indian scripts and ITRANS + """ + + @staticmethod + def to_itrans(text,lang_code): + if lang_code in langinfo.SCRIPT_RANGES: + if lang_code=='ml': + # Change from chillus characters to corresponding consonant+halant + text=text.replace('\u0d7a','\u0d23\u0d4d') + text=text.replace('\u0d7b','\u0d28\u0d4d') + text=text.replace('\u0d7c','\u0d30\u0d4d') + text=text.replace('\u0d7d','\u0d32\u0d4d') + text=text.replace('\u0d7e','\u0d33\u0d4d') + text=text.replace('\u0d7f','\u0d15\u0d4d') + + offsets = [ isc.get_offset(c,lang_code) for c in text ] + + ### naive lookup + # itrans_l = [ OFFSET_TO_ITRANS.get(o, '-' ) for o in offsets ] + itrans_l=[] + for o in offsets: + itrans=OFFSET_TO_ITRANS.get(o, chr(langinfo.SCRIPT_RANGES[lang_code][0]+o) ) + if langinfo.is_halanta_offset(o): + itrans='' + if len(itrans_l)>0: + itrans_l.pop() + elif langinfo.is_vowel_sign_offset(o) and len(itrans_l)>0: + itrans_l.pop() + itrans_l.extend(itrans) + + return ''.join(itrans_l) + + else: + return text + + @staticmethod + def from_itrans(text,lang): + """ + TODO: Document this method properly + TODO: A little hack is used to handle schwa: needs to be documented + TODO: check for robustness + """ + + MAXCODE=4 ### TODO: Needs to be fixed + + ## handle_duplicate_itrans_representations + for k, v in DUPLICATE_ITRANS_REPRESENTATIONS.items(): + if k in text: + text=text.replace(k,v) + + start=0 + 
match=None + solution=[] + + i=start+1 + while i<=len(text): + + itrans=text[start:i] + + # print('===') + # print('i: {}'.format(i)) + # if i0 and langinfo.is_halanta(solution[-1],lang): + offs=[offs[1]] ## dependent vowel + else: + offs=[offs[0]] ## independent vowel + + c=''.join([ langinfo.offset_to_char(x,lang) for x in offs ]) + match=(i,c) + + elif len(itrans)==1: ## unknown character + match=(i,itrans) + elif i ") + sys.exit(1) + + if sys.argv[1]=='transliterate': + + src_language=sys.argv[4] + tgt_language=sys.argv[5] + + with open(sys.argv[2],'r', encoding='utf-8') as ifile: + with open(sys.argv[3],'w', encoding='utf-8') as ofile: + for line in ifile.readlines(): + transliterated_line=UnicodeIndicTransliterator.transliterate(line,src_language,tgt_language) + ofile.write(transliterated_line) + + elif sys.argv[1]=='romanize': + + language=sys.argv[4] + + ### temp fix to replace anusvara with corresponding nasal + #r1_nasal=re.compile(ur'\u0902([\u0915-\u0918])') + #r2_nasal=re.compile(ur'\u0902([\u091a-\u091d])') + #r3_nasal=re.compile(ur'\u0902([\u091f-\u0922])') + #r4_nasal=re.compile(ur'\u0902([\u0924-\u0927])') + #r5_nasal=re.compile(ur'\u0902([\u092a-\u092d])') + + with open(sys.argv[2],'r', encoding='utf-8') as ifile: + with open(sys.argv[3],'w', encoding='utf-8') as ofile: + for line in ifile.readlines(): + ### temp fix to replace anusvara with corresponding nasal + #line=r1_nasal.sub(u'\u0919\u094D\\1',line) + #line=r2_nasal.sub(u'\u091e\u094D\\1',line) + #line=r3_nasal.sub(u'\u0923\u094D\\1',line) + #line=r4_nasal.sub(u'\u0928\u094D\\1',line) + #line=r5_nasal.sub(u'\u092e\u094D\\1',line) + + transliterated_line=ItransTransliterator.to_itrans(line,language) + + ## temp fix to replace 'ph' to 'F' to match with Urdu transliteration scheme + transliterated_line=transliterated_line.replace('ph','f') + + ofile.write(transliterated_line) + + elif sys.argv[1]=='indicize': + + language=sys.argv[4] + + with open(sys.argv[2],'r', encoding='utf-8') as ifile: + 
with open(sys.argv[3],'w', encoding='utf-8') as ofile: + for line in ifile.readlines(): + transliterated_line=ItransTransliterator.from_itrans(line,language) + ofile.write(transliterated_line) + diff --git a/indic_nlp_library/indicnlp/version.txt b/indic_nlp_library/indicnlp/version.txt new file mode 100644 index 0000000000000000000000000000000000000000..453a698ed5ba350bb9e62e4c9995bb0828ed0cd3 --- /dev/null +++ b/indic_nlp_library/indicnlp/version.txt @@ -0,0 +1 @@ +0.81 diff --git a/indic_nlp_library/requirements.txt b/indic_nlp_library/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d92c692126bd50f3ed49db616c7f108513ac30b2 --- /dev/null +++ b/indic_nlp_library/requirements.txt @@ -0,0 +1,5 @@ +sphinx-argparse +sphinx_rtd_theme +morfessor +pandas +numpy diff --git a/indic_nlp_library/setup.py b/indic_nlp_library/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..8b132dc2faeab6c863c6d5ecf04863b2191afdcb --- /dev/null +++ b/indic_nlp_library/setup.py @@ -0,0 +1,48 @@ +import setuptools +from pkg_resources import parse_requirements +import pathlib +import os + +def write_version_py(): + with open(os.path.join("indicnlp", "version.txt")) as f: + version = f.read().strip() + + # write version info to indicnlp/version.py + with open(os.path.join("indicnlp", "version.py"), "w") as f: + f.write('__version__ = "{}"\n'.format(version)) + return version + +with open("README.md", "r") as fh: + long_description = fh.read() + +version=write_version_py() + +setuptools.setup( + name="indic_nlp_library", # Replace with your own username + version=version, + author="Anoop Kunchukuttan", + author_email="anoop.kunchukuttan@gmail.com", + description="The goal of the Indic NLP Library is to build Python based libraries for common"\ + ' text processing and Natural Language Processing in Indian languages.', + long_description=long_description, + long_description_content_type="text/markdown", + 
url="https://github.com/anoopkunchukuttan/indic_nlp_library", + # project_urls={ + # "Bug Tracker": "https://bugs.example.com/HelloWorld/", + # "Documentation": "https://docs.example.com/HelloWorld/", + # "Source Code": "https://code.example.com/HelloWorld/", + # }, + packages=setuptools.find_packages(), + license='MIT', + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.5', + download_url='https://github.com/anoopkunchukuttan/indic_nlp_library/archive/master.zip', + install_requires=[ + str(requirement) for requirement + in parse_requirements(pathlib.Path('requirements.txt').open()) + ] +) diff --git a/indic_nlp_library/test_data/morph/mr.txt b/indic_nlp_library/test_data/morph/mr.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7c7b6dceefd91c229ef5f080e11dc617a3e69f2 --- /dev/null +++ b/indic_nlp_library/test_data/morph/mr.txt @@ -0,0 +1,10 @@ +स्वच्छ उच्छ्वास आणि चमकदार दात हे आपले व्यक्तिमत्व खुलवतील . +दातांमुळे आपला आत्मविश्वाससुद्धा वाढतो . +आपल्या हिरड्यांच्या आणि दातांच्यामध्ये जीवाणू असतात . +हे दातांना अस्वच्छ आणि श्वासांना दुर्गंधित करतात . +इथे दिलेल्या काही सोप्या सूचना टिपांच्या मदतीने आपण आपल्या दातांना आणि उच्छ्वासास स्वच्छ ठेवू शकतो . +दातांना नीट साफ करा . +दातांना नीट साफ करण्यासाठी दोन ते तीन मिनटांचा कालावधी लागतो . +परंतु बहुतेक लोक ह्याच्यासाठी एक मिनटापेक्षाही कमी वेळ देतात . +खूप पाणी प्या . +तोंड कोरडे पडल्यावर जीवाणू जोरात हल्ला करतात . 
diff --git a/indic_nlp_library/test_data/normalize/bn.txt b/indic_nlp_library/test_data/normalize/bn.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc89c90a2caca7bcaf6ca6648fffe46dce8bed6d --- /dev/null +++ b/indic_nlp_library/test_data/normalize/bn.txt @@ -0,0 +1 @@ +তাজা শ্বাস আর ঝকঝকে দাঁত আপনার ব্যক্তিত্বের পরিচয় দেয় ৷ diff --git a/indic_nlp_library/test_data/normalize/en.txt b/indic_nlp_library/test_data/normalize/en.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab170a8214ec2aa0c4fdea6ef3801515d29cc739 --- /dev/null +++ b/indic_nlp_library/test_data/normalize/en.txt @@ -0,0 +1 @@ +Fresh breath and shining teeth enhance your personality . diff --git a/indic_nlp_library/test_data/normalize/gu.txt b/indic_nlp_library/test_data/normalize/gu.txt new file mode 100644 index 0000000000000000000000000000000000000000..34be75d24711daac9fcafcaddb16ff72ec6ccac7 --- /dev/null +++ b/indic_nlp_library/test_data/normalize/gu.txt @@ -0,0 +1 @@ +તાજા શ્વાસ અને ચમક્તા દાંત તમારા વ્યક્તિત્વને નિખારે છે . diff --git a/indic_nlp_library/test_data/normalize/hi.txt b/indic_nlp_library/test_data/normalize/hi.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4823169ee87daaa8da509f91c7d9c5e210fa014 --- /dev/null +++ b/indic_nlp_library/test_data/normalize/hi.txt @@ -0,0 +1 @@ +ताजा साँसें और चमचमाते दाँत आपके व्यक्तित्व को निखारते हैं । diff --git a/indic_nlp_library/test_data/normalize/kK.txt b/indic_nlp_library/test_data/normalize/kK.txt new file mode 100644 index 0000000000000000000000000000000000000000..528ad7a726399964158fc46330958231967128d8 --- /dev/null +++ b/indic_nlp_library/test_data/normalize/kK.txt @@ -0,0 +1 @@ +ताजो स्वास आनी चकचकीत दांत तुमचें व्यक्तीमत्व परजळायतात . 
diff --git a/indic_nlp_library/test_data/normalize/ma.txt b/indic_nlp_library/test_data/normalize/ma.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ce02ae32daa5f2a26d280d18ce0ba66c8f624bf --- /dev/null +++ b/indic_nlp_library/test_data/normalize/ma.txt @@ -0,0 +1 @@ +ഉന്മേഷമുള്ള ശ്വാസവും , തിളങ്ങുന്ന പല്ലുകളും താങ്കളുടെ വ്യക്തിത്വത്തെ ശോഭിപ്പിക്കുന്നു . diff --git a/indic_nlp_library/test_data/normalize/mr.txt b/indic_nlp_library/test_data/normalize/mr.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb5b8b22ab4b25bcb1cdc5c69a0c0826e2e8e031 --- /dev/null +++ b/indic_nlp_library/test_data/normalize/mr.txt @@ -0,0 +1 @@ +स्वच्छ उच्छ्वास आणि चमकदार दात हे आपले व्यक्तिमत्व खुलवतील . diff --git a/indic_nlp_library/test_data/normalize/pa.txt b/indic_nlp_library/test_data/normalize/pa.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb3f043e622d66b1bc39e9cfa40313000b2fc99c --- /dev/null +++ b/indic_nlp_library/test_data/normalize/pa.txt @@ -0,0 +1 @@ + ਤਾਜ਼ੇ ਸਾਹ ਅਤੇ ਚਮਕਦੇ ਦੰਦ ਤੁਹਾਡੇ ਵਿਅਕਤਿਤਵ ਨੂੰ ਨਿਖਾਰਦੇ ਹਨ | diff --git a/indic_nlp_library/test_data/normalize/ta.txt b/indic_nlp_library/test_data/normalize/ta.txt new file mode 100644 index 0000000000000000000000000000000000000000..a07ee77bf42a798d1d5da8c0f0646319a19a96e3 --- /dev/null +++ b/indic_nlp_library/test_data/normalize/ta.txt @@ -0,0 +1 @@ +புத்துணர்ச்சியான சுவாசம் மற்றும் பளபளப்பான பற்கள் தங்களின் தோற்றத்தை நிர்ணயிக்கிறது . diff --git a/indic_nlp_library/test_data/normalize/te.txt b/indic_nlp_library/test_data/normalize/te.txt new file mode 100644 index 0000000000000000000000000000000000000000..f10ec6aedd75a38e5df825ab0f85cf0a73f033dd --- /dev/null +++ b/indic_nlp_library/test_data/normalize/te.txt @@ -0,0 +1 @@ +తాజాశ్వాస మరియు మిలమిల మెరిసే ద౦తాలు మీ వ్యక్తిత్వాన్ని వికసి౦పజేస్తాయి . 
diff --git a/indic_nlp_library/test_data/normalize/ur.txt b/indic_nlp_library/test_data/normalize/ur.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ffdd5d401193a94e955895f3b64a33f77fb8fd3 --- /dev/null +++ b/indic_nlp_library/test_data/normalize/ur.txt @@ -0,0 +1 @@ +تازہ سانسیں اور چمکتے دانت آپ کی شخصیت کو نکھارتے ہیں ۔ diff --git a/indic_nlp_library/test_data/tokenize/trivial.txt b/indic_nlp_library/test_data/tokenize/trivial.txt new file mode 100644 index 0000000000000000000000000000000000000000..e06665770ca5d14ce29cad42f1dbc4d52c41df74 --- /dev/null +++ b/indic_nlp_library/test_data/tokenize/trivial.txt @@ -0,0 +1,4 @@ +सांबा हो... फुटबॉल महासंग्राम सुरू! +मटा ऑनलाइन वृत्त । साओ पावलो +अवघ्या जगाचे लक्ष लागलेला फुटबॉलचा महासंग्राम अखेर सुरू झाला आहे. ब्राझीलमधील सर्वात मोठे महानगर असलेल्या साओ पावलो येथे ब्राझीलच्या संस्कृतीचे दर्शन घडवणाऱ्या रंगारंग कार्यक्रमाने फिफा वर्ल्डकपचे उद्घाटन झाले +'सांबा'च्या तालावर थिरकणारे हजारो कलावंत आणि पॉपस्टार जेनिफर लोपेझ, ब्राझीलियन स्टार क्लॉडिया लेइट्टे आणि पिटबूल यांचा धमाकेदार परफॉर्मन्स यामुळे स्टेडियममध्ये जमलेल्या फुटबॉल चाहत्यांच्या डोळ्यांचे पारणे फिटले. 
diff --git a/indic_nlp_library/test_data/transliterate.ipynb b/indic_nlp_library/test_data/transliterate.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2ab2dddb60cadea6a10ad9d9c890b6e72d764530 --- /dev/null +++ b/indic_nlp_library/test_data/transliterate.ipynb @@ -0,0 +1,153 @@ +{ + "metadata": { + "name": "" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "!pwd" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "/home/anoop/src/python/indic_nlp_library/src/indicnlp/transliterate\r\n" + ] + } + ], + "prompt_number": 9 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import sys\n", + "sys.path.append('/home/anoop/src/python/indic_nlp_library/src')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 10 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from indicnlp.transliterate import itrans_transliterator\n", + "from indicnlp.transliterate import unicode_transliterate\n", + "from indicnlp.normalize import indic_normalize" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 3 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "print itrans_transliterator.transliterate('chakra', 'itrans',\n", + " 'devanagari', {'outputASCIIEncoded' : False})\n", + " " + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\u091a\u0915\u094d\u0930\n" + ] + } + ], + "prompt_number": 13 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "text='jammuH pAk sainyattinR.e .ozhippiccu Sh.e .eTT perkk . parikkeR.hRu atirtta .eTT perkk parikkeRRu. atirtti grAma~N~NaLil ninn AyirattoLa.n per.e .ozhippiccu. jammu atirttiyil yuddhAntarIkShamAN nilanilkkunnat. 
atirtti grAma~N~NaLeyu.n sainika posRRukaLeyu.n lakShya.nvacc pAkistAnR.e AkramaNa.n tuTarukayAN. jammu kashmIril.e atirtti jillakaLAya kAtva, sAmpa .enniviTa~N~NaLilAN AkramaNa.n naTakkunnat.'\n", + "print unicode_transliterate.ItransTransliterator.from_itrans(text,'te')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\u0c1c\u0c2e\u0c4d\u0c2e\u0c41\u0c03 \u0c2a\u0c3e\u0c15\u0c4d \u0c38\u0c48\u0c28\u0c4d\u0c2f\u0c24\u0c4d\u0c24\u0c3f\u0c28\u0c4d\u0c31\u0c46 \u0c12\u0c34\u0c3f\u0c2a\u0c4d\u0c2a\u0c3f\u0c1a\u0c4d\u0c1a\u0c41 \u0c37\u0c46 \u0c0e\u0c1f\u0c4d\u0c1f\u0c4d \u0c2a\u0c47\u0c30\u0c4d\u0c15\u0c4d\u0c15\u0c4d \u0c64 \u0c2a\u0c30\u0c3f\u0c15\u0c4d\u0c15\u0c47\u0c31\u0c03\u0c31\u0c41 \u0c05\u0c24\u0c3f\u0c30\u0c4d\u0c24\u0c4d\u0c24 \u0c0e\u0c1f\u0c4d\u0c1f\u0c4d \u0c2a\u0c47\u0c30\u0c4d\u0c15\u0c4d\u0c15\u0c4d \u0c2a\u0c30\u0c3f\u0c15\u0c4d\u0c15\u0c47RR\u0c09\u0c64 \u0c05\u0c24\u0c3f\u0c30\u0c4d\u0c24\u0c4d\u0c24\u0c3f \u0c17\u0c4d\u0c30\u0c3e\u0c2e\u0c19\u0c4d\u0c19\u0c33\u0c3f\u0c32\u0c4d \u0c28\u0c3f\u0c28\u0c4d\u0c28\u0c4d \u0c06\u0c2f\u0c3f\u0c30\u0c24\u0c4d\u0c24\u0c4b\u0c33\u0c02 \u0c2a\u0c47\u0c30\u0c46 \u0c12\u0c34\u0c3f\u0c2a\u0c4d\u0c2a\u0c3f\u0c1a\u0c4d\u0c1a\u0c41\u0c64 \u0c1c\u0c2e\u0c4d\u0c2e\u0c41 \u0c05\u0c24\u0c3f\u0c30\u0c4d\u0c24\u0c4d\u0c24\u0c3f\u0c2f\u0c3f\u0c32\u0c4d \u0c2f\u0c41\u0c26\u0c4d\u0c27\u0c3e\u0c28\u0c4d\u0c24\u0c30\u0c40\u0c15\u0c4d\u0c37\u0c2e\u0c3e\u0c23\u0c4d \u0c28\u0c3f\u0c32\u0c28\u0c3f\u0c32\u0c4d\u0c15\u0c4d\u0c15\u0c41\u0c28\u0c4d\u0c28\u0c24\u0c64 \u0c05\u0c24\u0c3f\u0c30\u0c4d\u0c24\u0c4d\u0c24\u0c3f \u0c17\u0c4d\u0c30\u0c3e\u0c2e\u0c19\u0c4d\u0c19\u0c33\u0c47\u0c2f\u0c41\u0c02 \u0c38\u0c48\u0c28\u0c3f\u0c15 \u0c2a\u0c4b\u0c38\u0c4dRR\u0c09\u0c15\u0c33\u0c47\u0c2f\u0c41\u0c02 \u0c32\u0c15\u0c4d\u0c37\u0c4d\u0c2f\u0c02\u0c35\u0c1a\u0c4d\u0c1a\u0c4d \u0c2a\u0c3e\u0c15\u0c3f\u0c38\u0c4d\u0c24\u0c3e\u0c28\u0c4d\u0c31\u0c46 
\u0c06\u0c15\u0c4d\u0c30\u0c2e\u0c23\u0c02 \u0c24\u0c41\u0c1f\u0c30\u0c41\u0c15\u0c2f\u0c3e\u0c23\u0c64 \u0c1c\u0c2e\u0c4d\u0c2e\u0c41 \u0c15\u0c36\u0c4d\u0c2e\u0c40\u0c30\u0c3f\u0c32\u0c46 \u0c05\u0c24\u0c3f\u0c30\u0c4d\u0c24\u0c4d\u0c24\u0c3f \u0c1c\u0c3f\u0c32\u0c4d\u0c32\u0c15\u0c33\u0c3e\u0c2f \u0c15\u0c3e\u0c24\u0c4d\u0c35, \u0c38\u0c3e\u0c2e\u0c4d\u0c2a \u0c0e\u0c28\u0c4d\u0c28\u0c3f\u0c35\u0c3f\u0c1f\u0c19\u0c4d\u0c19\u0c33\u0c3f\u0c32\u0c3e\u0c23\u0c4d \u0c06\u0c15\u0c4d\u0c30\u0c2e\u0c23\u0c02 \u0c28\u0c1f\u0c15\u0c4d\u0c15\u0c41\u0c28\u0c4d\u0c28\u0c24\u0c64\n" + ] + } + ], + "prompt_number": 19 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#mal=u'\u0d1c\u0d2e\u0d4d\u0d2e\u0d41: \u0d2a\u0d3e\u0d15\u0d4d \u0d38\u0d48\u0d28\u0d4d\u0d2f\u0d24\u0d4d\u0d24\u0d3f\u0d28\u0d4d\u0d31\u0d46 \u0d12\u0d34\u0d3f\u0d2a\u0d4d\u0d2a\u0d3f\u0d1a\u0d4d\u0d1a\u0d41 \u0d37\u0d46 \u0d0e\u0d1f\u0d4d\u0d1f\u0d4d \u0d2a\u0d47\u0d30\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d . \u0d2a\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d47\u0d31\u0d4d\u0d31\u0d41 \u0d05\u0d24\u0d3f\u0d30\u0d4d\u200d\u0d24\u0d4d\u0d24 \u0d0e\u0d1f\u0d4d\u0d1f\u0d4d \u0d2a\u0d47\u0d30\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d4d \u0d2a\u0d30\u0d3f\u0d15\u0d4d\u0d15\u0d47\u0d31\u0d4d\u0d31\u0d41. \u0d05\u0d24\u0d3f\u0d30\u0d4d\u200d\u0d24\u0d4d\u0d24\u0d3f \u0d17\u0d4d\u0d30\u0d3e\u0d2e\u0d19\u0d4d\u0d19\u0d33\u0d3f\u0d32\u0d4d\u200d \u0d28\u0d3f\u0d28\u0d4d\u0d28\u0d4d \u0d06\u0d2f\u0d3f\u0d30\u0d24\u0d4d\u0d24\u0d4b\u0d33\u0d02 \u0d2a\u0d47\u0d30\u0d46 \u0d12\u0d34\u0d3f\u0d2a\u0d4d\u0d2a\u0d3f\u0d1a\u0d4d\u0d1a\u0d41. \u0d1c\u0d2e\u0d4d\u0d2e\u0d41 \u0d05\u0d24\u0d3f\u0d30\u0d4d\u200d\u0d24\u0d4d\u0d24\u0d3f\u0d2f\u0d3f\u0d32\u0d4d\u200d \u0d2f\u0d41\u0d26\u0d4d\u0d27\u0d3e\u0d28\u0d4d\u0d24\u0d30\u0d40\u0d15\u0d4d\u0d37\u0d2e\u0d3e\u0d23\u0d4d \u0d28\u0d3f\u0d32\u0d28\u0d3f\u0d32\u0d4d\u200d\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d. 
\u0d05\u0d24\u0d3f\u0d30\u0d4d\u200d\u0d24\u0d4d\u0d24\u0d3f \u0d17\u0d4d\u0d30\u0d3e\u0d2e\u0d19\u0d4d\u0d19\u0d33\u0d47\u0d2f\u0d41\u0d02 \u0d38\u0d48\u0d28\u0d3f\u0d15 \u0d2a\u0d4b\u0d38\u0d4d\u0d31\u0d4d\u0d31\u0d41\u0d15\u0d33\u0d47\u0d2f\u0d41\u0d02 \u0d32\u0d15\u0d4d\u0d37\u0d4d\u0d2f\u0d02\u0d35\u0d1a\u0d4d\u0d1a\u0d4d \u0d2a\u0d3e\u0d15\u0d3f\u0d38\u0d4d\u0d24\u0d3e\u0d28\u0d4d\u0d31\u0d46 \u0d06\u0d15\u0d4d\u0d30\u0d2e\u0d23\u0d02 \u0d24\u0d41\u0d1f\u0d30\u0d41\u0d15\u0d2f\u0d3e\u0d23\u0d4d. \u0d1c\u0d2e\u0d4d\u0d2e\u0d41 \u0d15\u0d36\u0d4d\u0d2e\u0d40\u0d30\u0d3f\u0d32\u0d46 \u0d05\u0d24\u0d3f\u0d30\u0d4d\u200d\u0d24\u0d4d\u0d24\u0d3f \u0d1c\u0d3f\u0d32\u0d4d\u0d32\u0d15\u0d33\u0d3e\u0d2f \u0d15\u0d3e\u0d24\u0d4d\u0d35, \u0d38\u0d3e\u0d2e\u0d4d\u0d2a \u0d0e\u0d28\u0d4d\u0d28\u0d3f\u0d35\u0d3f\u0d1f\u0d19\u0d4d\u0d19\u0d33\u0d3f\u0d32\u0d3e\u0d23\u0d4d \u0d06\u0d15\u0d4d\u0d30\u0d2e\u0d23\u0d02 \u0d28\u0d1f\u0d15\u0d4d\u0d15\u0d41\u0d28\u0d4d\u0d28\u0d24\u0d4d.'\n", + "\n", + "text=u'\u09ac\u09be\u09b0\u09ac\u09be\u09b0 \u09b8\u0982\u0998\u09b0\u09cd\u09b7-\u09ac\u09bf\u09b0\u09a4\u09bf \u099a\u09c1\u0995\u09cd\u09a4\u09bf \u09b2\u0999\u09cd\u0998\u09a8 \u0995\u09b0\u099b\u09c7 \u09aa\u09be\u0995\u09bf\u09b8\u09cd\u09a4\u09be\u09a8\u0964 \u09ae\u09a8\u09c7 \u09b0\u09be\u0996\u09a4\u09c7 \u09b9\u09ac\u09c7, \u09ad\u09be\u09b0\u09a4\u09c7\u09b0\u0993 \u09b8\u09b9\u09cd\u09af \u0995\u09b0\u09be\u09b0 \u09b8\u09c0\u09ae\u09be \u0986\u099b\u09c7\u0964 \u098f\u0987 \u09ae\u09b0\u09cd\u09ae\u09c7 \u0987\u09b8\u09b2\u09be\u09ae\u09be\u09ac\u09be\u09a6\u0995\u09c7 \u09b9\u09c1\u0981\u09b6\u09bf\u09df\u09be\u09b0\u09bf \u09a6\u09bf\u09b2\u09c7\u09a8 \u09b8\u09cd\u09ac\u09b0\u09be\u09b7\u09cd\u099f\u09cd\u09b0\u09ae\u09a8\u09cd\u09a4\u09cd\u09b0\u09c0 \u09b0\u09be\u099c\u09a8\u09be\u09a5 \u09b8\u09bf\u0982\u0964 \u09e8\u09e6\u09e7\u09ea \u09b8\u09be\u09b2\u09c7 \u09eb\u09eb\u09e6 \u09ac\u09be\u09b0 \u09b8\u0982\u0998\u09b0\u09cd\u09b7-\u09ac\u09bf\u09b0\u09a4\u09bf 
\u09b2\u0999\u09cd\u0998\u09a8 \u0995\u09b0\u09c7 \u09b8\u09c0\u09ae\u09be\u09a8\u09cd\u09a4\u09c7 \u0997\u09cb\u09b2\u09be\u0997\u09c1\u09b2\u09bf \u099a\u09be\u09b2\u09bf\u09df\u09c7\u099b\u09c7 \u09aa\u09be\u0995\u09bf\u09b8\u09cd\u09a4\u09be\u09a8\u0964 \u09a8\u09a4\u09c1\u09a8 \u09ac\u099b\u09b0\u09c7\u09b0 \u09aa\u09cd\u09b0\u09a5\u09ae \u09a4\u09bf\u09a8\u09a6\u09bf\u09a8\u09c7 \u0987\u09a4\u09bf\u09ae\u09a7\u09cd\u09af\u09c7 \u09aa\u09be\u0981\u099a\u09ac\u09be\u09b0 \u09a4\u09c7\u09ae\u09a8 \u0998\u099f\u09a8\u09be \u0998\u099f\u09c7\u099b\u09c7\u0964 \u09af\u09a6\u09bf\u0993 \u09b8\u09c7-\u09a6\u09c7\u09b6\u09c7\u09b0 \u09aa\u09cd\u09b0\u09a7\u09be\u09a8\u09ae\u09a8\u09cd\u09a4\u09cd\u09b0\u09c0 \u09a8\u0993\u09df\u09be\u099c \u09b6\u09b0\u09bf\u09ab, \u09aa\u09cd\u09b0\u09a4\u09bf\u09b0\u0995\u09cd\u09b7\u09be\u09ae\u09a8\u09cd\u09a4\u09cd\u09b0\u09c0 \u0996\u09cb\u09df\u09be\u099c\u09be \u0986\u09b8\u09bf\u09ab \u09aa\u09cd\u09b0\u09ae\u09c1\u0996 \u09ac\u09b2\u09c7\u099b\u09c7\u09a8, \u09ad\u09be\u09b0\u09a4\u0987 \u09ac\u09bf\u09a8\u09be \u09aa\u09cd\u09b0\u09b0\u09cb\u099a\u09a8\u09be\u09df \u09b9\u09be\u09ae\u09b2\u09be \u099a\u09be\u09b2\u09be\u099a\u09cd\u099b\u09c7\u0964'\n", + "lang='bn'\n", + "\n", + "n=indic_normalize.IndicNormalizerFactory().get_normalizer(lang)\n", + "text=n.normalize(text)\n", + "\n", + "print unicode_transliterate.ItransTransliterator.to_itrans(text,lang)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "bArabAra sa.ngharSha-birati cukti la~Nghana karaChe pAkistAna. mane rAkhate habe, bhAratero sahya karAra sImA AChe. ei marme isalAmAbAdake hu\u0901shiya\u093cAri dilena sbarAShTramantrI rAjanAtha si.n. 2014 sAle 550 bAra sa.ngharSha-birati la~Nghana kare sImAnte golAguli cAliya\u093ceChe pAkistAna. natuna baCharera prathama tinadine itimadhye pA\u0901cabAra temana ghaTanA ghaTeChe. 
yadio se-deshera pradhAnamantrI noya\u093cAja sharipha, pratirakShAmantrI khoya\u093cAjA Asipha pramukha baleChena, bhArati binA prarocanAya\u093c hAmalA cAlAcChe.\n" + ] + } + ], + "prompt_number": 31 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x=u'\u0b9a\u0bc6\u0ba9\u0bcd\u0ba9\u0bc8'\n", + "\n", + "for c in x:\n", + " print u'{} {:x}'.format(c,ord(c))" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + "\u0b9a b9a\n", + "\u0bc6 bc6\n", + "\u0ba9 ba9\n", + "\u0bcd bcd\n", + "\u0ba9 ba9\n", + "\u0bc8 bc8\n" + ] + } + ], + "prompt_number": 28 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/indic_nlp_resources/README.md b/indic_nlp_resources/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b6cf442624970e986e38d9a5587ccd4444e9f4fa --- /dev/null +++ b/indic_nlp_resources/README.md @@ -0,0 +1,18 @@ +# Indic NLP Resources + +The toolkit contains resources required by some components of the [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_resources) and other NLP resources for Indian languages. + +If you are looking for any other resources for Indian languages, please check the [Indic NLP Catalog](https://github.com/indicnlpweb/indicnlp_catalog) + +### Indic NLP Library related resources + +- Morphanalyzer models for Indian languages + +### Other NLP Resources +- Transliteration Models for transliteration involving Indian languages and English. 
+ +### Version: 0.2 + +## License + +The models and resources are released under the MIT License diff --git a/indic_nlp_resources/morph/morfessor/bn.model b/indic_nlp_resources/morph/morfessor/bn.model new file mode 100644 index 0000000000000000000000000000000000000000..08a7291a401039a1d0da75c97465ccf68a468cab --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/bn.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec394bfd1f06ea20c7e512595dfb11fa5e81b4cda1a0a8e4ddcca78e5f3988bb +size 5539122 diff --git a/indic_nlp_resources/morph/morfessor/gu.model b/indic_nlp_resources/morph/morfessor/gu.model new file mode 100644 index 0000000000000000000000000000000000000000..95302ae972c25b7c79483b557415c4476ab936ea --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/gu.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ecdf8b85f3a1f57d6bf9f8c3ff3f8aa9b45dd4102a28d193796a66fd06abe87 +size 8308773 diff --git a/indic_nlp_resources/morph/morfessor/hi.model b/indic_nlp_resources/morph/morfessor/hi.model new file mode 100644 index 0000000000000000000000000000000000000000..8cf1d5b6520eece6264e6bfc5bba1d5ddb5ebd63 --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/hi.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a811ed8dd0d9504614a853a1ff0e5c67ec7d22e06621e13e4179c15713c36c65 +size 12283341 diff --git a/indic_nlp_resources/morph/morfessor/kK.model b/indic_nlp_resources/morph/morfessor/kK.model new file mode 100644 index 0000000000000000000000000000000000000000..3dc05cfae0e1caf07c0618a749ed620025e235af --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/kK.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef6a6d7dc934b79c5abde9015044cefb6e8e1738516ddd7383a4c117529cbfd3 +size 2044932 diff --git a/indic_nlp_resources/morph/morfessor/kn.model b/indic_nlp_resources/morph/morfessor/kn.model new file mode 100644 index 
0000000000000000000000000000000000000000..d9ffc04e6836dc0f5f29b47dc6b8fe2371c2368f --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/kn.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaf75d5ff1c6b5dcebdf11207c7eb61b73ea63fe6594ddb06399575c013bc318 +size 41929803 diff --git a/indic_nlp_resources/morph/morfessor/ml.model b/indic_nlp_resources/morph/morfessor/ml.model new file mode 100644 index 0000000000000000000000000000000000000000..e9a9cffbe2f6e512536738a04d032d1445df7b87 --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/ml.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a752ca9381a5a47f2084a32036b0ba932fb8f5886a66d0d143b4e224b87b7ae7 +size 9925713 diff --git a/indic_nlp_resources/morph/morfessor/mr.model b/indic_nlp_resources/morph/morfessor/mr.model new file mode 100644 index 0000000000000000000000000000000000000000..0b06d5c9f12c9309a8e4c10706b97176f8a831e5 --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/mr.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eab8c78fd189f7d3ba8f5a88f6c371ad7cc9e3b87cbc03805c5be3d1c436f211 +size 10937694 diff --git a/indic_nlp_resources/morph/morfessor/pa.model b/indic_nlp_resources/morph/morfessor/pa.model new file mode 100644 index 0000000000000000000000000000000000000000..b2bc74a625c53446c65c9a059399d5f2148f0fb4 --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/pa.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdb76c1e1fe72c811663d857a59af651325ff1925993b540700d92ea8d6b6437 +size 1046145 diff --git a/indic_nlp_resources/morph/morfessor/sa.model b/indic_nlp_resources/morph/morfessor/sa.model new file mode 100644 index 0000000000000000000000000000000000000000..3a445f13eb1e3c7538dadea50fb7b25c0babc587 --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/sa.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:424e92c79a48d68a707f29116bbf4e3125666b1f1684e12980dfcbcc4c9593e1 +size 1619836 diff --git a/indic_nlp_resources/morph/morfessor/ta.model b/indic_nlp_resources/morph/morfessor/ta.model new file mode 100644 index 0000000000000000000000000000000000000000..26dbce60f6ef01c6072d130883d562f836c8fbdf --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/ta.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89b885aa7e26d6bdb52623c83d60b719de256291dd28d7ea1300b93c27d5ebbc +size 40158081 diff --git a/indic_nlp_resources/morph/morfessor/te.model b/indic_nlp_resources/morph/morfessor/te.model new file mode 100644 index 0000000000000000000000000000000000000000..0ba1d9b040a524dc523044fe0213417223a87514 --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/te.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd8a9bcfb35e3fcbc0c79d86ddff4653477a60a9ea5c5cee21e5f0c417670cf7 +size 15880211 diff --git a/indic_nlp_resources/morph/morfessor/ur.model b/indic_nlp_resources/morph/morfessor/ur.model new file mode 100644 index 0000000000000000000000000000000000000000..d9ffc04e6836dc0f5f29b47dc6b8fe2371c2368f --- /dev/null +++ b/indic_nlp_resources/morph/morfessor/ur.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaf75d5ff1c6b5dcebdf11207c7eb61b73ea63fe6594ddb06399575c013bc318 +size 41929803 diff --git a/indic_nlp_resources/script/all_script_phonetic_data.csv b/indic_nlp_resources/script/all_script_phonetic_data.csv new file mode 100644 index 0000000000000000000000000000000000000000..d510a1d0467c77bdb4fa7cd34ef429d54ce1b125 --- /dev/null +++ b/indic_nlp_resources/script/all_script_phonetic_data.csv @@ -0,0 +1,113 @@ +Unicode,Relative Offset,Devanagari,ITRANS,Notes,Valid Vector 
Representation,is_vowel,is_consonant,nukta,halanta,anusvara,misc,short_vowel,long_vowel,weak,medium,strong,independent_vowel,dependent_vowel,plosive,fricative,Central-approximant,Lateral-approximant,flap,velar,palatal,retroflex,dental,labial,aspirated,not_aspirated,voiced,unvoiced,nasal,not_nasal,front,central,back,close,close-mid,open-mid,open,rounded,not_rounded +900,0,ऀ,ऀ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +901,1,ँ,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +902,2,ं,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +903,3,ः,H,Should represent as pure aspiration and not as a vowel,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +904,4,ऄ,ऄ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +905,5,अ,a,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1 +906,6,आ,A,,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1 +907,7,इ,i,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1 +908,8,ई,I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1 +909,9,उ,u,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0 +90a,10,ऊ,uu,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0 +90b,11,ऋ,R^i,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1 +90c,12,ऌ,LLi,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0 +90d,13,ऍ,ऍ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1 +90e,14,ऎ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1 +90f,15,ए,e,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1 +910,16,ऐ,ai,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1 +911,17,ऑ,ऑ,Nasalized 
o,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0 +912,18,ऒ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0 +913,19,ओ,o,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0 +914,20,औ,au,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1 +915,21,क,ka,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0 +916,22,ख,kha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +917,23,ग,ga,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +918,24,घ,gha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0 +919,25,ङ,~Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +91a,26,च,ca,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0 +91b,27,छ,Cha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +91c,28,ज,ja,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +91d,29,झ,jha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0 +91e,30,ञ,JNa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +91f,31,ट,Ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0 +920,32,ठ,Tha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +921,33,ड,Da,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +922,34,ढ,Dha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0 +923,35,ण,Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +924,36,त,ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0 +925,37,थ,tha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 
+926,38,द,da,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +927,39,ध,dha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0 +928,40,न,na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +929,41,ऩ,ऩ,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +92a,42,प,pa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0 +92b,43,फ,pha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +92c,44,ब,ba,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +92d,45,भ,bha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0 +92e,46,म,ma,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +92f,47,य,ya,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +930,48,र,ra,alveolar or dental- approximated by dental/ can also be considered a rhotic consonant (flap ie tap),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +931,49,ऱ,Ra,retroflex (trill),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +932,50,ल,la,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +933,51,ळ,La,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +934,52,ऴ,zha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +935,53,व,va,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +936,54,श,sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +937,55,ष,Sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +938,56,स,sa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +939,57,ह,ha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0 
+93a,58,ऺ,ऺ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +93b,59,ऻ,ऻ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +93c,60,़,़,,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +93d,61,ऽ,.a,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +93e,62,ा,A,,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1 +93f,63,ि,i,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1 +940,64,ी,I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1 +941,65,ु,u,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0 +942,66,ू,uu,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0 +943,67,ृ,R^i,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1 +944,68,ॄ,R^I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1 +945,69,ॅ,ॅ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1 +946,70,ॆ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1 +947,71,े,e,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1 +948,72,ै,ai,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1 +949,73,ॉ,ॉ,Nasalized o,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0 +94a,74,ॊ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0 +94b,75,ो,o,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0 +94c,76,ौ,au,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1 +94d,77,्,,,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +94e,78,ॎ,ॎ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 
+94f,79,ॏ,ॏ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +950,80,ॐ,AUM,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +951,81,॑,॑,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +952,82,॒,॒,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +953,83,॓,॓,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +954,84,॔,॔,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +955,85,ॕ,ॕ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +956,86,ॖ,ॖ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +957,87,ॗ,ॗ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +958,88,क़,क़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +959,89,ख़,ख़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95a,90,ग़,ग़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95b,91,ज़,ज़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95c,92,ड़,ड़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95d,93,ढ़,ढ़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95e,94,फ़,फ़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95f,95,य़,य़,will be decomposed to canonical representation: 
consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +960,96,ॠ,R^I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1 +961,97,ॡ,L^I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0 +962,98,ॢ,LLi,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0 +963,99,ॣ,L^I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0 +964,100,।,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +965,101,॥,..,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +966,102,०,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +967,103,१,1,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +968,104,२,2,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +969,105,३,3,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96a,106,४,4,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96b,107,५,5,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96c,108,६,6,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96d,109,७,7,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96e,110,८,8,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96f,111,९,9,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/indic_nlp_resources/script/all_script_phonetic_data.xlsx b/indic_nlp_resources/script/all_script_phonetic_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..73cc3b86f2267204c0d4acfb47b459996eefc626 Binary files /dev/null and b/indic_nlp_resources/script/all_script_phonetic_data.xlsx differ diff --git a/indic_nlp_resources/script/arpabet.pdf b/indic_nlp_resources/script/arpabet.pdf new 
file mode 100644 index 0000000000000000000000000000000000000000..1c1541aa6f7ff46a255db3dc1f076143a98c24bc Binary files /dev/null and b/indic_nlp_resources/script/arpabet.pdf differ diff --git a/indic_nlp_resources/script/english_arpabet_list.csv b/indic_nlp_resources/script/english_arpabet_list.csv new file mode 100644 index 0000000000000000000000000000000000000000..4417bff0e9abaab8778ed84eafe5b6d99f92e8d4 --- /dev/null +++ b/indic_nlp_resources/script/english_arpabet_list.csv @@ -0,0 +1,46 @@ +AO +AA +IY +UW +EH +IH +UH +AH +AX +AE +EY +AY +OW +AW +OY +P +B +T +D +K +G +CH +JH +F +V +TH +DH +S +Z +SH +ZH +HH +M +EM +N +EN +NG +ENG +L +EL +R +DX +NX +Y +W +Q diff --git a/indic_nlp_resources/script/english_script_phonetic_data.csv b/indic_nlp_resources/script/english_script_phonetic_data.csv new file mode 100644 index 0000000000000000000000000000000000000000..e95432eae86d0fb73ee0f6b0ec4fbe8a7365cdd1 --- /dev/null +++ b/indic_nlp_resources/script/english_script_phonetic_data.csv @@ -0,0 +1,47 @@ +Unicode,Relative Offset,Devanagari,ITRANS,Notes,Valid Vector Representation,is_vowel,is_consonant,nukta,halanta,anusvara,misc,short_vowel,long_vowel,weak,medium,strong,independent_vowel,dependent_vowel,plosive,fricative,Central-approximant,Lateral-approximant,flap,velar,palatal,retroflex,dental,labial,aspirated,not_aspirated,voiced,unvoiced,nasal,not_nasal,front,central,back,close,close-mid,open-mid,open,rounded,not_rounded +900,0,,AO,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0 +901,1,,AA,,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1 +902,2,,IY,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1 +903,3,,UW,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0 +904,4,ए,EH,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1 +905,5,इ,IH,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1 
+906,6,उ,UH,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0 +907,7,अ,AH,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1 +908,8,अ,AX,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1 +909,9,ऍ,AE,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1 +90a,10,,EY,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1 +90b,11,ऐ,AY,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1 +90c,12,ओ,OW,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0 +90d,13,औ,AW,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1 +90e,14,,OY,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0 +90f,15,,P,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0 +910,16,,B,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +911,17,,T,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0 +912,18,,D,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +913,19,,K,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0 +914,20,,G,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +915,21,,CH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0 +916,22,,JH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +917,23,,F,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0 +918,24,,V,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +919,25,,TH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0 +91a,26,,DH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +91b,27,,S,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 
+91c,28,,Z,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +91d,29,,SH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +91e,30,,ZH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0 +91f,31,,HH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +920,32,,M,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +921,33,,EM,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +922,34,,N,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +923,35,,EN,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +924,36,,NG,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +925,37,,ENG,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +926,38,,L,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +927,39,,EL,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +928,40,,R,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +929,41,,DX,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +92a,42,,NX,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +92b,43,,Y,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +92c,44,,W,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +92d,45,,Q,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/indic_nlp_resources/script/english_script_phonetic_data.xlsx b/indic_nlp_resources/script/english_script_phonetic_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..4025b5e0404fbeaa32c469d7ded8669a64e3a7c0 Binary files /dev/null and b/indic_nlp_resources/script/english_script_phonetic_data.xlsx differ diff --git 
a/indic_nlp_resources/script/tamil_script_phonetic_data.csv b/indic_nlp_resources/script/tamil_script_phonetic_data.csv new file mode 100644 index 0000000000000000000000000000000000000000..218ea09e3f79e125774be2bb832ed5395f5bf963 --- /dev/null +++ b/indic_nlp_resources/script/tamil_script_phonetic_data.csv @@ -0,0 +1,113 @@ +Unicode,Relative Offset,Devanagari,ITRANS,Notes,Valid Vector Representation,is_vowel,is_consonant,nukta,halanta,anusvara,misc,short_vowel,long_vowel,weak,medium,strong,independent_vowel,dependent_vowel,plosive,fricative,Central-approximant,Lateral-approximant,flap,velar,palatal,retroflex,dental,labial,aspirated,not_aspirated,voiced,unvoiced,nasal,not_nasal,front,central,back,close,close-mid,open-mid,open,rounded,not_rounded +900,0,ऀ,ऀ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +901,1,ँ,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +902,2,ं,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +903,3,ः,H,Should represent as pure aspiration and not as a vowel,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +904,4,ऄ,ऄ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +905,5,अ,a,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1 +906,6,आ,A,,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1 +907,7,इ,i,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1 +908,8,ई,I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1 +909,9,उ,u,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0 +90a,10,ऊ,uu,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0 +90b,11,ऋ,R^i,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +90c,12,ऌ,LLi,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 
+90d,13,ऍ,ऍ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1 +90e,14,ऎ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1 +90f,15,ए,e,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1 +910,16,ऐ,ai,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1 +911,17,ऑ,ऑ,Nasalized o,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0 +912,18,ऒ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0 +913,19,ओ,o,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0 +914,20,औ,au,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1 +915,21,क,ka,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0 +916,22,ख,kha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +917,23,ग,ga,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +918,24,घ,gha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +919,25,ङ,~Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +91a,26,च,ca,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0 +91b,27,छ,Cha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +91c,28,ज,ja,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +91d,29,झ,jha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +91e,30,ञ,JNa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +91f,31,ट,Ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0 +920,32,ठ,Tha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +921,33,ड,Da,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 
+922,34,ढ,Dha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +923,35,ण,Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +924,36,त,ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0 +925,37,थ,tha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +926,38,द,da,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +927,39,ध,dha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +928,40,न,na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +929,41,ऩ,ऩ,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +92a,42,प,pa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0 +92b,43,फ,pha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +92c,44,ब,ba,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +92d,45,भ,bha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +92e,46,म,ma,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0 +92f,47,य,ya,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +930,48,र,ra,alveolar or dental- approximated by dental/ can also be considered a rhotic consonant (flap ie tap),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +931,49,ऱ,Ra,retroflex (trill),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +932,50,ल,la,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +933,51,ळ,La,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +934,52,ऴ,zha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 +935,53,व,va,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0 
+936,54,श,sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +937,55,ष,Sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +938,56,स,sa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0 +939,57,ह,ha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0 +93a,58,ऺ,ऺ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +93b,59,ऻ,ऻ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +93c,60,़,़,,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +93d,61,ऽ,.a,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +93e,62,ा,A,,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1 +93f,63,ि,i,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1 +940,64,ी,I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1 +941,65,ु,u,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0 +942,66,ू,uu,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0 +943,67,ृ,R^i,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +944,68,ॄ,R^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +945,69,ॅ,ॅ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1 +946,70,ॆ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1 +947,71,े,e,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1 +948,72,ै,ai,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1 +949,73,ॉ,ॉ,Nasalized o,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0 +94a,74,ॊ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0 
+94b,75,ो,o,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0 +94c,76,ौ,au,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1 +94d,77,्,,,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +94e,78,ॎ,ॎ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +94f,79,ॏ,ॏ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +950,80,ॐ,AUM,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +951,81,॑,॑,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +952,82,॒,॒,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +953,83,॓,॓,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +954,84,॔,॔,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +955,85,ॕ,ॕ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +956,86,ॖ,ॖ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +957,87,ॗ,ॗ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +958,88,क़,क़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +959,89,ख़,ख़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95a,90,ग़,ग़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95b,91,ज़,ज़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95c,92,ड़,ड़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95d,93,ढ़,ढ़,will be decomposed 
to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95e,94,फ़,फ़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +95f,95,य़,य़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +960,96,ॠ,R^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1 +961,97,ॡ,L^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0 +962,98,ॢ,LLi,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +963,99,ॣ,L^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +964,100,।,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +965,101,॥,..,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +966,102,०,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +967,103,१,1,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +968,104,२,2,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +969,105,३,3,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96a,106,४,4,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96b,107,५,5,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96c,108,६,6,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96d,109,७,7,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96e,110,८,8,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +96f,111,९,9,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/indic_nlp_resources/script/tamil_script_phonetic_data.xlsx 
b/indic_nlp_resources/script/tamil_script_phonetic_data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..fd2b66c3f877466e21b33fec3923c1f36b21e454 Binary files /dev/null and b/indic_nlp_resources/script/tamil_script_phonetic_data.xlsx differ diff --git a/indic_nlp_resources/transliterate/README.md b/indic_nlp_resources/transliterate/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1f55e11e80f6fc5ebbf42dade0266e3d4ee06ce4 --- /dev/null +++ b/indic_nlp_resources/transliterate/README.md @@ -0,0 +1,45 @@ +# Transliteration Models for Indian languages +These are models for transliteration involving Indian languages. +The models are essentially Statistical Machine Translation systems trained using Moses over a +character-level parallel corpus of transliterations. Hence, you will need Moses to use these transliteration models. +The transliteration corpus has itself been mined in an unsupervised fashion from a translation corpus. + +Currently we have trained transliteration models for five language pairs: bn-hi, ta-hi, te-hi, en-hi and mr-hi + +Support for transliteration was introduced in Moses in version 2.1, +so please ensure that you have Moses version 2.1 or later set up. + +Command to run the transliteration module using Moses: + +$moseshome/mosesdecoder/scripts/Transliteration/post-decoding-transliteration.pl \ +--moses-src-dir $moseshome/mosesdecoder --external-bin-dir $moseshome/tools \ +--transliteration-model-dir {path to transliteration model folder} --oov-file {path to file containing oov words, oovs are space separated with each line containing all oovs for the input line}\ + --input-file {input file to be transliterated} --output-file {output file location} \ + --input-extension {input language code, e.g. en} --output-extension {output language code, e.g. 
hi} --language-model {path to language model} \ + --decoder $moseshome/mosesdecoder/bin/moses + +A sample execution of the model will be as follows: + +export moseshome={path to moses installation} +$moseshome/mosesdecoder/scripts/Transliteration/post-decoding-transliteration.pl \ +--moses-src-dir $moseshome/mosesdecoder --external-bin-dir $moseshome/tools \ +--transliteration-model-dir /home/ratish/project/nlp_resources/indic_nlp_resources/transliterate/en-hi \ +--oov-file /home/ratish/project/translit/input.oov \ + --input-file /home/ratish/project/translit/input.en \ + --output-file /home/ratish/project/translit/output.hi \ + --input-extension en --output-extension hi --language-model /home/ratish/project/translit/lm/nc.binlm.1 \ + --decoder $moseshome/mosesdecoder/bin/moses + +So far, we have seen the use of transliteration as a post-editing step for machine translation. +If the models are needed purely for transliteration, the input file and the OOV file are the same. 
+Sample input file: +New Delhi is capital of India +India is worlds seventh largest nation in the World + +OOV file +New Delhi is capital of India +India is worlds seventh largest nation in the World + +On running the transliteration module, the output is: +न्यू डेल्ही इस कैपिटल आफ इंडिया +इंडिया इस वर्ल्ड सेवंथ लारगेस्ट नेशन इन थे वर्ल्ड diff --git a/indic_nlp_resources/transliterate/bn-hi.zip b/indic_nlp_resources/transliterate/bn-hi.zip new file mode 100644 index 0000000000000000000000000000000000000000..923952c19f7ae5f9712498c353d7ce7c3bf28bfb --- /dev/null +++ b/indic_nlp_resources/transliterate/bn-hi.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc80cbae205f3aab5320099a46737904dd816a84bf4f33fa843f909c678c93f6 +size 8113384 diff --git a/indic_nlp_resources/transliterate/en-hi.zip b/indic_nlp_resources/transliterate/en-hi.zip new file mode 100644 index 0000000000000000000000000000000000000000..efed0f0a1cb7ebed49ce33ee860749fd3dc0a817 --- /dev/null +++ b/indic_nlp_resources/transliterate/en-hi.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57a1b67ce6f709dd79988f5600fafdaeb4e82abb2defcf5ffad8072aab6cab9d +size 4165941 diff --git a/indic_nlp_resources/transliterate/mr-hi.zip b/indic_nlp_resources/transliterate/mr-hi.zip new file mode 100644 index 0000000000000000000000000000000000000000..ffa8cb09603fff405543b25e881ef8cddd14a825 --- /dev/null +++ b/indic_nlp_resources/transliterate/mr-hi.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e22a553878654d5a7f1a550541a940af9e83fe5621aea95edca64eee46f58c0 +size 4044852 diff --git a/indic_nlp_resources/transliterate/offset_itrans_map.csv b/indic_nlp_resources/transliterate/offset_itrans_map.csv new file mode 100644 index 0000000000000000000000000000000000000000..b25eb6c29beb6e3c4471ea1fada44b9d13e5c825 --- /dev/null +++ b/indic_nlp_resources/transliterate/offset_itrans_map.csv @@ -0,0 +1,129 @@ +offset_hex,devnag_char,itrans +0x0,ऀ,ऀ 
+0x1,ँ,ँ +0x2,ं,.m +0x3,ः,H +0x4,ऄ,ऄ +0x5,अ,a +0x6,आ,aa +0x7,इ,i +0x8,ई,ii +0x9,उ,u +0xa,ऊ,uu +0xb,ऋ,R^i +0xc,ऌ,L^i +0xd,ऍ,ऍ +0xe,ऎ,.e +0xf,ए,e +0x10,ऐ,ai +0x11,ऑ,ऑ +0x12,ऒ,.o +0x13,ओ,o +0x14,औ,au +0x15,क,ka +0x16,ख,kha +0x17,ग,ga +0x18,घ,gha +0x19,ङ,~Na +0x1a,च,cha +0x1b,छ,Cha +0x1c,ज,ja +0x1d,झ,jha +0x1e,ञ,~na +0x1f,ट,Ta +0x20,ठ,Tha +0x21,ड,Da +0x22,ढ,Dha +0x23,ण,Na +0x24,त,ta +0x25,थ,tha +0x26,द,da +0x27,ध,dha +0x28,न,na +0x29,ऩ,*na +0x2a,प,pa +0x2b,फ,pha +0x2c,ब,ba +0x2d,भ,bha +0x2e,म,ma +0x2f,य,ya +0x30,र,ra +0x31,ऱ,Ra +0x32,ल,la +0x33,ळ,lda +0x34,ऴ,zha +0x35,व,va +0x36,श,sha +0x37,ष,Sha +0x38,स,sa +0x39,ह,ha +0x3a,ऺ,ऺ +0x3b,ऻ,ऻ +0x3c,़,़ +0x3d,ऽ,.a +0x3e,ा,aa +0x3f,ि,i +0x40,ी,ii +0x41,ु,u +0x42,ू,uu +0x43,ृ,R^i +0x44,ॄ,R^I +0x45,ॅ,ॅ +0x46,ॆ,.e +0x47,े,e +0x48,ै,ai +0x49,ॉ,ॉ +0x4a,ॊ,.o +0x4b,ो,o +0x4c,ौ,au +0x4d,्, +0x4e,ॎ,ॎ +0x4f,ॏ,ॏ +0x50,ॐ,AUM +0x51,॑,॑ +0x52,॒,॒ +0x53,॓,॓ +0x54,॔,॔ +0x55,ॕ,ॕ +0x56,ॖ,ॖ +0x57,ॗ,ॗ +0x58,क़,क़ +0x59,ख़,ख़ +0x5a,ग़,ग़ +0x5b,ज़,ज़ +0x5c,ड़,ड़ +0x5d,ढ़,ढ़ +0x5e,फ़,फ़ +0x5f,य़,य़ +0x60,ॠ,R^I +0x61,ॡ,L^I +0x62,ॢ,L^i +0x63,ॣ,L^I +0x64,।,. +0x65,॥,.. 
+0x66,०,0 +0x67,१,1 +0x68,२,2 +0x69,३,3 +0x6a,४,4 +0x6b,५,5 +0x6c,६,6 +0x6d,७,7 +0x6e,८,8 +0x6f,९,9 +0x70,॰,॰ +0x71,ॱ,ॱ +0x72,ॲ,ॲ +0x73,ॳ,ॳ +0x74,ॴ,ॴ +0x75,ॵ,ॵ +0x76,ॶ,ॶ +0x77,ॷ,ॷ +0x78,ॸ,ॸ +0x79,ॹ,ॹ +0x7a,ॺ,ॺ +0x7b,ॻ,ॻ +0x7c,ॼ,ॼ +0x7d,ॽ,ॽ +0x7e,ॾ,ॾ +0x7f,ॿ,a diff --git a/indic_nlp_resources/transliterate/ta-hi.zip b/indic_nlp_resources/transliterate/ta-hi.zip new file mode 100644 index 0000000000000000000000000000000000000000..ad8c9f9cc6551a7597391286351e4766926c3a00 --- /dev/null +++ b/indic_nlp_resources/transliterate/ta-hi.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f70c35a150ed9f030e54b040d89733aea98d42589d60e379bb533ec49c58625b +size 3181455 diff --git a/indic_nlp_resources/transliterate/te-hi.zip b/indic_nlp_resources/transliterate/te-hi.zip new file mode 100644 index 0000000000000000000000000000000000000000..a792b889a5e995f840dd6c771fb3eb5ca0466aa4 --- /dev/null +++ b/indic_nlp_resources/transliterate/te-hi.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1162ee7713f25abd996e398f7723a84131f7090545e114d8919b17bb2b89009b +size 6785175 diff --git a/indictrans_fairseq_inference.ipynb b/indictrans_fairseq_inference.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c97f32a8984fa6995c8135598064b35817b596ea --- /dev/null +++ b/indictrans_fairseq_inference.ipynb @@ -0,0 +1,843 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "P0uptOB6U7GW", + "outputId": "988c867e-76ee-4a54-a232-e69abbc5c3db" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/content/testing\n" + ] + } + ], + "source": [ + "# create a seperate folder to store everything\n", + "!mkdir testing\n", + "%cd testing" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kQFRiLtSalzt", + "outputId": "03070c7c-8299-46bf-de56-df09c3213a3f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloning into 'indicTrans'...\n", + "remote: Enumerating objects: 398, done.\u001b[K\n", + "remote: Counting objects: 100% (398/398), done.\u001b[K\n", + "remote: Compressing objects: 100% (267/267), done.\u001b[K\n", + "remote: Total 398 (delta 231), reused 251 (delta 126), pack-reused 0\u001b[K\n", + "Receiving objects: 100% (398/398), 1.41 MiB | 6.82 MiB/s, done.\n", + "Resolving deltas: 100% (231/231), done.\n", + "/content/testing/indicTrans\n", + "Cloning into 'indic_nlp_library'...\n", + "remote: Enumerating objects: 1325, done.\u001b[K\n", + "remote: Counting objects: 100% (147/147), done.\u001b[K\n", + "remote: Compressing objects: 100% (103/103), done.\u001b[K\n", + "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n", + "Receiving objects: 100% (1325/1325), 9.57 MiB | 7.40 MiB/s, done.\n", + "Resolving deltas: 100% (688/688), done.\n", + "Cloning into 'indic_nlp_resources'...\n", + "remote: Enumerating objects: 133, done.\u001b[K\n", + "remote: Counting objects: 100% (7/7), done.\u001b[K\n", + "remote: Compressing objects: 100% (7/7), done.\u001b[K\n", + "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n", + "Receiving objects: 100% (133/133), 149.77 MiB | 23.46 MiB/s, done.\n", + "Resolving deltas: 100% (51/51), done.\n", + "Cloning into 'subword-nmt'...\n", + "remote: Enumerating objects: 580, done.\u001b[K\n", + "remote: Counting objects: 100% (4/4), done.\u001b[K\n", + "remote: Compressing objects: 100% (4/4), done.\u001b[K\n", + "remote: Total 580 (delta 0), reused 0 (delta 0), pack-reused 576\u001b[K\n", + "Receiving objects: 100% (580/580), 237.41 KiB | 1.57 MiB/s, done.\n", + "Resolving deltas: 100% (349/349), done.\n", + 
"/content/testing\n" + ] + } + ], + "source": [ + "# clone the repo for running evaluation\n", + "!git clone https://github.com/AI4Bharat/indicTrans.git\n", + "%cd indicTrans\n", + "# clone requirements repositories\n", + "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n", + "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n", + "!git clone https://github.com/rsennrich/subword-nmt.git\n", + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FHUQGCACVvVf", + "outputId": "67c7c3a0-f8bf-46a2-8214-e36556df989b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting sacremoses\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n", + "\u001b[K |████████████████████████████████| 901kB 3.9MB/s \n", + "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n", + "Collecting mock\n", + " Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n", + "Collecting sacrebleu\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n", + "\u001b[K |████████████████████████████████| 61kB 8.3MB/s \n", + "\u001b[?25hCollecting tensorboardX\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/07/84/46421bd3e0e89a92682b1a38b40efc22dafb6d8e3d947e4ceefd4a5fabc7/tensorboardX-2.2-py2.py3-none-any.whl (120kB)\n", + "\u001b[K |████████████████████████████████| 122kB 35.5MB/s \n", + "\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n", + "Collecting 
indic-nlp-library\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n", + "\u001b[K |████████████████████████████████| 40kB 5.8MB/s \n", + "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n", + "Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n", + "Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n", + "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n", + "Collecting portalocker==2.0.0\n", + " Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n", + "Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) (3.12.4)\n", + "Collecting morfessor\n", + " Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n", + "Collecting sphinx-rtd-theme\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n", + "\u001b[K |████████████████████████████████| 9.2MB 
28.0MB/s \n", + "\u001b[?25hCollecting sphinx-argparse\n", + " Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n", + "Collecting docutils<0.17\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n", + "\u001b[K |████████████████████████████████| 552kB 30.6MB/s \n", + "\u001b[?25hRequirement already satisfied: sphinx in /usr/local/lib/python3.7/dist-packages (from sphinx-rtd-theme->indic-nlp-library) (1.8.5)\n", + "Requirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.6.1)\n", + "Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.23.0)\n", + "Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.1.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (20.9)\n", + "Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.0)\n", + "Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (0.7.12)\n", + "Requirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.4)\n", + "Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from 
sphinx->sphinx-rtd-theme->indic-nlp-library) (2.9.1)\n", + "Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.11.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2020.12.5)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.24.3)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.10)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.4.7)\n", + "Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.1.4)\n", + "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.0.1)\n", + "Building wheels for collected packages: sphinx-argparse\n", + " Building wheel for sphinx-argparse (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=d8804d903bcf829240052e806acb7c6051e0c240bddf22ef8bd4e4bd2abdfbac\n", + " Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n", + "Successfully built sphinx-argparse\n", + "\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n", + "Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, morfessor, docutils, sphinx-rtd-theme, sphinx-argparse, indic-nlp-library\n", + " Found existing installation: docutils 0.17.1\n", + " Uninstalling docutils-0.17.1:\n", + " Successfully uninstalled docutils-0.17.1\n", + "Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.2\n", + "Cloning into 'fairseq'...\n", + "remote: Enumerating objects: 28243, done.\u001b[K\n", + "remote: Counting objects: 100% (62/62), done.\u001b[K\n", + "remote: Compressing objects: 100% (39/39), done.\u001b[K\n", + "remote: Total 28243 (delta 29), reused 44 (delta 22), pack-reused 28181\u001b[K\n", + "Receiving objects: 100% (28243/28243), 11.83 MiB | 8.53 MiB/s, done.\n", + "Resolving deltas: 100% (21233/21233), done.\n", + "/content/testing/fairseq\n", + "Obtaining file:///content/testing/fairseq\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Preparing wheel metadata ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.5.1)\n", + "Collecting hydra-core<1.1\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n", + "\u001b[K |████████████████████████████████| 133kB 4.1MB/s \n", + "\u001b[?25hRequirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (2019.12.20)\n", + "Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (0.29.23)\n", + "Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.14.5)\n", + "Collecting omegaconf<2.1\n", + " Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.8.1+cu101)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (4.41.1)\n", + "Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+2fd9d8a) (1.19.5)\n", + "Requirement already satisfied: portalocker==2.0.0 in /usr/local/lib/python3.7/dist-packages (from sacrebleu>=1.4.12->fairseq==1.0.0a0+2fd9d8a) (2.0.0)\n", + "Requirement already satisfied: importlib-resources; python_version < \"3.9\" in /usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+2fd9d8a) (5.1.3)\n", + "Collecting antlr4-python3-runtime==4.8\n", + "\u001b[?25l Downloading 
https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n", + "\u001b[K |████████████████████████████████| 112kB 17.0MB/s \n", + "\u001b[?25hRequirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+2fd9d8a) (2.20)\n", + "Collecting PyYAML>=5.1.*\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n", + "\u001b[K |████████████████████████████████| 645kB 14.1MB/s \n", + "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+2fd9d8a) (3.7.4.3)\n", + "Requirement already satisfied: zipp>=0.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+2fd9d8a) (3.4.1)\n", + "Building wheels for collected packages: antlr4-python3-runtime\n", + " Building wheel for antlr4-python3-runtime (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=f9207fa94682c5ba5daa722d4103f4c9eb131c8dd86870ae9cf43f7df7a90154\n", + " Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n", + "Successfully built antlr4-python3-runtime\n", + "Installing collected packages: PyYAML, omegaconf, antlr4-python3-runtime, hydra-core, fairseq\n", + " Found existing installation: PyYAML 3.13\n", + " Uninstalling PyYAML-3.13:\n", + " Successfully uninstalled PyYAML-3.13\n", + " Running setup.py develop for fairseq\n", + "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n", + "/content/testing\n" + ] + } + ], + "source": [ + "# Install the necessary libraries\n", + "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n", + "# Install fairseq from source\n", + "!git clone https://github.com/pytorch/fairseq.git\n", + "%cd fairseq\n", + "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n", + "!pip install --editable ./\n", + "%cd .." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kKA8afhBawO5", + "outputId": "d346f462-d5d4-43a0-c29b-90aaab2fb4d2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2021-06-09 15:06:00-- https://storage.googleapis.com/samanantar-public/V0.2/models/indic-en.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.188.128, 64.233.189.128, 108.177.97.128, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.188.128|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 4551079075 (4.2G) [application/zip]\n", + "Saving to: ‘indic-en.zip’\n", + "\n", + "indic-en.zip 100%[===================>] 4.24G 49.9MB/s in 1m 47s \n", + "\n", + "2021-06-09 15:07:48 (40.5 MB/s) - ‘indic-en.zip’ saved [4551079075/4551079075]\n", + "\n", + "Archive: indic-en.zip\n", + " creating: indic-en/\n", + " creating: indic-en/vocab/\n", + " inflating: indic-en/vocab/bpe_codes.32k.SRC \n", + " inflating: indic-en/vocab/vocab.SRC \n", + " inflating: indic-en/vocab/vocab.TGT \n", + " inflating: indic-en/vocab/bpe_codes.32k.TGT \n", + " creating: indic-en/final_bin/\n", + " inflating: indic-en/final_bin/dict.TGT.txt \n", + " inflating: indic-en/final_bin/dict.SRC.txt \n", + " creating: indic-en/model/\n", + " inflating: indic-en/model/checkpoint_best.pt \n", + "--2021-06-09 15:09:51-- https://storage.googleapis.com/samanantar-public/V0.2/models/en-indic.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.204.128, 64.233.188.128, 64.233.189.128, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.204.128|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 4609212103 (4.3G) [application/zip]\n", + "Saving to: ‘en-indic.zip’\n", + "\n", + "en-indic.zip 100%[===================>] 4.29G 33.8MB/s in 1m 51s \n", + "\n", + "2021-06-09 15:11:44 (39.5 MB/s) - ‘en-indic.zip’ saved [4609212103/4609212103]\n", + "\n", + "Archive: en-indic.zip\n", + " creating: en-indic/\n", + " creating: en-indic/vocab/\n", + " inflating: en-indic/vocab/bpe_codes.32k.SRC \n", + " inflating: en-indic/vocab/vocab.SRC \n", + " inflating: en-indic/vocab/vocab.TGT \n", + " inflating: en-indic/vocab/bpe_codes.32k.TGT \n", + " creating: en-indic/final_bin/\n", + " inflating: en-indic/final_bin/dict.TGT.txt \n", + " inflating: en-indic/final_bin/dict.SRC.txt \n", + " creating: en-indic/model/\n", + " inflating: en-indic/model/checkpoint_best.pt \n", + "--2021-06-09 15:14:11-- https://storage.googleapis.com/samanantar-public/models/m2m.zip\n", + "Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.23.128, 74.125.203.128, 74.125.204.128, ...\n", + "Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.23.128|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 4081990185 (3.8G) [application/zip]\n", + "Saving to: ‘m2m.zip’\n", + "\n", + "m2m.zip 100%[===================>] 3.80G 41.5MB/s in 96s \n", + "\n", + "2021-06-09 15:15:48 (40.4 MB/s) - ‘m2m.zip’ saved [4081990185/4081990185]\n", + "\n", + "Archive: m2m.zip\n", + " creating: m2m/\n", + " creating: m2m/vocab/\n", + " inflating: m2m/vocab/vocab.SRC \n", + " inflating: m2m/vocab/vocab.TGT \n", + " inflating: m2m/vocab/bpe_codes.32k.SRC_TGT \n", + " creating: m2m/final_bin/\n", + " inflating: m2m/final_bin/dict.TGT.txt \n", + " inflating: m2m/final_bin/dict.SRC.txt \n", + " creating: m2m/model/\n", + " inflating: m2m/model/checkpoint_best.pt \n", + "/content/testing/indicTrans\n" + ] + } + ], + "source": [ + "# download the indictrans model\n", + "\n", + "\n", + "# downloading the indic-en model\n", + "!wget https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip\n", + "!unzip indic-en.zip\n", + "\n", + "# downloading the en-indic model\n", + "!wget https://storage.googleapis.com/samanantar-public/V0.3/models/en-indic.zip\n", + "!unzip en-indic.zip\n", + "\n", + "# downloading the indic-indic model\n", + "!wget https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip\n", + "!unzip m2m.zip\n", + "\n", + "%cd indicTrans/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Lg1sQFfyWJli" + }, + "outputs": [], + "source": [ + "# creating a text file and adding en sentences we can use for testing the model\n", + "!touch en_sentences.txt\n", + "!echo 'This bicycle is too small for you !!' 
>> en_sentences.txt\n", + "!echo \"I will directly meet you at the airport.\" >> en_sentences.txt\n", + "!echo 'If COVID-19 is spreading in your community, stay safe by taking some simple precautions, such as physical distancing, wearing a mask, keeping rooms well ventilated, avoiding crowds, cleaning your hands, and coughing into a bent elbow or tissue' >> en_sentences.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fLg9BWAGWvLU", + "outputId": "f3ca6f65-9a39-4d80-c25d-88806daf3e7b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wed Jun 9 15:18:01 UTC 2021\n", + "Applying normalization and script conversion\n", + "100% 3/3 [00:00<00:00, 71.78it/s]\n", + "Number of sentences in input: 3\n", + "Applying BPE\n", + "Decoding\n", + "Extracting translations, script conversion and detokenization\n", + "Translation completed\n" + ] + } + ], + "source": [ + "# joint_translate takes src_file, output_fname, src_lang, tgt_lang, model_folder as inputs\n", + "# src_file -> input text file to be translated\n", + "# output_fname -> name of the output file (will get created) containing the model predictions\n", + "# src_lang -> source lang code of the input text ( in this case we are using en-indic model and hence src_lang would be 'en')\n", + "# tgt_lang -> target lang code of the input text ( tgt lang for en-indic model would be any of the 11 indic langs we trained on:\n", + "# as, bn, hi, gu, kn, ml, mr, or, pa, ta, te)\n", + "# supported languages are:\n", + "# as - assamese, bn - bengali, gu - gujarathi, hi - hindi, kn - kannada, \n", + "# ml - malayalam, mr - marathi, or - oriya, pa - punjabi, ta - tamil, te - telugu\n", + "\n", + "# model_dir -> the directory containing the model and the vocab files\n", + "\n", + "# Note: if the translation is taking a lot of time, please tune the buffer_size and batch_size parameter for 
fairseq-interactive defined inside this joint_translate script\n", + "\n", + "\n", + "# here we are translating the english sentences to tamil\n", + "!bash joint_translate.sh en_sentences.txt ta_outputs.txt 'en' 'ta' '../en-indic'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8QzkBCgeGZiH", + "outputId": "c150360c-6d01-4689-8c2e-9bdd0eba1504" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "இந்த சைக்கிள் உங்களுக்கு மிகவும் சிறியது!\n", + "விமான நிலையத்தில் உங்களை நேரில் சந்திக்கிறேன்.\n", + "உங்கள் சமூகத்தில் கோவிட்-19 பரவுகிறது என்றால், சில எளிய முன்னெச்சரிக்கை நடவடிக்கைகளான, தனி நபர் இடைவெளி, முகக்கவசம் அணிதல், அறைகளை நன்கு காற்றோட்டமாக வைத்திருத்தல், கூட்டத்தைத் தவிர்த்தல், கைகளைக் கழுவுதல், முழங்கை அல்லது திசுக்களில் இருமல் போன்றவற்றை மேற்கொள்வதன் மூலம் பாதுகாப்பாக இருங்கள்.\n" + ] + } + ], + "source": [ + "!cat ta_outputs.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "c4v9BmbZao5d", + "outputId": "6efac2a3-5f79-4e72-821b-bc80702a7fa8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wed Jun 9 15:21:31 UTC 2021\n", + "Applying normalization and script conversion\n", + "100% 3/3 [00:00<00:00, 88.59it/s]\n", + "Number of sentences in input: 3\n", + "Applying BPE\n", + "Decoding\n", + "Extracting translations, script conversion and detokenization\n", + "Translation completed\n" + ] + } + ], + "source": [ + "# Similarly, we can translate the english sentences to hindi\n", + "!bash joint_translate.sh en_sentences.txt hi_outputs.txt 'en' 'hi' '../en-indic'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pNNzyR_LfqIr", + "outputId": "095b9532-e76a-4451-dec9-4862566a4288" + }, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "यह साइकिल तुम्हारे लिए बहुत छोटी है!\n", + "मैं आपसे एयरपोर्ट पर ही मिलने वाला हूं।\n", + "यदि आपके समुदाय में कोविड-19 फैल रहा है, तो कुछ सरल सावधानियां बरतें, जैसे शारीरिक दूरी बनाए रखना, मास्क पहनना, कमरों को अच्छी तरह से हवादार रखना, भीड़ से बचना, अपने हाथों को साफ करना और कोहनी या ऊतक को मोड़कर खांसते हुए सुरक्षित रहें\n" + ] + } + ], + "source": [ + "!cat hi_outputs.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PzjbDLBtaol9" + }, + "outputs": [], + "source": [ + "# creating a text file and adding hi sentences we can use for testing the model\n", + "!touch hi_sentences.txt\n", + "!echo 'तुम आज सुबह यहाँ क्यों आए?' >> hi_sentences.txt\n", + "!echo \"मेरे परिवार में हर कोई जल्दी उठता है।\" >> hi_sentences.txt\n", + "!echo ' स्वास्थ्य और परिवार कल्याण मंत्रालय द्वारा प्रदान की गई जानकारी और सलाह को सावधानी व सही तरीके से पालन कर वायरस के स्थानीय प्रसार को रोका जा सकता है।' >> hi_sentences.txt\n", + "\n", + "!touch ta_sentences.txt\n", + "!echo 'அவனுக்கு நம்மைப் தெரியும் என்று தோன்றுகிறது' >> ta_sentences.txt\n", + "!echo \"இது எங்கே இருக்கு என்று என்னால் கண்டுபிடிக்க முடியவில்லை.\" >> ta_sentences.txt\n", + "!echo 'உங்களுக்கு உங்கள் அருகில் இருக்கும் ஒருவருக்கோ இத்தகைய அறிகுறிகள் தென்பட்டால், வீட்டிலேயே இருப்பது, கொரோனா வைரஸ் தொற்று பிறருக்கு வராமல் தடுக்க உதவும்.' 
>> ta_sentences.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5uaOmKb8gmeN", + "outputId": "951bbdf9-61d0-4703-a8df-0c3fcb4e5bb3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wed Jun 9 15:24:43 UTC 2021\n", + "Applying normalization and script conversion\n", + "100% 3/3 [00:00<00:00, 74.90it/s]\n", + "Number of sentences in input: 3\n", + "Applying BPE\n", + "Decoding\n", + "Extracting translations, script conversion and detokenization\n", + "Translation completed\n" + ] + } + ], + "source": [ + "# here we are translating the hindi sentences to english\n", + "!bash joint_translate.sh hi_sentences.txt en_outputs.txt 'hi' 'en' '../indic-en'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iLD7WPqmlSnC", + "outputId": "359050fa-6d35-4055-a9c5-13a15322c59e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Why did you come here this morning?\n", + "Everyone in my family gets up early.\n", + "The local spread of the virus can be curbed by following the information and advice provided by the Ministry of Health and Family Welfare in a careful and correct manner.\n" + ] + } + ], + "source": [ + "! 
cat en_outputs.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "O3mJyj-QljWz", + "outputId": "1c0420e5-4b80-41d9-f09e-2fdff79bc7bd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wed Jun 9 15:28:05 UTC 2021\n", + "Applying normalization and script conversion\n", + "100% 3/3 [00:00<00:00, 72.92it/s]\n", + "Number of sentences in input: 3\n", + "Applying BPE\n", + "Decoding\n", + "Extracting translations, script conversion and detokenization\n", + "Translation completed\n" + ] + } + ], + "source": [ + "# here we are translating the tamil sentences to english\n", + "!bash joint_translate.sh ta_sentences.txt en_outputs.txt 'ta' 'en' '../indic-en'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GapEJESiloD8", + "outputId": "dc8b2a8c-4f36-4bf9-d517-6826aa65da57" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "He seems to know us.\n", + "I couldnt find it anywhere.\n", + "If someone in your neighbourhood develops these symptoms, staying at home can help prevent the spread of the coronavirus infection.\n" + ] + } + ], + "source": [ + "! 
cat en_outputs.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ckfW2P6abcB3" + }, + "outputs": [], + "source": [ + "# we just rename the m2m_joint_vocab file here as joint_translate uses bpe_codes.32k.SRC\n", + "mv ../m2m/vocab/bpe_codes.32k.SRC_TGT ../m2m/vocab/bpe_codes.32k.SRC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "H-3vPdCqSWoK", + "outputId": "d5a80c59-cc89-4910-a9ce-7317fac6bf8d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wed Jun 9 15:39:26 UTC 2021\n", + "Applying normalization and script conversion\n", + "100% 3/3 [00:00<00:00, 63.53it/s]\n", + "Number of sentences in input: 3\n", + "Applying BPE\n", + "Decoding\n", + "Extracting translations, script conversion and detokenization\n", + "Translation completed\n" + ] + } + ], + "source": [ + "# here we are using the indic2indic model for translating the hindi sentences to tamil\n", + "!bash joint_translate.sh hi_sentences.txt ta_outputs.txt 'hi' 'ta' '../m2m'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "22yPo78Zb_oR", + "outputId": "4df17e93-9029-4020-8deb-0dbaf8bb0b27" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "तुम आज सुबह यहाँ क्यों आए?\n", + "मेरे परिवार में हर कोई जल्दी उठता है।\n", + " स्वास्थ्य और परिवार कल्याण मंत्रालय द्वारा प्रदान की गई जानकारी और सलाह को सावधानी व सही तरीके से पालन कर वायरस के स्थानीय प्रसार को रोका जा सकता है।\n" + ] + } + ], + "source": [ + " ! 
cat hi_sentences.txt # the hindi inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "onnfzTDESg2I", + "outputId": "1bc600d4-d3ff-40fa-d258-7d1c876bd49c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ஏன் இன்று காலையில் வந்தீர்கள்?\n", + "எனது குடும்பத்தில் உள்ள ஒவ்வொருவரும் விரைவில் எழுவார்கள்.\n", + "மத்திய சுகாதாரம் மற்றும் குடும்ப நல அமைச்சகத்தின் அறிவுறுத்தல்கள் மற்றும் தகவல்களைப் பின்பற்றுவதன் மூலம், உள்ளூர் அளவில் வைரஸ் பரவுவதைத் தடுக்க முடியும்.\n" + ] + } + ], + "source": [ + "! cat ta_outputs.txt # the tamil outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5klOcwi8SjGS", + "outputId": "bc4e47fa-ee1d-4da2-85ea-f7900cae7b48" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wed Jun 9 15:45:53 UTC 2021\n", + "Applying normalization and script conversion\n", + "100% 3/3 [00:00<00:00, 82.25it/s]\n", + "Number of sentences in input: 3\n", + "Applying BPE\n", + "Decoding\n", + "Extracting translations, script conversion and detokenization\n", + "Translation completed\n" + ] + } + ], + "source": [ + "# here we are using the indic2indic model for translating the tamil sentences to hindi (same as above, with the direction reversed)\n", + "!bash joint_translate.sh ta_sentences.txt hi_outputs.txt 'ta' 'hi' '../m2m'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4ifZhGkKc6oo", + "outputId": "a0112e2b-a54b-48ad-e3ae-a3d84c6d097e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "அவனுக்கு நம்மைப் தெரியும் என்று தோன்றுகிறது\n", + "இது எங்கே இருக்கு என்று என்னால் கண்டுபிடிக்க முடியவில்லை.\n", + "உங்களுக்கு உங்கள் அருகில் இருக்கும் ஒருவருக்கோ இத்தகைய அறிகுறிகள் 
தென்பட்டால், வீட்டிலேயே இருப்பது, கொரோனா வைரஸ் தொற்று பிறருக்கு வராமல் தடுக்க உதவும்.\n" + ] + } + ], + "source": [ + "! cat ta_sentences.txt # the tamil inputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "v0x0YrWYSwwK", + "outputId": "4c37d699-5b8e-4ae7-9724-953d7e165035" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ऐसा लगता है कि वह हमें जानता है।\n", + "मुझे पता नहीं था कि यह कहां है।\n", + "अगर आपके आस-पास के किसी व्यक्ति में ऐसे लक्षण दिखाई देते हैं, तो घर पर रहने से कोरोना वायरस को फैलने से रोकने में मदद मिलेगी।\n" + ] + } + ], + "source": [ + "! cat hi_outputs.txt # the hi outputs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-xcnDOc4gNKC" + }, + "outputs": [], + "source": [ + "# to compute bleu scores for the predictions with a reference file, use the following command\n", + "\n", + "# bash compute_bleu.sh pred_fname ref_fname src_lang tgt_lang\n", + "# arguments:\n", + "# pred_fname: file that contains model predictions\n", + "# ref_fname: file that contains references\n", + "# src_lang and tgt_lang : the source and target language" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9YK2BdwvrUgI" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "include_colab_link": true, + "name": "indictrans_fairseq_inference.ipynb", + "provenance": [] + }, + "interpreter": { + "hash": "3c7d4130300118f0c7487d576c6841c0dbbdeec039e1e658ac9b107412a09af0" + }, + "kernelspec": { + "display_name": "Python 3.7.7 64-bit", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/inference/__init__.py b/inference/__init__.py new file mode 100644 index 
def make_batches(
    lines, cfg, task, max_positions, encode_fn, constrainted_decoding=False
):
    """Yield ``Batch`` tuples for ``lines``, batched by the task's iterator.

    Args:
        lines: mutable sequence of input strings. When
            ``constrainted_decoding`` is true, any line containing a tab is
            split on tabs; the first field is written back into ``lines``
            (the caller's list is modified in place) and the remaining
            fields are treated as that line's constraints.
        cfg: config object; only ``cfg.dataset.max_tokens``,
            ``cfg.dataset.batch_size`` and
            ``cfg.dataset.skip_invalid_size_inputs_valid_test`` are read.
        task: object providing ``target_dictionary``,
            ``get_interactive_tokens_and_lengths``,
            ``build_dataset_for_inference`` and ``get_batch_iterator``
            (presumably a fairseq task -- confirm against the caller).
        max_positions: forwarded unchanged to ``task.get_batch_iterator``.
        encode_fn: callable applied to each constraint string before
            dictionary encoding, and also handed to
            ``task.get_interactive_tokens_and_lengths``.
        constrainted_decoding: when true, tab-delimited constraints are
            extracted, encoded and packed; otherwise no constraints are
            passed to the dataset.

    Yields:
        ``Batch(ids, src_tokens, src_lengths, constraints)`` for every batch
        produced by the (unshuffled) epoch iterator.
    """
    constraints_tensor = None

    if constrainted_decoding:
        # First pass: strip the (tab-delimited) constraints, if present, off
        # every input line and remember them per line.
        raw_constraints = []
        for idx, text in enumerate(lines):
            pieces = []
            if "\t" in text:
                lines[idx], *pieces = text.split("\t")
            raw_constraints.append(pieces)

        # Second pass: convert each List[str] into a List[Tensor] using the
        # target-side dictionary, never growing the dictionary.
        batch_constraints = [
            [
                task.target_dictionary.encode_line(
                    encode_fn(phrase),
                    append_eos=False,
                    add_if_not_exist=False,
                )
                for phrase in pieces
            ]
            for pieces in raw_constraints
        ]
        constraints_tensor = pack_constraints(batch_constraints)

    tokens, lengths = task.get_interactive_tokens_and_lengths(lines, encode_fn)
    inference_dataset = task.build_dataset_for_inference(
        tokens, lengths, constraints=constraints_tensor
    )
    epoch_batches = task.get_batch_iterator(
        dataset=inference_dataset,
        max_tokens=cfg.dataset.max_tokens,
        max_sentences=cfg.dataset.batch_size,
        max_positions=max_positions,
        ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
    ).next_epoch_itr(shuffle=False)

    for batch in epoch_batches:
        net_input = batch["net_input"]
        yield Batch(
            ids=batch["id"],
            src_tokens=net_input["src_tokens"],
            src_lengths=net_input["src_lengths"],
            constraints=batch.get("constraints", None),
        )
+ args.source_lang = "SRC" + args.target_lang = "TGT" + # since we are truncating sentences to max_seq_len in engine, we can set it to False here + args.skip_invalid_size_inputs_valid_test = False + + # we have custom architechtures in this folder and we will let fairseq + # import this + args.user_dir = "model_configs" + self.cfg = convert_namespace_to_omegaconf(args) + + utils.import_user_module(self.cfg.common) + + if self.cfg.interactive.buffer_size < 1: + self.cfg.interactive.buffer_size = 1 + if self.cfg.dataset.max_tokens is None and self.cfg.dataset.batch_size is None: + self.cfg.dataset.batch_size = 1 + + assert ( + not self.cfg.generation.sampling + or self.cfg.generation.nbest == self.cfg.generation.beam + ), "--sampling requires --nbest to be equal to --beam" + assert ( + not self.cfg.dataset.batch_size + or self.cfg.dataset.batch_size <= self.cfg.interactive.buffer_size + ), "--batch-size cannot be larger than --buffer-size" + + # Fix seed for stochastic decoding + # if self.cfg.common.seed is not None and not self.cfg.generation.no_seed_provided: + # np.random.seed(self.cfg.common.seed) + # utils.set_torch_seed(self.cfg.common.seed) + + # if not self.constrained_decoding: + # self.use_cuda = torch.cuda.is_available() and not self.cfg.common.cpu + # else: + # self.use_cuda = False + + self.use_cuda = torch.cuda.is_available() and not self.cfg.common.cpu + + # Setup task, e.g., translation + self.task = tasks.setup_task(self.cfg.task) + + # Load ensemble + overrides = ast.literal_eval(self.cfg.common_eval.model_overrides) + self.models, self._model_args = checkpoint_utils.load_model_ensemble( + utils.split_paths(self.cfg.common_eval.path), + arg_overrides=overrides, + task=self.task, + suffix=self.cfg.checkpoint.checkpoint_suffix, + strict=(self.cfg.checkpoint.checkpoint_shard_count == 1), + num_shards=self.cfg.checkpoint.checkpoint_shard_count, + ) + + # Set dictionaries + self.src_dict = self.task.source_dictionary + self.tgt_dict = 
self.task.target_dictionary + + # Optimize ensemble for generation + for model in self.models: + if model is None: + continue + if self.cfg.common.fp16: + model.half() + if ( + self.use_cuda + and not self.cfg.distributed_training.pipeline_model_parallel + ): + model.cuda() + model.prepare_for_inference_(self.cfg) + + # Initialize generator + self.generator = self.task.build_generator(self.models, self.cfg.generation) + + # Handle tokenization and BPE + self.tokenizer = self.task.build_tokenizer(self.cfg.tokenizer) + self.bpe = self.task.build_bpe(self.cfg.bpe) + + # Load alignment dictionary for unknown word replacement + # (None if no unknown word replacement, empty if no path to align dictionary) + self.align_dict = utils.load_align_dict(self.cfg.generation.replace_unk) + + self.max_positions = utils.resolve_max_positions( + self.task.max_positions(), *[model.max_positions() for model in self.models] + ) + + def encode_fn(self, x): + if self.tokenizer is not None: + x = self.tokenizer.encode(x) + if self.bpe is not None: + x = self.bpe.encode(x) + return x + + def decode_fn(self, x): + if self.bpe is not None: + x = self.bpe.decode(x) + if self.tokenizer is not None: + x = self.tokenizer.decode(x) + return x + + def translate(self, inputs, constraints=None): + if self.constrained_decoding and constraints is None: + raise ValueError("Constraints cant be None in constrained decoding mode") + if not self.constrained_decoding and constraints is not None: + raise ValueError("Cannot pass constraints during normal translation") + if constraints: + constrained_decoding = True + modified_inputs = [] + for _input, constraint in zip(inputs, constraints): + modified_inputs.append(_input + f"\t{constraint}") + inputs = modified_inputs + else: + constrained_decoding = False + + start_id = 0 + results = [] + final_translations = [] + for batch in make_batches( + inputs, + self.cfg, + self.task, + self.max_positions, + self.encode_fn, + constrained_decoding, + ): + bsz = 
batch.src_tokens.size(0) + src_tokens = batch.src_tokens + src_lengths = batch.src_lengths + constraints = batch.constraints + if self.use_cuda: + src_tokens = src_tokens.cuda() + src_lengths = src_lengths.cuda() + if constraints is not None: + constraints = constraints.cuda() + + sample = { + "net_input": { + "src_tokens": src_tokens, + "src_lengths": src_lengths, + }, + } + + translations = self.task.inference_step( + self.generator, self.models, sample, constraints=constraints + ) + + list_constraints = [[] for _ in range(bsz)] + if constrained_decoding: + list_constraints = [unpack_constraints(c) for c in constraints] + for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)): + src_tokens_i = utils.strip_pad(src_tokens[i], self.tgt_dict.pad()) + constraints = list_constraints[i] + results.append( + ( + start_id + id, + src_tokens_i, + hypos, + { + "constraints": constraints, + }, + ) + ) + + # sort output to match input order + for id_, src_tokens, hypos, _ in sorted(results, key=lambda x: x[0]): + src_str = "" + if self.src_dict is not None: + src_str = self.src_dict.string( + src_tokens, self.cfg.common_eval.post_process + ) + + # Process top predictions + for hypo in hypos[: min(len(hypos), self.cfg.generation.nbest)]: + hypo_tokens, hypo_str, alignment = utils.post_process_prediction( + hypo_tokens=hypo["tokens"].int().cpu(), + src_str=src_str, + alignment=hypo["alignment"], + align_dict=self.align_dict, + tgt_dict=self.tgt_dict, + remove_bpe="subword_nmt", + extra_symbols_to_ignore=get_symbols_to_strip_from_output( + self.generator + ), + ) + detok_hypo_str = self.decode_fn(hypo_str) + final_translations.append(detok_hypo_str) + return final_translations diff --git a/inference/engine.py b/inference/engine.py new file mode 100644 index 0000000000000000000000000000000000000000..f06fcdc8b6063a5e901684460f5ea7b6168e4490 --- /dev/null +++ b/inference/engine.py @@ -0,0 +1,198 @@ +from os import truncate +from sacremoses import MosesPunctNormalizer 
INDIC = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"]


def split_sentences(paragraph, language):
    """Split ``paragraph`` into a list of sentences.

    English goes through ``MosesSentenceSplitter``; languages listed in
    ``INDIC`` go through ``sentence_tokenize.sentence_split``.

    Raises:
        ValueError: if ``language`` is neither "en" nor in ``INDIC``.
    """
    if language == "en":
        with MosesSentenceSplitter(language) as splitter:
            return splitter([paragraph])
    if language in INDIC:
        return sentence_tokenize.sentence_split(paragraph, lang=language)
    # Fail loudly instead of returning None, which would only surface later
    # as an opaque assertion failure in batch_translate.
    raise ValueError(f"Unsupported language for sentence splitting: {language!r}")


def add_token(sent, tag_infos):
    """Prefix ``sent`` with one special token per entry of ``tag_infos``.

    tag_infos: list of tuples (tag_type, tag)

    Each tag_info results in a token of the form: __{tag_type}__{tag}__
    """
    tokens = ["__" + tag_type + "__" + tag + "__" for tag_type, tag in tag_infos]
    return " ".join(tokens) + " " + sent


def apply_lang_tags(sents, src_lang, tgt_lang):
    """Return the stripped sentences prefixed with src/tgt language tags."""
    return [
        add_token(sent.strip(), [("src", src_lang), ("tgt", tgt_lang)])
        for sent in sents
    ]


def truncate_long_sentences(sents, max_seq_len=200):
    """Cut every sentence down to at most ``max_seq_len`` whitespace tokens.

    Sentences within the limit are returned unchanged; longer ones are
    re-joined with single spaces after truncation and a warning is printed.
    """
    new_sents = []
    for sent in sents:
        words = sent.split()
        if len(words) > max_seq_len:
            print_str = " ".join(words[:5]) + " .... " + " ".join(words[-5:])
            sent = " ".join(words[:max_seq_len])
            print(
                f"WARNING: Sentence {print_str} truncated to {max_seq_len} tokens as it exceeds maximum length limit"
            )
        new_sents.append(sent)
    return new_sents


class Model:
    """Translation pipeline: preprocessing, BPE, tagging, decoding, postprocessing."""

    def __init__(self, expdir):
        """Load vocab, BPE codes and the translator from ``expdir``.

        Reads ``vocab/vocab.SRC`` and ``vocab/bpe_codes.32k.SRC`` and builds a
        ``Translator`` from ``final_bin`` and ``model/checkpoint_best.pt``.
        """
        self.expdir = expdir
        self.en_tok = MosesTokenizer(lang="en")
        self.en_normalizer = MosesPunctNormalizer()
        self.en_detok = MosesDetokenizer(lang="en")
        self.xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        print("Initializing vocab and bpe")
        # Use context managers so the vocab / codes file handles are closed
        # once they have been read.
        with codecs.open(f"{expdir}/vocab/vocab.SRC", encoding="utf-8") as vocab_file:
            self.vocabulary = read_vocabulary(vocab_file, 5)
        with codecs.open(
            f"{expdir}/vocab/bpe_codes.32k.SRC", encoding="utf-8"
        ) as codes_file:
            self.bpe = BPE(codes_file, -1, "@@", self.vocabulary, None)

        print("Initializing model for translation")
        # initialize the model
        self.translator = Translator(
            f"{expdir}/final_bin", f"{expdir}/model/checkpoint_best.pt", batch_size=100
        )

    # translate a batch of sentences from src_lang to tgt_lang
    def batch_translate(self, batch, src_lang, tgt_lang):
        """Translate a list of sentences and return the postprocessed outputs."""
        assert isinstance(batch, list)
        preprocessed_sents = self.preprocess(batch, lang=src_lang)
        bpe_sents = self.apply_bpe(preprocessed_sents)
        tagged_sents = apply_lang_tags(bpe_sents, src_lang, tgt_lang)
        tagged_sents = truncate_long_sentences(tagged_sents)

        translations = self.translator.translate(tagged_sents)
        return self.postprocess(translations, tgt_lang)

    # translate a paragraph from src_lang to tgt_lang
    def translate_paragraph(self, paragraph, src_lang, tgt_lang):
        """Split ``paragraph`` into sentences, translate them, join with spaces."""
        assert isinstance(paragraph, str)
        sents = split_sentences(paragraph, src_lang)
        postprocessed_sents = self.batch_translate(sents, src_lang, tgt_lang)
        return " ".join(postprocessed_sents)

    def preprocess_sent(self, sent, normalizer, lang):
        """Normalize and tokenize one sentence.

        English uses the Moses normalizer/tokenizer. Other languages are
        normalized with ``normalizer``, tokenized, and transliterated from
        ``lang`` to "hi".
        """
        if lang == "en":
            return " ".join(
                self.en_tok.tokenize(
                    self.en_normalizer.normalize(sent.strip()), escape=False
                )
            )
        tokenized = " ".join(
            indic_tokenize.trivial_tokenize(normalizer.normalize(sent.strip()), lang)
        )
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            tokenized, lang, "hi"
        ).replace(" ् ", "्")

    def preprocess(self, sents, lang):
        """Normalize, tokenize and script convert (for Indic) each sentence.

        Returns the list of processed sentences, in input order.
        """
        if lang == "en":
            normalizer = None
        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)

        return [self.preprocess_sent(line, normalizer, lang) for line in tqdm(sents)]

    def postprocess(self, sents, lang, common_lang="hi"):
        """Detokenize translated sentences.

        English output is detokenized with the Moses detokenizer. Other
        languages are transliterated from ``common_lang`` to ``lang`` and
        then detokenized with ``indic_detokenize.trivial_detokenize``.

        Returns the list of postprocessed sentences, in input order.
        """
        if lang == "en":
            return [self.en_detok.detokenize(sent.split(" ")) for sent in sents]
        return [
            indic_detokenize.trivial_detokenize(
                self.xliterator.transliterate(sent, common_lang, lang), lang
            )
            for sent in sents
        ]

    def apply_bpe(self, sents):
        """Apply the loaded BPE model to every sentence."""
        return [self.bpe.process_line(sent) for sent in sents]
+
+ +
+

+ IndicTrans API +

+

+ Real-time Indian Language Text Translation with IndicTrans! +

+ +
+ +
+ +
+ + + +
+ +
+

+ From +

+ +
+

+

+   + To +

+ +
+

+
+ +

+
+

+ +

+

+ +

+
+ +
+
+ +
+
+ +
+
+ +
+ 15% +
+ +

+ +
{{ transcription_time }}
+ + +
+ +
#!/bin/bash
# Translate a raw text file with a trained model.
#
# Usage: joint_translate.sh <infname> <outfname> <src_lang> <tgt_lang> <exp_dir> [ref_fname]
#
# Pipeline: normalization / script conversion -> BPE -> language tags ->
# fairseq-interactive decoding -> postprocessing into <outfname>.
# Intermediate files are written next to <outfname>
# (.norm, ._bpe, .bpe, .log).

if [ "$#" -lt 5 ]; then
    echo "Usage: $0 infname outfname src_lang tgt_lang exp_dir [ref_fname]" >&2
    exit 1
fi

date
infname=$1
outfname=$2
src_lang=$3
tgt_lang=$4
exp_dir=$5
# ref_fname is accepted for backward compatibility; BLEU scoring now lives in
# compute_bleu.sh, so it is not used below.
ref_fname=$6

SRC_PREFIX='SRC'
TGT_PREFIX='TGT'

#`dirname $0`/env.sh
SUBWORD_NMT_DIR='subword-nmt'
model_dir="$exp_dir/model"
data_bin_dir="$exp_dir/final_bin"

### normalization and script conversion

echo "Applying normalization and script conversion"
# The preprocessing script prints the number of input sentences on stdout.
input_size=$(python scripts/preprocess_translate.py "$infname" "$outfname.norm" "$src_lang" true)
echo "Number of sentences in input: $input_size"

### apply BPE to input file

echo "Applying BPE"
python "$SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py" \
    -c "$exp_dir/vocab/bpe_codes.32k.${SRC_PREFIX}" \
    --vocabulary "$exp_dir/vocab/vocab.$SRC_PREFIX" \
    --vocabulary-threshold 5 \
    < "$outfname.norm" \
    > "$outfname._bpe"

### add language tags (see scripts/add_tags_translate.py)

python scripts/add_tags_translate.py "$outfname._bpe" "$outfname.bpe" "$src_lang" "$tgt_lang"

### run decoder

echo "Decoding"

src_input_bpe_fname="$outfname.bpe"
tgt_output_fname="$outfname"
fairseq-interactive "$data_bin_dir" \
    -s "$SRC_PREFIX" -t "$TGT_PREFIX" \
    --distributed-world-size 1 \
    --path "$model_dir/checkpoint_best.pt" \
    --batch-size 64 --buffer-size 2500 --beam 5 --remove-bpe \
    --skip-invalid-size-inputs-valid-test \
    --user-dir model_configs \
    --input "$src_input_bpe_fname" > "$tgt_output_fname.log" 2>&1


echo "Extracting translations, script conversion and detokenization"
# This part reverses the transliteration from Devanagari script to the target
# language and then detokenizes it.
python scripts/postprocess_translate.py "$tgt_output_fname.log" "$tgt_output_fname" "$input_size" "$tgt_lang" true

# BLEU computation was moved to compute_bleu.sh.
echo "Translation completed"
#!/bin/bash
# Learn separate BPE models for the SRC and TGT sides of the training corpus
# and compute the corresponding vocabularies.
#
# Usage: learn_single_bpe.sh <expdir> [num_operations]
#   expdir          experiment directory; reads <expdir>/data/train.{SRC,TGT}
#                   and writes into <expdir>/vocab
#   num_operations  number of BPE merge operations (default: 32000)

expdir=${1:?usage: $0 expdir [num_operations]}  # EXPDIR
num_operations=${2:-32000}

#`dirname $0`/env.sh
SUBWORD_NMT_DIR="subword-nmt"
data_dir="$expdir/data"
train_file="$data_dir/train"

echo "Input file: $train_file"

mkdir -p "$expdir/vocab"

echo "learning source BPE"

python "$SUBWORD_NMT_DIR/subword_nmt/learn_bpe.py" \
    --input "$train_file.SRC" \
    -s "$num_operations" \
    -o "$expdir/vocab/bpe_codes.32k.SRC" \
    --num-workers -1

echo "learning target BPE"
python "$SUBWORD_NMT_DIR/subword_nmt/learn_bpe.py" \
    --input "$train_file.TGT" \
    -s "$num_operations" \
    -o "$expdir/vocab/bpe_codes.32k.TGT" \
    --num-workers -1

echo "computing SRC vocab"
python "$SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py" \
    -c "$expdir/vocab/bpe_codes.32k.SRC" \
    --num-workers -1 \
    -i "$train_file.SRC" | \
python "$SUBWORD_NMT_DIR/subword_nmt/get_vocab.py" \
    > "$expdir/vocab/vocab.tmp.SRC"
python scripts/clean_vocab.py "$expdir/vocab/vocab.tmp.SRC" "$expdir/vocab/vocab.SRC"
rm "$expdir/vocab/vocab.tmp.SRC"

echo "computing TGT vocab"
python "$SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py" \
    -c "$expdir/vocab/bpe_codes.32k.TGT" \
    --num-workers -1 \
    -i "$train_file.TGT" | \
python "$SUBWORD_NMT_DIR/subword_nmt/get_vocab.py" \
    > "$expdir/vocab/vocab.tmp.TGT"
python scripts/clean_vocab.py "$expdir/vocab/vocab.tmp.TGT" "$expdir/vocab/vocab.TGT"
rm "$expdir/vocab/vocab.tmp.TGT"
b/legacy/apply_bpe_test_valid_notag.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +expdir=$1 # EXPDIR +org_data_dir=$2 +langs=$3 + +#`dirname $0`/env.sh +SUBWORD_NMT_DIR="subword-nmt" +echo "Apply to each language" + +for dset in `echo test dev` +do + echo $dset + + in_dset_dir="$org_data_dir/$dset" + out_dset_dir="$expdir/bpe/$dset" + + for lang in $langs + do + + echo Apply BPE for $dset "-" $lang + + mkdir -p $out_dset_dir + + python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \ + -c $expdir/vocab/bpe_codes.32k.SRC_TGT \ + --vocabulary $expdir/vocab/vocab.SRC \ + --vocabulary-threshold 5 \ + < $in_dset_dir/$dset.$lang \ + > $out_dset_dir/$dset.$lang + + done +done diff --git a/legacy/apply_bpe_train_notag.sh b/legacy/apply_bpe_train_notag.sh new file mode 100644 index 0000000000000000000000000000000000000000..fa24a57dc2a8b26eed1aae66793f9a65c2712e26 --- /dev/null +++ b/legacy/apply_bpe_train_notag.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +expdir=$1 # EXPDIR + +#`dirname $0`/env.sh +SUBWORD_NMT_DIR="subword-nmt" + +data_dir="$expdir/data" +train_file=$data_dir/train +bpe_file=$expdir/bpe/train/train + +mkdir -p $expdir/bpe/train + +echo "Apply to SRC corpus" + +python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \ + -c $expdir/vocab/bpe_codes.32k.SRC_TGT \ + --vocabulary $expdir/vocab/vocab.SRC \ + --vocabulary-threshold 5 \ + --num-workers "-1" \ + < $train_file.SRC \ + > $bpe_file.SRC + +echo "Apply to TGT corpus" + +python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \ + -c $expdir/vocab/bpe_codes.32k.SRC_TGT \ + --vocabulary $expdir/vocab/vocab.TGT \ + --vocabulary-threshold 5 \ + --num-workers "-1" \ + < $train_file.TGT \ + > $bpe_file.TGT + diff --git a/legacy/env.sh b/legacy/env.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c9611b0d11e821bdb17b612b64c3d14e208cc74 --- /dev/null +++ b/legacy/env.sh @@ -0,0 +1,17 @@ + +export SRC='' + +## Python env directory where fairseq is installed +export PYTHON_ENV='' + +export SUBWORD_NMT_DIR='' +export 
INDIC_RESOURCES_PATH='' +export INDIC_NLP_HOME='' + +export CUDA_HOME='' + +export PATH=$CUDA_HOME/bin:$INDIC_NLP_HOME:$PATH +export LD_LIBRARY_PATH=$CUDA_HOME/lib64 + +# set environment variable to control GPUS visible to the application +#export CUDA_VISIBLE_DEVICES="' diff --git a/legacy/indictrans_workflow.ipynb b/legacy/indictrans_workflow.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7e11ad28f9f788c5311d60d44f360a215a1da8d9 --- /dev/null +++ b/legacy/indictrans_workflow.ipynb @@ -0,0 +1,643 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import random\n", + "from tqdm.notebook import tqdm\n", + "from sacremoses import MosesPunctNormalizer\n", + "from sacremoses import MosesTokenizer\n", + "from sacremoses import MosesDetokenizer\n", + "from collections import defaultdict\n", + "import sacrebleu" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The path to the local git repo for Indic NLP library\n", + "INDIC_NLP_LIB_HOME=\"\"\n", + "\n", + "# The path to the local git repo for Indic NLP Resources\n", + "INDIC_NLP_RESOURCES=\"\"\n", + "\n", + "import sys\n", + "sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))\n", + "\n", + "from indicnlp import common\n", + "common.set_resources_path(INDIC_NLP_RESOURCES)\n", + "\n", + "from indicnlp import loader\n", + "loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import indicnlp\n", + "from indicnlp.tokenize import indic_tokenize\n", + "from indicnlp.tokenize import indic_detokenize\n", + "from indicnlp.normalize import indic_normalize\n", + "from indicnlp.transliterate import unicode_transliterate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "LANGS=[\n", + " \"bn\",\n", + " 
\"gu\",\n", + " \"hi\",\n", + " \"kn\",\n", + " \"ml\",\n", + " \"mr\",\n", + " \"or\",\n", + " \"pa\",\n", + " \"ta\",\n", + " \"te\", \n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess(infname,outfname,lang):\n", + " \"\"\"\n", + " Preparing each corpus file: \n", + " - Normalization\n", + " - Tokenization \n", + " - Script coversion to Devanagari for Indic scripts\n", + " \"\"\"\n", + " \n", + " ### reading \n", + " with open(infname,'r',encoding='utf-8') as infile, \\\n", + " open(outfname,'w',encoding='utf-8') as outfile:\n", + " \n", + " if lang=='en':\n", + " en_tok=MosesTokenizer(lang='en')\n", + " en_normalizer = MosesPunctNormalizer()\n", + " for line in tqdm(infile): \n", + " outline=' '.join(\n", + " en_tok.tokenize( \n", + " en_normalizer.normalize(line.strip()), \n", + " escape=False ) )\n", + " outfile.write(outline+'\\n')\n", + " \n", + " else:\n", + " normfactory=indic_normalize.IndicNormalizerFactory()\n", + " normalizer=normfactory.get_normalizer(lang)\n", + " for line in tqdm(infile): \n", + " outline=unicode_transliterate.UnicodeIndicTransliterator.transliterate(\n", + " ' '.join(\n", + " indic_tokenize.trivial_tokenize(\n", + " normalizer.normalize(line.strip()), lang) ), lang, 'hi').replace(' ् ','्')\n", + "\n", + "\n", + " outfile.write(outline+'\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def add_token(sent, tag_infos):\n", + " \"\"\" add special tokens specified by tag_infos to each element in list\n", + "\n", + " tag_infos: list of tuples (tag_type,tag)\n", + "\n", + " each tag_info results in a token of the form: __{tag_type}__{tag}__\n", + "\n", + " \"\"\"\n", + "\n", + " tokens=[]\n", + " for tag_type, tag in tag_infos:\n", + " token = '__' + tag_type + '__' + tag + '__'\n", + " tokens.append(token)\n", + "\n", + " return ' '.join(tokens) + ' ' + sent \n", + "\n", + 
"\n", + "def concat_data(data_dir, outdir, lang_pair_list, out_src_lang='SRC', out_trg_lang='TGT'):\n", + " \"\"\"\n", + " data_dir: input dir, contains directories for language pairs named l1-l2\n", + " \"\"\"\n", + " os.makedirs(outdir,exist_ok=True)\n", + "\n", + " out_src_fname='{}/train.{}'.format(outdir,out_src_lang)\n", + " out_trg_fname='{}/train.{}'.format(outdir,out_trg_lang)\n", + "# out_meta_fname='{}/metadata.txt'.format(outdir)\n", + "\n", + " print()\n", + " print(out_src_fname)\n", + " print(out_trg_fname)\n", + "# print(out_meta_fname)\n", + "\n", + " ### concatenate train data \n", + " if os.path.isfile(out_src_fname):\n", + " os.unlink(out_src_fname)\n", + " if os.path.isfile(out_trg_fname):\n", + " os.unlink(out_trg_fname)\n", + "# if os.path.isfile(out_meta_fname):\n", + "# os.unlink(out_meta_fname)\n", + "\n", + " for src_lang, trg_lang in tqdm(lang_pair_list):\n", + " print('src: {}, tgt:{}'.format(src_lang,trg_lang)) \n", + "\n", + " in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)\n", + " in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)\n", + "\n", + " print(in_src_fname)\n", + " os.system('cat {} >> {}'.format(in_src_fname,out_src_fname))\n", + "\n", + " print(in_trg_fname)\n", + " os.system('cat {} >> {}'.format(in_trg_fname,out_trg_fname)) \n", + " \n", + " \n", + "# with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile: \n", + "# lpfile.write('\\n'.join( [ '-'.join(x) for x in lang_pair_list ] ))\n", + " \n", + " corpus_stats(data_dir, outdir, lang_pair_list)\n", + " \n", + "def corpus_stats(data_dir, outdir, lang_pair_list):\n", + " \"\"\"\n", + " data_dir: input dir, contains directories for language pairs named l1-l2\n", + " \"\"\"\n", + "\n", + " with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile: \n", + "\n", + " for src_lang, trg_lang in tqdm(lang_pair_list):\n", + " print('src: {}, tgt:{}'.format(src_lang,trg_lang)) \n", + 
"\n", + " in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)\n", + " # in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)\n", + "\n", + " print(in_src_fname)\n", + " corpus_size=0\n", + " with open(in_src_fname,'r',encoding='utf-8') as infile:\n", + " corpus_size=sum(map(lambda x:1,infile))\n", + " \n", + " lpfile.write('{}\\t{}\\t{}\\n'.format(src_lang,trg_lang,corpus_size))\n", + " \n", + "def generate_lang_tag_iterator(infname):\n", + " with open(infname,'r',encoding='utf-8') as infile:\n", + " for line in infile:\n", + " src,tgt,count=line.strip().split('\\t')\n", + " count=int(count)\n", + " for _ in range(count):\n", + " yield (src,tgt) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#### directory containing all experiments \n", + "## one directory per experiment \n", + "EXPBASEDIR=''\n", + "\n", + "### directory containing data\n", + "## contains 3 directories: train test dev\n", + "## train directory structure: \n", + "## - There is one directory for each language pair\n", + "## - Directory naming convention lang1-lang2 (you need another directory/softlink for lang2-lang1)\n", + "## - Each directory contains 6 files: {train,test,dev}.{lang1,lang2}\n", + "## test & dev directory structure \n", + "## - test: contains files {test.l1,test.l2,test.l3} - assumes parallel test files like the wat2021 dataset\n", + "## - valid: contains files {dev.l1,dev.l2,dev.l3} - assumes parallel test files like the wat2021 dataset\n", + "## All files are tokenized\n", + "ORG_DATA_DIR='{d}/consolidated_unique_preprocessed'.format(d=BASEDIR)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exp2 (M2O)\n", + "\n", + "- All *-en " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Params**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": 
[ + "expname='exp2_m2o_baseline'\n", + "expdir='{}/{}'.format(EXPBASEDIR,expname)\n", + "\n", + "lang_pair_list=[]\n", + "for lang in LANGS: \n", + " lang_pair_list.append([lang,'en'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Create Train Corpus**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "indir='{}/train'.format(ORG_DATA_DIR)\n", + "outdir='{}/data'.format(expdir)\n", + "\n", + "# print(lang_pair_list)\n", + "concat_data(indir,outdir,lang_pair_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Learn BPE**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!echo ./learn_bpe.sh {expdir}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!echo ./apply_bpe_train_notag.sh {expdir}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!echo ./apply_bpe_test_valid_notag.sh {expdir} {ORG_DATA_DIR} {'\"'+' '.join(LANGS+['en'])+'\"'}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Add language tags to train**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset='train' \n", + "\n", + "src_fname='{expdir}/bpe/train/{dset}.SRC'.format(expdir=expdir,dset=dset)\n", + "tgt_fname='{expdir}/bpe/train/{dset}.TGT'.format(expdir=expdir,dset=dset)\n", + "meta_fname='{expdir}/data/lang_pairs.txt'.format(expdir=expdir,dset=dset)\n", + " \n", + "out_src_fname='{expdir}/final/{dset}.SRC'.format(expdir=expdir,dset=dset)\n", + "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(expdir=expdir,dset=dset)\n", + "\n", + "lang_tag_iterator=generate_lang_tag_iterator(meta_fname)\n", + "\n", + "print(expdir)\n", + 
"os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n", + "\n", + "with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n", + " open(tgt_fname,'r',encoding='utf-8') as tgtfile, \\\n", + " open(out_src_fname,'w',encoding='utf-8') as outsrcfile, \\\n", + " open(out_tgt_fname,'w',encoding='utf-8') as outtgtfile: \n", + "\n", + " for (l1,l2), src_sent, tgt_sent in tqdm(zip(lang_tag_iterator, srcfile, tgtfile)):\n", + " outsrcfile.write(add_token(src_sent.strip(),[('src',l1),('tgt',l2)]) + '\\n' )\n", + " outtgtfile.write(tgt_sent.strip()+'\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Add language tags to valid**\n", + "\n", + "- add language tags, create parallel corpus\n", + "- sample 20\\% for validation set \n", + "- Create final validation set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset='dev' \n", + "out_src_fname='{expdir}/final/{dset}.SRC'.format(\n", + " expdir=expdir,dset=dset)\n", + "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(\n", + " expdir=expdir,dset=dset)\n", + "\n", + "os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n", + "\n", + "print('Processing validation files') \n", + "consolidated_dset=[]\n", + "for l1, l2 in tqdm(lang_pair_list):\n", + " src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n", + " expdir=expdir,dset=dset,lang=l1)\n", + " tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n", + " expdir=expdir,dset=dset,lang=l2)\n", + "# print(src_fname)\n", + "# print(os.path.exists(src_fname))\n", + " with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n", + " open(tgt_fname,'r',encoding='utf-8') as tgtfile:\n", + " for src_sent, tgt_sent in zip(srcfile,tgtfile):\n", + " consolidated_dset.append(\n", + " ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),\n", + " tgt_sent.strip() )\n", + " )\n", + "\n", + "print('Create validation set') \n", + 
"random.shuffle(consolidated_dset)\n", + "final_set=consolidated_dset[:len(consolidated_dset)//5] \n", + "\n", + "print('Original set size: {}'.format(len(consolidated_dset))) \n", + "print('Sampled set size: {}'.format(len(final_set))) \n", + "\n", + "print('Write validation set')\n", + "\n", + "with open(out_src_fname,'w',encoding='utf-8') as srcfile, \\\n", + " open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:\n", + " for src_sent, tgt_sent in final_set: \n", + " srcfile.write(src_sent+'\\n')\n", + " tgtfile.write(tgt_sent+'\\n')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Add language tags to test**\n", + "\n", + "- add language tags, create parallel corpus all M2O language pairs \n", + "- Create final test set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset='test' \n", + "out_src_fname='{expdir}/final/{dset}.SRC'.format(\n", + " expdir=expdir,dset=dset)\n", + "out_tgt_fname='{expdir}/final/{dset}.TGT'.format(\n", + " expdir=expdir,dset=dset)\n", + "\n", + "os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n", + "\n", + "print('Processing test files') \n", + "consolidated_dset=[]\n", + "for l1, l2 in tqdm(lang_pair_list):\n", + " src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n", + " expdir=expdir,dset=dset,lang=l1)\n", + " tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n", + " expdir=expdir,dset=dset,lang=l2)\n", + "# print(src_fname)\n", + "# print(os.path.exists(src_fname))\n", + " with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n", + " open(tgt_fname,'r',encoding='utf-8') as tgtfile:\n", + " for src_sent, tgt_sent in zip(srcfile,tgtfile):\n", + " consolidated_dset.append(\n", + " ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),\n", + " tgt_sent.strip() )\n", + " )\n", + "\n", + "print('Final set size: {}'.format(len(consolidated_dset))) \n", + " \n", + "print('Write test set')\n", + "print('testset 
truncated')\n", + "\n", + "with open(out_src_fname,'w',encoding='utf-8') as srcfile, \\\n", + " open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:\n", + " for lno, (src_sent, tgt_sent) in enumerate(consolidated_dset,1):\n", + " \n", + " s=src_sent.strip().split(' ')\n", + " t=tgt_sent.strip().split(' ')\n", + " \n", + " if len(s) > 200 or len(t) > 200:\n", + " print('exp: {}, pair: ({},{}), lno: {}: lens: ({},{})'.format(expname,l1,l2,lno,len(s),len(t))) \n", + " \n", + " src_sent=' '.join( s[:min(len(s),200)] )\n", + " tgt_sent=' '.join( t[:min(len(t),200)] )\n", + " \n", + " srcfile.write(src_sent+'\\n')\n", + " tgtfile.write(tgt_sent+'\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Binarize data**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!echo ./binarize_training_exp.sh {expdir} SRC TGT" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Training Command**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash \n", + "\n", + "python train.py {expdir}/final_bin \\\n", + " --arch transformer \\\n", + " --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1.0 \\\n", + " --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \\\n", + " --dropout 0.2 \\\n", + " --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \\\n", + " --max-tokens 8192 \\\n", + " --max-update 1000000 \\\n", + " --max-source-positions 200 \\\n", + " --max-target-positions 200 \\\n", + " --tensorboard-logdir {expdir}/tensorboard \\\n", + " --save-dir {expdir}/model \\\n", + " --required-batch-size-multiple 8 \\\n", + " --save-interval 1 \\\n", + " --keep-last-epochs 5 \\\n", + " --patience 5 \\\n", + " --fp16" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Cleanup**" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# os.unlink('{}')\n", + "\n", + "to_delete=[\n", + " '{expdir}/data/train.SRC'.format(expdir=expdir,dset=dset),\n", + " '{expdir}/data/train.TGT'.format(expdir=expdir,dset=dset),\n", + " '{expdir}/bpe/train/train.SRC'.format(expdir=expdir,dset=dset),\n", + " '{expdir}/bpe/train/train.TGT'.format(expdir=expdir,dset=dset),\n", + "]\n", + "\n", + "for fname in to_delete:\n", + " os.unlink(fname)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Evaluation**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dset='test' \n", + "consolidated_testoutput_fname='{expdir}/evaluations/test/default/test.SRC_TGT.TGT'.format(expdir=expdir)\n", + "consolidated_testoutput_log_fname='{}.log'.format(consolidated_testoutput_fname)\n", + "metrics_fname='{expdir}/evaluations/test/default/test.metrics.tsv'.format(expdir=expdir)\n", + " \n", + "test_set_size=2390\n", + "\n", + "consolidated_testoutput=[]\n", + "with open(consolidated_testoutput_log_fname,'r',encoding='utf-8') as hypfile:\n", + " consolidated_testoutput= list(map(lambda x: x.strip(), filter(lambda x: x.startswith('H-'),hypfile) ))\n", + " consolidated_testoutput.sort(key=lambda x: int(x.split('\\t')[0].split('-')[1]))\n", + " consolidated_testoutput=[ x.split('\\t')[2] for x in consolidated_testoutput ]\n", + "\n", + "os.makedirs('{expdir}/evaluations/test/default'.format(expdir=expdir),exist_ok=True)\n", + "\n", + "with open(consolidated_testoutput_fname,'w',encoding='utf-8') as finalhypfile:\n", + " for sent in consolidated_testoutput:\n", + " finalhypfile.write(sent+'\\n')\n", + "\n", + "print('Processing test files') \n", + "with open(metrics_fname,'w',encoding='utf-8') as metrics_file: \n", + " for i, (l1, l2) in enumerate(tqdm(lang_pair_list)):\n", + "\n", + " start=i*test_set_size\n", + " end=(i+1)*test_set_size\n", + " 
hyps=consolidated_testoutput[start:end]\n", + " ref_fname='{expdir}/{dset}/{dset}.{lang}'.format(\n", + " expdir=ORG_DATA_DIR,dset=dset,lang=l2)\n", + "\n", + " refs=[]\n", + " with open(ref_fname,'r',encoding='utf-8') as reffile:\n", + " refs.extend(map(lambda x:x.strip(),reffile))\n", + "\n", + " assert(len(hyps)==len(refs))\n", + "\n", + " bleu=sacrebleu.corpus_bleu(hyps,[refs],tokenize='none')\n", + "\n", + " print('{} {} {} {}'.format(l1,l2,bleu.score,bleu.prec_str))\n", + " metrics_file.write('{}\\t{}\\t{}\\t{}\\t{}\\n'.format(expname,l1,l2,bleu.score,bleu.prec_str))\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + }, + "toc": { + "base_numbering": 1, + "nav_menu": { + "height": "243.993px", + "width": "160px" + }, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/legacy/install_fairseq.sh b/legacy/install_fairseq.sh new file mode 100644 index 0000000000000000000000000000000000000000..275ab9574dabcd293a553dd50e46288d33025e7a --- /dev/null +++ b/legacy/install_fairseq.sh @@ -0,0 +1,45 @@ +#NVIDIA CUDA download +wget "https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux" +wget "http://developer.download.nvidia.com/compute/cuda/10.0/Prod/patches/1/cuda_10.0.130.1_linux.run" + +## do not install drivers (See this: https://docs.nvidia.com/deploy/cuda-compatibility/index.html) +sudo sh "cuda_10.0.130_410.48_linux" +sudo sh 
"cuda_10.0.130.1_linux.run" + +#Set environment variables +export CUDA_HOME=/usr/local/cuda-10.0 +export PATH=$CUDA_HOME/bin:$PATH +export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH + +# Install pytorch 1.2 +python3 -m venv pytorch1.2 +source pytorch1.2/bin/activate +which pip3 +pip3 install torch==1.2.0 torchvision==0.4.0 + +# Install nccl +git clone https://github.com/NVIDIA/nccl.git +cd nccl +make src.build CUDA_HOME=$CUDA_HOME +sudo apt install build-essential devscripts debhelper fakeroot +make pkg.debian.build CUDA_HOME=$CUDA_HOME +sudo dpkg -i build/pkg/deb/libnccl2_2.7.8-1+cuda10.0_amd64.deb +sudo dpkg -i build/pkg/deb/libnccl-dev_2.7.8-1+cuda10.0_amd64.deb +sudo apt-get install -f + +# Install Apex +git clone https://github.com/NVIDIA/apex +cd apex +pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" \ + --global-option="--deprecated_fused_adam" --global-option="--xentropy" \ + --global-option="--fast_multihead_attn" ./ + +# Install PyArrow +pip install pyarrow + +# Install fairseq +pip install --editable ./ + +# Install other dependencies +pip install sacrebleu +pip install tensorboardX --user diff --git a/legacy/run_inference.sh b/legacy/run_inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..ff582a6c49d015cf36c82e8f20a755f6d1418ed8 --- /dev/null +++ b/legacy/run_inference.sh @@ -0,0 +1,80 @@ +src_lang=${1:-hi} +tgt_lang=${2:-en} +bucket_path=${3:-gs://ai4b-anuvaad-nmt/baselines/transformer-base/baselines-${src_lang}-${tgt_lang}} + +expdir=../baselines/baselines-${src_lang}-${tgt_lang} + +if [[ -d $expdir ]] +then + echo "$expdir exists on your filesystem. 
Please delete this if you have made some changes to the bucket files and trying to redownload" +else + mkdir -p $expdir + mkdir -p $expdir/model + cd ../baselines + gsutil -m cp -r $bucket_path/vocab $expdir + gsutil -m cp -r $bucket_path/final_bin $expdir + gsutil -m cp $bucket_path/model/checkpoint_best.pt $expdir/model + cd ../indicTrans +fi + + +if [ $src_lang == 'hi' ] || [ $tgt_lang == 'hi' ]; then + #TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 sap-documentation-benchmark all) + TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018 wmt-news ) +elif [ $src_lang == 'ta' ] || [ $tgt_lang == 'ta' ]; then + # TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 all) + TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018 wmt-news ufal-ta) +elif [ $src_lang == 'bn' ] || [ $tgt_lang == 'bn' ]; then + # TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal tico19 all) + TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018) +elif [ $src_lang == 'gu' ] || [ $tgt_lang == 'gu' ]; then + # TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest all) + TEST_SETS=( wat2021-devtest wat2020-devtest wmt-news ) +elif [ $src_lang == 'as' ] || [ $tgt_lang == 'as' ]; then + TEST_SETS=( pmi ) +elif [ $src_lang == 'kn' ] || [ $tgt_lang == 'kn' ]; then + # TEST_SETS=( wat2021-devtest anuvaad-legal all) + TEST_SETS=( wat2021-devtest ) +elif [ $src_lang == 'ml' ] || [ $tgt_lang == 'ml' ]; then + # TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all) + TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018) +elif [ $src_lang == 'mr' ] || [ $tgt_lang == 'mr' ]; then + # TEST_SETS=( wat2021-devtest wat2020-devtest all) + TEST_SETS=( wat2021-devtest wat2020-devtest ) +elif [ $src_lang == 'or' ] || [ $tgt_lang == 'or' ]; then + TEST_SETS=( wat2021-devtest ) +elif [ $src_lang == 'pa' ] || [ $tgt_lang == 'pa' ]; then + TEST_SETS=( wat2021-devtest ) +elif [ $src_lang == 'te' ] || [ $tgt_lang == 'te' ]; then + # TEST_SETS=( 
wat2021-devtest wat2020-devtest anuvaad-legal all ) + TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018) +fi + +if [ $src_lang == 'en' ]; then + indic_lang=$tgt_lang +else + indic_lang=$src_lang +fi + + +for tset in ${TEST_SETS[@]};do + echo $tset $src_lang $tgt_lang + if [ $tset == 'wat2021-devtest' ]; then + SRC_FILE=${expdir}/benchmarks/$tset/test.$src_lang + REF_FILE=${expdir}/benchmarks/$tset/test.$tgt_lang + else + SRC_FILE=${expdir}/benchmarks/$tset/en-${indic_lang}/test.$src_lang + REF_FILE=${expdir}/benchmarks/$tset/en-${indic_lang}/test.$tgt_lang + fi + RESULTS_DIR=${expdir}/results/$tset + + mkdir -p $RESULTS_DIR + + bash translate.sh $SRC_FILE $RESULTS_DIR/${src_lang}-${tgt_lang} $src_lang $tgt_lang $expdir $REF_FILE + # for newline between different outputs + echo +done +# send the results to the bucket +gsutil -m cp -r $expdir/results $bucket_path +# clear up the space in the instance +# rm -r $expdir \ No newline at end of file diff --git a/legacy/run_joint_inference.sh b/legacy/run_joint_inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..bf4668c9ecb6b1a1ef9b9b7871c6ee22d7865c0b --- /dev/null +++ b/legacy/run_joint_inference.sh @@ -0,0 +1,74 @@ +src_lang=${1:-en} +tgt_lang=${2:-indic} +bucket_path=${3:-gs://ai4b-anuvaad-nmt/models/transformer-4x/indictrans-${src_lang}-${tgt_lang}} + +mkdir -p ../baselines +expdir=../baselines/baselines-${src_lang}-${tgt_lang} + +if [[ -d $expdir ]] +then + echo "$expdir exists on your filesystem." +else + cd ../baselines + mkdir -p baselines-${src_lang}-${tgt_lang}/model + mkdir -p baselines-${src_lang}-${tgt_lang}/final_bin + cd baselines-${src_lang}-${tgt_lang}/model + gsutil -m cp $bucket_path/model/checkpoint_best.pt . + cd .. + gsutil -m cp $bucket_path/vocab . 
+ gsutil -m cp $bucket_path/final_bin/dict.* final_bin + cd ../indicTrans +fi + + + + + +if [ $src_lang == 'hi' ] || [ $tgt_lang == 'hi' ]; then + TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 sap-documentation-benchmark all) +elif [ $src_lang == 'ta' ] || [ $tgt_lang == 'ta' ]; then + TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 all) +elif [ $src_lang == 'bn' ] || [ $tgt_lang == 'bn' ]; then + TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal tico19 all) +elif [ $src_lang == 'gu' ] || [ $tgt_lang == 'gu' ]; then + TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest all) +elif [ $src_lang == 'as' ] || [ $tgt_lang == 'as' ]; then + TEST_SETS=( all ) +elif [ $src_lang == 'kn' ] || [ $tgt_lang == 'kn' ]; then + TEST_SETS=( wat2021-devtest anuvaad-legal all) +elif [ $src_lang == 'ml' ] || [ $tgt_lang == 'ml' ]; then + TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all) +elif [ $src_lang == 'mr' ] || [ $tgt_lang == 'mr' ]; then + TEST_SETS=( wat2021-devtest wat2020-devtest all) +elif [ $src_lang == 'or' ] || [ $tgt_lang == 'or' ]; then + TEST_SETS=( all ) +elif [ $src_lang == 'pa' ] || [ $tgt_lang == 'pa' ]; then + TEST_SETS=( all ) +elif [ $src_lang == 'te' ] || [ $tgt_lang == 'te' ]; then + TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all ) +fi + +if [ $src_lang == 'en' ]; then + indic_lang=$tgt_lang +else + indic_lang=$src_lang +fi + + +for tset in ${TEST_SETS[@]};do + echo $tset $src_lang $tgt_lang + if [ $tset == 'wat2021-devtest' ]; then + SRC_FILE=${expdir}/devtest/$tset/test.$src_lang + REF_FILE=${expdir}/devtest/$tset/test.$tgt_lang + else + SRC_FILE=${expdir}/devtest/$tset/en-${indic_lang}/test.$src_lang + REF_FILE=${expdir}/devtest/$tset/en-${indic_lang}/test.$tgt_lang + fi + RESULTS_DIR=${expdir}/results/$tset + + mkdir -p $RESULTS_DIR + + bash joint_translate.sh $SRC_FILE $RESULTS_DIR/${src_lang}-${tgt_lang} $src_lang $tgt_lang $expdir $REF_FILE + # for newline between 
different outputs + echo +done diff --git a/legacy/tpu_training_instructions.md b/legacy/tpu_training_instructions.md new file mode 100644 index 0000000000000000000000000000000000000000..41c9092811f50188c21b459c3033a59d769be8c8 --- /dev/null +++ b/legacy/tpu_training_instructions.md @@ -0,0 +1,92 @@ +## Instructions to run on Google cloud TPUs +Before starting these steps, make sure to prepare the dataset (normalization -> bpe -> .. -> binarization) following the steps in indicTrans workflow or do these steps on a cpu instance before launching the tpu instance (to save time and costs) + +### Creating TPU instance + +- Create a cpu instance on gcp with `torch-xla` image like: +```bash +gcloud compute --project=${PROJECT_ID} instances create \ + --zone= \ + --machine-type=n1-standard-16 \ + --image-family=torch-xla \ + --image-project=ml-images \ + --boot-disk-size=200GB \ + --scopes=https://www.googleapis.com/auth/cloud-platform +``` +- Once the instance is created, Launch a Cloud TPU (from your cpu vm instance) using the following command (you can change the `accelerator_type` according to your needs): +```bash +gcloud compute tpus create \ +--zone= \ +--network=default \ +--version=pytorch-1.7 \ +--accelerator-type=v3-8 +``` + (or) +Create a new tpu using the GUI in https://console.cloud.google.com/compute/tpus and make sure to select `version` as `pytorch 1.7`. 
+ +- Once the tpu is launched, identify its ip address: +```bash +# you can run this inside cpu instance and note down the IP address which is located under the NETWORK_ENDPOINTS column +gcloud compute tpus list --zone=us-central1-a +``` + (or) +Go to https://console.cloud.google.com/compute/tpus and note down the IP address for the created TPU from the `internal ip` column + +### Installing Fairseq, getting data on the cpu instance + +- Activate the `torch xla 1.7` conda environment and install necessary libs for IndicTrans (**Excluding FairSeq**): +```bash +conda activate torch-xla-1.7 +pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow +``` +- Configure environment variables for TPU: +```bash +export TPU_IP_ADDRESS=ip-address; \ +export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470" +``` +- Download the prepared binarized data for FairSeq + +- Clone the latest version of Fairseq (this supports tpu) and install from source. There is an [issue](https://github.com/pytorch/fairseq/issues/3259) with the latest commit and hence we use a different commit to install from source (This may have been fixed in the latest master but we have not tested it.) 
+```bash +git clone https://github.com/pytorch/fairseq.git +git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d +pip install --editable ./ +``` + +- Start TPU training +```bash +# this is for using all tpu cores +export MKL_SERVICE_FORCE_INTEL=1 + +fairseq-train {expdir}/exp2_m2o_baseline/final_bin \ +--max-source-positions=200 \ +--max-target-positions=200 \ +--max-update=1000000 \ +--save-interval=5 \ +--arch=transformer \ +--attention-dropout=0.1 \ +--criterion=label_smoothed_cross_entropy \ +--source-lang=SRC \ +--lr-scheduler=inverse_sqrt \ +--skip-invalid-size-inputs-valid-test \ +--target-lang=TGT \ +--label-smoothing=0.1 \ +--update-freq=1 \ +--optimizer adam \ +--adam-betas '(0.9, 0.98)' \ +--warmup-init-lr 1e-07 \ +--lr 0.0005 \ +--warmup-updates 4000 \ +--dropout 0.2 \ +--weight-decay 0.0 \ +--tpu \ +--distributed-world-size 8 \ +--max-tokens 8192 \ +--num-batch-buckets 8 \ +--tensorboard-logdir {expdir}/exp2_m2o_baseline/tensorboard \ +--save-dir {expdir}/exp2_m2o_baseline/model \ +--keep-last-epochs 5 \ +--patience 5 +``` + +**Note** While training, we noticed that training was slower on TPUs than on multiple GPUs. We have documented some issues and [filed an issue](https://github.com/pytorch/fairseq/issues/3317) at the fairseq repo for advice. We'll update this section as we learn more about efficient training on TPUs. Also feel free to open an issue/pull request if you find a bug or know an efficient method to make the code train faster on TPUs. 
diff --git a/legacy/translate.sh b/legacy/translate.sh new file mode 100644 index 0000000000000000000000000000000000000000..d0526d75dce51208e51de9e8de6d35302466c12c --- /dev/null +++ b/legacy/translate.sh @@ -0,0 +1,70 @@ +#!/bin/bash +echo `date` +infname=$1 +outfname=$2 +src_lang=$3 +tgt_lang=$4 +exp_dir=$5 +ref_fname=$6 + +if [ $src_lang == 'en' ]; then + SRC_PREFIX='TGT' + TGT_PREFIX='SRC' +else + SRC_PREFIX='SRC' + TGT_PREFIX='TGT' +fi + +#`dirname $0`/env.sh +SUBWORD_NMT_DIR='subword-nmt' +model_dir=$exp_dir/model +data_bin_dir=$exp_dir/final_bin + +### normalization and script conversion + +echo "Applying normalization and script conversion" +input_size=`python preprocess_translate.py $infname $outfname.norm $src_lang` +echo "Number of sentences in input: $input_size" + +### apply BPE to input file + +echo "Applying BPE" +python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \ + -c $exp_dir/vocab/bpe_codes.32k.${SRC_PREFIX}_${TGT_PREFIX} \ + --vocabulary $exp_dir/vocab/vocab.$SRC_PREFIX \ + --vocabulary-threshold 5 \ + < $outfname.norm \ + > $outfname.bpe + +# not needed for joint training +# echo "Adding language tags" +# python add_tags_translate.py $outfname._bpe $outfname.bpe $src_lang $tgt_lang + +### run decoder + +echo "Decoding" + +src_input_bpe_fname=$outfname.bpe +tgt_output_fname=$outfname +fairseq-interactive $data_bin_dir \ + -s $SRC_PREFIX -t $TGT_PREFIX \ + --distributed-world-size 1 \ + --path $model_dir/checkpoint_best.pt \ + --batch-size 64 --buffer-size 2500 --beam 5 --remove-bpe \ + --skip-invalid-size-inputs-valid-test \ + --input $src_input_bpe_fname > $tgt_output_fname.log 2>&1 + + +echo "Extracting translations, script conversion and detokenization" +python postprocess_translate.py $tgt_output_fname.log $tgt_output_fname $input_size $tgt_lang +if [ $src_lang == 'en' ]; then + # indicnlp tokenize the output files before evaluation + input_size=`python preprocess_translate.py $ref_fname $ref_fname.tok $tgt_lang` + input_size=`python 
preprocess_translate.py $tgt_output_fname $tgt_output_fname.tok $tgt_lang` + sacrebleu --tokenize none $ref_fname.tok < $tgt_output_fname.tok +else + # indic to en models + sacrebleu $ref_fname < $tgt_output_fname +fi +echo `date` +echo "Translation completed" diff --git a/model_configs/__init__.py b/model_configs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2ec41f7daeb7930e9df766abdd790c4c5b09b6d9 --- /dev/null +++ b/model_configs/__init__.py @@ -0,0 +1 @@ +from . import custom_transformer \ No newline at end of file diff --git a/model_configs/custom_transformer.py b/model_configs/custom_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b122e1bf5c81534aae35bb6235c1feaf45b7bada --- /dev/null +++ b/model_configs/custom_transformer.py @@ -0,0 +1,38 @@ +from fairseq.models import register_model_architecture +from fairseq.models.transformer import base_architecture + + +@register_model_architecture("transformer", "transformer_2x") +def transformer_big(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + base_architecture(args) + + +@register_model_architecture("transformer", "transformer_4x") +def transformer_huge(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1536) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + 
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1536) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + base_architecture(args) + + +@register_model_architecture("transformer", "transformer_9x") +def transformer_xlarge(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 2048) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 8192) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 2048) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 8192) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + base_architecture(args) diff --git a/prepare_data.sh b/prepare_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..0db64ee9966dc1c8d90209b8b7c3d8e842c8c200 --- /dev/null +++ b/prepare_data.sh @@ -0,0 +1,71 @@ +exp_dir=$1 +src_lang=$2 +tgt_lang=$3 +train_data_dir=${4:-"$exp_dir/$src_lang-$tgt_lang"} +devtest_data_dir=${5:-"$exp_dir/devtest/all/$src_lang-$tgt_lang"} + +echo "Running experiment ${exp_dir} on ${src_lang} to ${tgt_lang}" + +train_processed_dir=$exp_dir/data +devtest_processed_dir=$exp_dir/data + +out_data_dir=$exp_dir/final_bin + +mkdir -p $train_processed_dir +mkdir -p $devtest_processed_dir +mkdir -p $out_data_dir + +# train preprocessing +train_infname_src=$train_data_dir/train.$src_lang +train_infname_tgt=$train_data_dir/train.$tgt_lang +train_outfname_src=$train_processed_dir/train.SRC +train_outfname_tgt=$train_processed_dir/train.TGT +echo "Applying normalization and script conversion for train" +input_size=`python scripts/preprocess_translate.py $train_infname_src $train_outfname_src $src_lang` +input_size=`python scripts/preprocess_translate.py $train_infname_tgt 
$train_outfname_tgt $tgt_lang` +echo "Number of sentences in train: $input_size" + +# dev preprocessing +dev_infname_src=$devtest_data_dir/dev.$src_lang +dev_infname_tgt=$devtest_data_dir/dev.$tgt_lang +dev_outfname_src=$devtest_processed_dir/dev.SRC +dev_outfname_tgt=$devtest_processed_dir/dev.TGT +echo "Applying normalization and script conversion for dev" +input_size=`python scripts/preprocess_translate.py $dev_infname_src $dev_outfname_src $src_lang` +input_size=`python scripts/preprocess_translate.py $dev_infname_tgt $dev_outfname_tgt $tgt_lang` +echo "Number of sentences in dev: $input_size" + +# test preprocessing +test_infname_src=$devtest_data_dir/test.$src_lang +test_infname_tgt=$devtest_data_dir/test.$tgt_lang +test_outfname_src=$devtest_processed_dir/test.SRC +test_outfname_tgt=$devtest_processed_dir/test.TGT +echo "Applying normalization and script conversion for test" +input_size=`python scripts/preprocess_translate.py $test_infname_src $test_outfname_src $src_lang` +input_size=`python scripts/preprocess_translate.py $test_infname_tgt $test_outfname_tgt $tgt_lang` +echo "Number of sentences in test: $input_size" + +echo "Learning bpe. 
This will take a very long time depending on the size of the dataset" +echo `date` +# learn bpe for preprocessed_train files +bash learn_bpe.sh $exp_dir +echo `date` + +echo "Applying bpe" +bash apply_bpe_traindevtest_notag.sh $exp_dir + +mkdir -p $exp_dir/final + +# this is only required for joint training +# echo "Adding language tags" +# python scripts/add_tags_translate.py $outfname._bpe $outfname.bpe $src_lang $tgt_lang + +# this is an important step if you are training with tpu and using num_batch_buckets +# the current implementation does not remove outliers before bucketing and hence +# removing these large sentences ourselves helps with getting better buckets +python scripts/remove_large_sentences.py $exp_dir/bpe/train.SRC $exp_dir/bpe/train.TGT $exp_dir/final/train.SRC $exp_dir/final/train.TGT +python scripts/remove_large_sentences.py $exp_dir/bpe/dev.SRC $exp_dir/bpe/dev.TGT $exp_dir/final/dev.SRC $exp_dir/final/dev.TGT +python scripts/remove_large_sentences.py $exp_dir/bpe/test.SRC $exp_dir/bpe/test.TGT $exp_dir/final/test.SRC $exp_dir/final/test.TGT + +echo "Binarizing data" +bash binarize_training_exp.sh $exp_dir SRC TGT diff --git a/prepare_data_joint_training.sh b/prepare_data_joint_training.sh new file mode 100644 index 0000000000000000000000000000000000000000..9b355b4775785ff32b20bedb4eab1411d07ac504 --- /dev/null +++ b/prepare_data_joint_training.sh @@ -0,0 +1,110 @@ +exp_dir=$1 +src_lang=$2 +tgt_lang=$3 +train_data_dir=${4:-"$exp_dir"} +devtest_data_dir=${5:-"$exp_dir/devtest/all"} + +echo "Running experiment ${exp_dir} on ${src_lang} to ${tgt_lang}" + + +train_processed_dir=$exp_dir/data +devtest_processed_dir=$exp_dir/data + +out_data_dir=$exp_dir/final_bin + +mkdir -p $train_processed_dir +mkdir -p $devtest_processed_dir +mkdir -p $out_data_dir +langs=(as bn hi gu kn ml mr or pa ta te) + +for lang in ${langs[@]};do + if [ $src_lang == en ]; then + tgt_lang=$lang + else + src_lang=$lang + fi + + train_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang + 
devtest_norm_dir=$exp_dir/norm/$src_lang-$tgt_lang + mkdir -p $train_norm_dir + mkdir -p $devtest_norm_dir + + # train preprocessing + train_infname_src=$train_data_dir/en-${lang}/train.$src_lang + train_infname_tgt=$train_data_dir/en-${lang}/train.$tgt_lang + train_outfname_src=$train_norm_dir/train.$src_lang + train_outfname_tgt=$train_norm_dir/train.$tgt_lang + echo "Applying normalization and script conversion for train" + # this is for preprocessing text and in for indic langs, we convert all scripts to devnagiri + input_size=`python scripts/preprocess_translate.py $train_infname_src $train_outfname_src $src_lang true` + input_size=`python scripts/preprocess_translate.py $train_infname_tgt $train_outfname_tgt $tgt_lang true` + echo "Number of sentences in train: $input_size" + + # dev preprocessing + dev_infname_src=$devtest_data_dir/en-${lang}/dev.$src_lang + dev_infname_tgt=$devtest_data_dir/en-${lang}/dev.$tgt_lang + dev_outfname_src=$devtest_norm_dir/dev.$src_lang + dev_outfname_tgt=$devtest_norm_dir/dev.$tgt_lang + echo "Applying normalization and script conversion for dev" + input_size=`python scripts/preprocess_translate.py $dev_infname_src $dev_outfname_src $src_lang true` + input_size=`python scripts/preprocess_translate.py $dev_infname_tgt $dev_outfname_tgt $tgt_lang true` + echo "Number of sentences in dev: $input_size" + + # test preprocessing + test_infname_src=$devtest_data_dir/en-${lang}/test.$src_lang + test_infname_tgt=$devtest_data_dir/en-${lang}/test.$tgt_lang + test_outfname_src=$devtest_norm_dir/test.$src_lang + test_outfname_tgt=$devtest_norm_dir/test.$tgt_lang + echo "Applying normalization and script conversion for test" + input_size=`python scripts/preprocess_translate.py $test_infname_src $test_outfname_src $src_lang true` + input_size=`python scripts/preprocess_translate.py $test_infname_tgt $test_outfname_tgt $tgt_lang true` + echo "Number of sentences in test: $input_size" +done +# this concatenates lang pair data and creates text 
files to keep track of number of lines in each lang pair. +# this is imp as for joint training, we will merge all the lang pairs and the indivitual lang lines info +# would be required for adding specific lang tags later. + +# the outputs of these scripts will be text file like this: +# +# lang1-lang2 n1 +# lang1-lang3 n2 + +python scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data $src_lang $tgt_lang 'train' +python scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data $src_lang $tgt_lang 'dev' +python scripts/concat_joint_data.py $exp_dir/norm $exp_dir/data $src_lang $tgt_lang 'test' + +# echo "Learning bpe. This will take a very long time depending on the size of the dataset" +echo `date` +# # learn bpe for preprocessed_train files +# for creating joint_vocab use this +# bash learn_bpe.sh $exp_dir + +# for sep vocab use this +bash learn_single_bpe.sh $exp_dir +echo `date` + +# echo "Applying bpe" +# apply the learnt bpe to the data +bash apply_bpe_traindevtest_notag.sh $exp_dir + +mkdir -p $exp_dir/final + +# # this is only required for joint training +# we apply language tags to the bpe segmented data +# +# if we are translating lang1 to lang2 then will become __src__ __tgt__ +echo "Adding language tags" +python scripts/add_joint_tags_translate.py $exp_dir 'train' +python scripts/add_joint_tags_translate.py $exp_dir 'dev' +python scripts/add_joint_tags_translate.py $exp_dir 'test' + +# # this is important step if you are training with tpu and using num_batch_buckets +# # the currnet implementation does not remove outliers before bucketing and hence +# # removing these large sentences ourselves helps with getting better buckets +# python scripts/remove_large_sentences.py $exp_dir/bpe/train.SRC $exp_dir/bpe/train.TGT $exp_dir/final/train.SRC $exp_dir/final/train.TGT +# python scripts/remove_large_sentences.py $exp_dir/bpe/dev.SRC $exp_dir/bpe/dev.TGT $exp_dir/final/dev.SRC $exp_dir/final/dev.TGT +# python scripts/remove_large_sentences.py 
def generate_lang_tag_iterator(infname):
    """Yield one ``(src, tgt)`` language pair per sentence of the joint corpus.

    infname: tab-separated metadata file whose lines have the form
             ``src<TAB>tgt<TAB>count``

    For every metadata line the pair ``(src, tgt)`` is yielded ``count``
    times, in file order, so the iterator can be zipped with the
    concatenated corpus line by line.
    """
    with open(infname, 'r', encoding='utf-8') as meta_file:
        for record in meta_file:
            src_lang, tgt_lang, num_sents = record.strip().split('\t')
            pair = (src_lang, tgt_lang)
            # lazily repeat the pair; counts can be in the millions
            yield from (pair for _ in range(int(num_sents)))
def add_token(sent, tag_infos):
    """ prepend special tokens specified by tag_infos to a sentence

    sent: the sentence (string) to be tagged
    tag_infos: list of tuples (tag_type,tag)

    each tag_info results in a token of the form: __{tag_type}__{tag}__
    the tokens are joined by single spaces and separated from the
    sentence by one space (also when tag_infos is empty)

    """
    prefix = ' '.join(
        ''.join(('__', tag_type, '__', tag, '__'))
        for tag_type, tag in tag_infos
    )
    return prefix + ' ' + sent
def clean_vocab(in_vocab_fname, out_vocab_fname):
    """Copy only the well-formed entries of a vocabulary file.

    in_vocab_fname: input vocabulary, one entry per line
    out_vocab_fname: output file receiving the accepted lines unchanged

    A line is accepted when, after trimming carriage returns, newlines and
    spaces from both ends, it splits on single spaces into exactly two
    fields. Every rejected line is printed with its 0-based index, followed
    by each of its characters and that character's code point in hex.
    """
    with codecs.open(in_vocab_fname, "r", encoding="utf-8") as vocab_in:
        with codecs.open(out_vocab_fname, "w", encoding="utf-8") as vocab_out:
            for line_no, entry in enumerate(vocab_in):
                if len(entry.strip("\r\n ").split(" ")) == 2:
                    vocab_out.write(entry)
                    continue

                # malformed entry: dump it character by character
                print("{}: {}".format(line_no, entry.strip()))
                for ch in entry:
                    print("{}:{}".format(ch, hex(ord(ch))))
def corpus_stats(data_dir, outdir, lang_pair_list, split):
    """
    Write the number of sentences per language pair for one data split.

    data_dir: input dir, contains directories for language pairs named l1-l2
    outdir: directory in which `{split}_lang_pairs.txt` is written
    lang_pair_list: (src_lang, trg_lang) pairs, in concatenation order
    split: name of the split, used in the input and output file names

    Each output line has the form `src<TAB>trg<TAB>count`, where count is
    the number of lines in the source-side file of that pair.
    """

    with open('{}/{}_lang_pairs.txt'.format(outdir, split), 'w', encoding='utf-8') as lpfile:

        for src_lang, trg_lang in tqdm(lang_pair_list):
            print('src: {}, tgt:{}'.format(src_lang, trg_lang))

            in_src_fname = '{}/{}-{}/{}.{}'.format(
                data_dir, src_lang, trg_lang, split, src_lang)
            in_trg_fname = '{}/{}-{}/{}.{}'.format(
                data_dir, src_lang, trg_lang, split, trg_lang)

            # concat_data skips a pair when either side is missing, so the
            # stats must skip it too; otherwise the counts no longer line
            # up with the concatenated corpus.
            if not (os.path.exists(in_src_fname)
                    and os.path.exists(in_trg_fname)):
                continue

            print(in_src_fname)
            with open(in_src_fname, 'r', encoding='utf-8') as infile:
                corpus_size = sum(1 for _ in infile)

            lpfile.write('{}\t{}\t{}\n'.format(
                src_lang, trg_lang, corpus_size))
def extract_non_english_pairs(indir, outdir, LANGS, multiple_translations=False):
    """
    Extracts non-english pair parallel corpora by pivoting through English

    indir: contains english centric data in the following form:
            - directory named en-xx for language xx
            - each directory contains a train.en and train.xx
    outdir: output directory to store mined data for each pair.
            One directory is created for each pair.
    LANGS: list of languages in the corpus (other than English).
            The language codes must correspond to the ones used in the
            files and directories in indir. Preferably, sort the languages
            in this list in alphabetic order. outdir will contain data for xx-yy,
            but not for yy-xx, so it will be convenient to have this list in sorted order.
    multiple_translations: if False (default), an English sentence that occurs
            several times in the en-lang1 corpus contributes only its last
            lang1 translation. If True, every lang1 translation is paired
            with the lang2 sentence.

    For every English sentence found in both en-lang1 and en-lang2, the lang1
    and lang2 sides are written on the same line number of
    outdir/lang1-lang2/train.lang1 and train.lang2.
    """

    for i in tqdm(range(len(LANGS) - 1)):
        print()
        for j in range(i + 1, len(LANGS)):
            lang1 = LANGS[i]
            lang2 = LANGS[j]
            print("{} {}".format(lang1, lang2))

            fname1 = "{}/en-{}/train.en".format(indir, lang1)
            fname2 = "{}/en-{}/train.en".format(indir, lang2)
            il_fname1 = "{}/en-{}/train.{}".format(indir, lang1, lang1)
            il_fname2 = "{}/en-{}/train.{}".format(indir, lang2, lang2)

            # English sentences present in both English-centric corpora
            common_en_set = set(read_file(fname1)).intersection(read_file(fname2))

            # English sentence -> list of lang1 translations. In the default
            # mode the list always has one element (the last one seen).
            en_lang1_dict = {}
            for en_line, il_line in zip(read_file(fname1), read_file(il_fname1)):
                if en_line not in common_en_set:
                    continue
                if multiple_translations:
                    en_lang1_dict.setdefault(en_line, []).append(il_line)
                else:
                    en_lang1_dict[en_line] = [il_line]

            os.makedirs("{}/{}-{}".format(outdir, lang1, lang2), exist_ok=True)
            out_l1_fname = "{o}/{l1}-{l2}/train.{l1}".format(
                o=outdir, l1=lang1, l2=lang2
            )
            out_l2_fname = "{o}/{l1}-{l2}/train.{l2}".format(
                o=outdir, l1=lang1, l2=lang2
            )

            with open(out_l1_fname, "w", encoding="utf-8") as out_l1_file, open(
                out_l2_fname, "w", encoding="utf-8"
            ) as out_l2_file:
                for en_line, il_line in zip(read_file(fname2), read_file(il_fname2)):
                    # Iterate over the stored translations, never over the
                    # characters of a translation string: each pair must be
                    # written exactly once per translation.
                    for lang1_line in en_lang1_dict.get(en_line, ()):
                        out_l1_file.write(lang1_line + "\n")
                        out_l2_file.write(il_line + "\n")
def postprocess(
    infname, outfname, input_size, lang, common_lang="hi", transliterate=False
):
    """
    parse fairseq interactive output, convert script back to native Indic script (in case of Indic languages) and detokenize.

    infname: fairseq log file
    outfname: output file of translation (sentences for which the log has no
              hypothesis are written as empty lines)
    input_size: expected number of output sentences
    lang: language
    common_lang: script the hypotheses are in when transliterate is True
    transliterate: if True, convert Indic output from common_lang script to
                   the script of lang before detokenizing

    Only lines starting with 'H-' are read; they are expected to look like
    'H-<sentence id><TAB><score><TAB><hypothesis>'. A sentence id outside
    range(input_size) raises IndexError.
    """

    # one slot per input sentence so that missing hypotheses stay aligned
    hypotheses = [""] * input_size
    with open(infname, "r", encoding="utf-8") as infile:
        for line in infile:
            if not line.startswith("H-"):
                continue
            fields = line.strip().split("\t")
            sid = int(fields[0].split("-")[1])
            # strip() removes the trailing tab of an empty hypothesis, leaving
            # only two fields; treat that as an empty translation.
            hypotheses[sid] = fields[2] if len(fields) > 2 else ""

    if lang == "en":
        en_detok = MosesDetokenizer(lang="en")
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in hypotheses:
                outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
    else:
        xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in hypotheses:
                if transliterate:
                    outstr = indic_detokenize.trivial_detokenize(
                        xliterator.transliterate(sent, common_lang, lang), lang
                    )
                else:
                    outstr = indic_detokenize.trivial_detokenize(sent, lang)
                outfile.write(outstr + "\n")
common.set_resources_path(INDIC_NLP_RESOURCES) + # # The path to the local git repo for Indic NLP Resources + # INDIC_NLP_RESOURCES="" + + # sys.path.append('{}'.format(INDIC_NLP_LIB_HOME)) + # common.set_resources_path(INDIC_NLP_RESOURCES) + + # loader.load() + + infname = sys.argv[1] + outfname = sys.argv[2] + input_size = int(sys.argv[3]) + lang = sys.argv[4] + if len(sys.argv) == 5: + transliterate = False + elif len(sys.argv) == 6: + transliterate = sys.argv[5] + if transliterate.lower() == "true": + transliterate = True + else: + transliterate = False + else: + print(f"Invalid arguments: {sys.argv}") + exit() + + postprocess( + infname, outfname, input_size, lang, common_lang="hi", transliterate=transliterate + ) diff --git a/scripts/preprocess_translate.py b/scripts/preprocess_translate.py new file mode 100644 index 0000000000000000000000000000000000000000..8fbe3c275f7cb655d95125256260190d51b35ca7 --- /dev/null +++ b/scripts/preprocess_translate.py @@ -0,0 +1,172 @@ +INDIC_NLP_LIB_HOME = "indic_nlp_library" +INDIC_NLP_RESOURCES = "indic_nlp_resources" +import sys + +sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME)) +from indicnlp import common + +common.set_resources_path(INDIC_NLP_RESOURCES) +from indicnlp import loader + +loader.load() +from sacremoses import MosesPunctNormalizer +from sacremoses import MosesTokenizer +from sacremoses import MosesDetokenizer +from collections import defaultdict + +from tqdm import tqdm +from joblib import Parallel, delayed + +from indicnlp.tokenize import indic_tokenize +from indicnlp.tokenize import indic_detokenize +from indicnlp.normalize import indic_normalize +from indicnlp.transliterate import unicode_transliterate + + +en_tok = MosesTokenizer(lang="en") +en_normalizer = MosesPunctNormalizer() + + +def preprocess_line(line, normalizer, lang, transliterate=False): + if lang == "en": + return " ".join( + en_tok.tokenize(en_normalizer.normalize(line.strip()), escape=False) + ) + elif transliterate: + # line = 
def preprocess(infname, outfname, lang, transliterate=False):
    """
    Normalize, tokenize and script convert(for Indic)
    return number of sentences input file

    infname: input text file, one sentence per line (utf-8)
    outfname: output file, one preprocessed sentence per line (utf-8)
    lang: language code; "en" uses the Moses pipeline, anything else the
          Indic normalizer for that language
    transliterate: passed to preprocess_line; has no effect for "en"
    """

    # Counted up front only so that tqdm can show a total. Use the same
    # encoding as the real read and close the handle when done.
    with open(infname, "r", encoding="utf-8") as infile:
        num_lines = sum(1 for _ in infile)

    # English needs no Indic normalizer; preprocess_line ignores it for "en".
    if lang == "en":
        normalizer = None
    else:
        normfactory = indic_normalize.IndicNormalizerFactory()
        normalizer = normfactory.get_normalizer(lang)

    with open(infname, "r", encoding="utf-8") as infile, open(
        outfname, "w", encoding="utf-8"
    ) as outfile:

        out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
            delayed(preprocess_line)(line, normalizer, lang, transliterate)
            for line in tqdm(infile, total=num_lines)
        )

        for line in out_lines:
            outfile.write(line + "\n")

    return len(out_lines)
if __name__ == "__main__":

    # Usage: preprocess_translate.py <infname> <outfname> <lang> [transliterate]
    # Validate the argument count before indexing sys.argv. Report problems
    # on stderr with a non-zero status, because callers capture stdout as
    # the sentence count.
    if len(sys.argv) not in (4, 5):
        print(f"Invalid arguments: {sys.argv}", file=sys.stderr)
        sys.exit(1)

    infname = sys.argv[1]
    outfname = sys.argv[2]
    lang = sys.argv[3]

    # the optional 4th argument enables transliteration only when it is "true"
    transliterate = len(sys.argv) == 5 and sys.argv[4].lower() == "true"

    print(preprocess(infname, outfname, lang, transliterate))
def create_txt(outFile, lines, add_newline=False):
    """Write lines to a utf-8 text file, replacing any existing content.

    outFile: path of the file to write
    lines: iterable of strings
    add_newline: if True, a newline is appended to every line; otherwise the
                 lines are written exactly as given
    """
    suffix = "\n" if add_newline else ""
    # the context manager closes the file even if a write fails
    with open(str(outFile), "w", encoding="utf-8") as outfile:
        outfile.writelines(line + suffix for line in lines)
def create_txt(outFile, lines):
    """Write lines to a text file, replacing any existing content.

    outFile: path of the file to write
    lines: sequence of strings (list or tuple)

    If the first line contains no newline character, a newline is appended
    to every line; otherwise all lines are written as given. An empty
    sequence produces an empty file instead of failing on lines[0].

    The file is opened with the platform default encoding, the same as
    read_lines in this script.
    """
    add_newline = bool(lines) and "\n" not in lines[0]
    suffix = "\n" if add_newline else ""
    # the context manager closes the file even if a write fails
    with open("{0}".format(outFile), "w") as outfile:
        outfile.writelines(line + suffix for line in lines)
def normalize_and_gather_all_benchmarks(devtest_dir, many2many=False):
    """Collect the normalized dev+test lines of every benchmark per language pair.

    devtest_dir: directory whose entries are the benchmark datasets
    many2many: forwarded to get_src_tgt_lang_lists to choose the languages

    Returns a dict of dicts keyed first by "src-tgt" and then by "src" /
    "tgt". The values hold the normalized lines of all benchmarks for that
    pair, with duplicate (src, tgt) pairs removed by pair_dedup_lists.
    """
    gathered = defaultdict(lambda: defaultdict(list))
    src_langs, tgt_langs = get_src_tgt_lang_lists(many2many)
    lang_pairs = [
        (src_lang, tgt_lang)
        for src_lang in src_langs
        for tgt_lang in tgt_langs
        if src_lang != tgt_lang
    ]

    for benchmark in os.listdir(devtest_dir):
        for src_lang, tgt_lang in lang_pairs:
            # wat2021 keeps all languages in one flat folder; every other
            # benchmark has one sub-folder per language pair
            if benchmark == "wat2021-devtest":
                base = f"{devtest_dir}/{benchmark}"
            else:
                base = f"{devtest_dir}/{benchmark}/{src_lang}-{tgt_lang}"

            src_dev = read_lines(f"{base}/dev.{src_lang}")
            tgt_dev = read_lines(f"{base}/dev.{tgt_lang}")
            src_test = read_lines(f"{base}/test.{src_lang}")
            tgt_test = read_lines(f"{base}/test.{tgt_lang}")

            # read_lines returns an empty list for a missing file, i.e. the
            # benchmark has no data for this pair
            if not tgt_test or not tgt_dev:
                continue

            bucket = gathered[f"{src_lang}-{tgt_lang}"]
            # dev and test are merged into a single list per side
            bucket["src"].extend(
                strip_and_normalize(line) for line in src_dev + src_test
            )
            bucket["tgt"].extend(
                strip_and_normalize(line) for line in tgt_dev + tgt_test
            )

    # dedup the merged benchmark data of every pair
    for src_lang, tgt_lang in lang_pairs:
        bucket = gathered[f"{src_lang}-{tgt_lang}"]
        src_devtest, tgt_devtest = bucket["src"], bucket["tgt"]
        if not src_devtest or not tgt_devtest:
            continue
        bucket["src"], bucket["tgt"] = pair_dedup_lists(src_devtest, tgt_devtest)

    return gathered
Now this contains all the normalized + # english sentences in all test benchmarks across all lang pair + all_src_sentences_normalized = list(set(all_src_sentences_normalized)) + else: + all_src_sentences_normalized = None + + src_overlaps = [] + tgt_overlaps = [] + for src_lang in SRC_LANGS: + for tgt_lang in TGT_LANGS: + if src_lang == tgt_lang: + continue + new_src_train = [] + new_tgt_train = [] + + pair = f"{src_lang}-{tgt_lang}" + src_train = read_lines(f"{train_dir}/{pair}/train.{src_lang}") + tgt_train = read_lines(f"{train_dir}/{pair}/train.{tgt_lang}") + + len_before = len(src_train) + if len_before == 0: + continue + + src_train_normalized = [strip_and_normalize(line) for line in src_train] + tgt_train_normalized = [strip_and_normalize(line) for line in tgt_train] + + if all_src_sentences_normalized: + src_devtest_normalized = all_src_sentences_normalized + else: + src_devtest_normalized = devtest_pairs_normalized[pair]["src"] + + tgt_devtest_normalized = devtest_pairs_normalized[pair]["tgt"] + + # compute all src and tgt super strict overlaps for a lang pair + overlaps = set(src_train_normalized) & set(src_devtest_normalized) + src_overlaps.extend(list(overlaps)) + + overlaps = set(tgt_train_normalized) & set(tgt_devtest_normalized) + tgt_overlaps.extend(list(overlaps)) + # dictionaries offer o(1) lookup + src_overlaps_dict = {} + tgt_overlaps_dict = {} + for line in src_overlaps: + src_overlaps_dict[line] = 1 + for line in tgt_overlaps: + tgt_overlaps_dict[line] = 1 + + # loop to remove the ovelapped data + idx = -1 + for src_line_norm, tgt_line_norm in tqdm( + zip(src_train_normalized, tgt_train_normalized), total=len_before + ): + idx += 1 + if src_overlaps_dict.get(src_line_norm, None): + continue + if tgt_overlaps_dict.get(tgt_line_norm, None): + continue + new_src_train.append(src_train[idx]) + new_tgt_train.append(tgt_train[idx]) + + len_after = len(new_src_train) + print( + f"Detected overlaps between train and devetest for {pair} is {len_before 
- len_after}" + ) + print(f"saving new files at {train_dir}/{pair}/") + create_txt(f"{train_dir}/{pair}/train.{src_lang}", new_src_train) + create_txt(f"{train_dir}/{pair}/train.{tgt_lang}", new_tgt_train) + + +if __name__ == "__main__": + train_data_dir = sys.argv[1] + # benchmarks directory should contains all the test sets + devtest_data_dir = sys.argv[2] + if len(sys.argv) == 3: + many2many = False + elif len(sys.argv) == 4: + many2many = sys.argv[4] + if many2many.lower() == "true": + many2many = True + else: + many2many = False + remove_train_devtest_overlaps(train_data_dir, devtest_data_dir, many2many) diff --git a/subword-nmt/.github/workflows/pythonpublish.yml b/subword-nmt/.github/workflows/pythonpublish.yml new file mode 100644 index 0000000000000000000000000000000000000000..21f2f01de1818fcb274e162b2cdc89116ed3c556 --- /dev/null +++ b/subword-nmt/.github/workflows/pythonpublish.yml @@ -0,0 +1,26 @@ +name: Upload Python Package + +on: + release: + types: [created] + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - name: Set up Python + uses: actions/setup-python@v1 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build and publish + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + python setup.py sdist bdist_wheel + twine upload dist/* diff --git a/subword-nmt/.gitignore b/subword-nmt/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..b989be6ca157679d1b287b3ca700518423219392 --- /dev/null +++ b/subword-nmt/.gitignore @@ -0,0 +1,105 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# 
PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/subword-nmt/CHANGELOG.md b/subword-nmt/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..9f6772079019214833a806a4de55564a491fa915 --- /dev/null +++ b/subword-nmt/CHANGELOG.md @@ -0,0 +1,49 @@ +CHANGELOG +--------- + +v0.3.8: + - multiprocessing support (get_vocab and apply_bpe) + - progress bar for learn_bpe + - seed parameter for deterministic BPE dropout + - ignore some unicode line separators which would crash subword-nmt + +v0.3.7: + - BPE dropout (Provilkov et al., 2019) + - more efficient glossaries (https://github.com/rsennrich/subword-nmt/pull/69) + +v0.3.6: + - fix to subword-bpe command encoding + +v0.3.5: + - fix to subword-bpe command under Python 2 + - wider support of --total-symbols argument + +v0.3.4: + - segment_tokens method to improve library usability (https://github.com/rsennrich/subword-nmt/pull/52) + - support regex glossaries (https://github.com/rsennrich/subword-nmt/pull/56) + - allow 
unicode separators (https://github.com/rsennrich/subword-nmt/pull/57) + - new option --total-symbols in learn-bpe (commit 61ad8) + - fix documentation (best practices) (https://github.com/rsennrich/subword-nmt/pull/60) + +v0.3: + - library is now installable via pip + - fix occasional problems with UTF-8 whitespace and new lines in learn_bpe and apply_bpe. + - do not silently convert UTF-8 newline characters into "\n" + - do not silently convert UTF-8 whitespace characters into " " + - UTF-8 whitespace and newline characters are now considered part of a word, and segmented by BPE + +v0.2: + - different, more consistent handling of end-of-word token (commit a749a7) (https://github.com/rsennrich/subword-nmt/issues/19) + - allow passing of vocabulary and frequency threshold to apply_bpe.py, preventing the production of OOV (or rare) subword units (commit a00db) + - made learn_bpe.py deterministic (commit 4c54e) + - various changes to make handling of UTF more consistent between Python versions + - new command line arguments for apply_bpe.py: + - '--glossaries' to prevent given strings from being affected by BPE + - '--merges' to apply a subset of learned BPE operations + - new command line arguments for learn_bpe.py: + - '--dict-input': rather than raw text file, interpret input as a frequency dictionary (as created by get_vocab.py). 
+ + +v0.1: + - consistent cross-version unicode handling + - all scripts are now deterministic diff --git a/subword-nmt/LICENSE b/subword-nmt/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..9d4a96d99da1956b4e862fc1db5ed1cb409fc7c4 --- /dev/null +++ b/subword-nmt/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 University of Edinburgh + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/subword-nmt/README.md b/subword-nmt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3690de7788918ecf0e56f5d73b3f29616fd96cc3 --- /dev/null +++ b/subword-nmt/README.md @@ -0,0 +1,138 @@ +Subword Neural Machine Translation +================================== + +This repository contains preprocessing scripts to segment text into subword +units. The primary purpose is to facilitate the reproduction of our experiments +on Neural Machine Translation with subword units (see below for reference). 
+ +INSTALLATION +------------ + +install via pip (from PyPI): + + pip install subword-nmt + +install via pip (from Github): + + pip install https://github.com/rsennrich/subword-nmt/archive/master.zip + +alternatively, clone this repository; the scripts are executable stand-alone. + + +USAGE INSTRUCTIONS +------------------ + +Check the individual files for usage instructions. + +To apply byte pair encoding to word segmentation, invoke these commands: + + subword-nmt learn-bpe -s {num_operations} < {train_file} > {codes_file} + subword-nmt apply-bpe -c {codes_file} < {test_file} > {out_file} + +To segment rare words into character n-grams, do the following: + + subword-nmt get-vocab --train_file {train_file} --vocab_file {vocab_file} + subword-nmt segment-char-ngrams --vocab {vocab_file} -n {order} --shortlist {size} < {test_file} > {out_file} + +The original segmentation can be restored with a simple replacement: + + sed -r 's/(@@ )|(@@ ?$)//g' + +If you cloned the repository and did not install a package, you can also run the individual commands as scripts: + + ./subword_nmt/learn_bpe.py -s {num_operations} < {train_file} > {codes_file} + +BEST PRACTICE ADVICE FOR BYTE PAIR ENCODING IN NMT +-------------------------------------------------- + +We found that for languages that share an alphabet, learning BPE on the +concatenation of the (two or more) involved languages increases the consistency +of segmentation, and reduces the problem of inserting/deleting characters when +copying/transliterating names. + +However, this introduces undesirable edge cases in that a word may be segmented +in a way that has only been observed in the other language, and is thus unknown +at test time. To prevent this, `apply_bpe.py` accepts a `--vocabulary` and a +`--vocabulary-threshold` option so that the script will only produce symbols +which also appear in the vocabulary (with at least some frequency). 
+ +To use this functionality, we recommend the following recipe (assuming L1 and L2 +are the two languages): + +Learn byte pair encoding on the concatenation of the training text, and get resulting vocabulary for each: + + cat {train_file}.L1 {train_file}.L2 | subword-nmt learn-bpe -s {num_operations} -o {codes_file} + subword-nmt apply-bpe -c {codes_file} < {train_file}.L1 | subword-nmt get-vocab > {vocab_file}.L1 + subword-nmt apply-bpe -c {codes_file} < {train_file}.L2 | subword-nmt get-vocab > {vocab_file}.L2 + +more conveniently, you can do the same with this command: + + subword-nmt learn-joint-bpe-and-vocab --input {train_file}.L1 {train_file}.L2 -s {num_operations} -o {codes_file} --write-vocabulary {vocab_file}.L1 {vocab_file}.L2 + +re-apply byte pair encoding with vocabulary filter: + + subword-nmt apply-bpe -c {codes_file} --vocabulary {vocab_file}.L1 --vocabulary-threshold 50 < {train_file}.L1 > {train_file}.BPE.L1 + subword-nmt apply-bpe -c {codes_file} --vocabulary {vocab_file}.L2 --vocabulary-threshold 50 < {train_file}.L2 > {train_file}.BPE.L2 + +as a last step, extract the vocabulary to be used by the neural network. Example with Nematus: + + nematus/data/build_dictionary.py {train_file}.BPE.L1 {train_file}.BPE.L2 + +[you may want to take the union of all vocabularies to support multilingual systems] + +for test/dev data, re-use the same options for consistency: + + subword-nmt apply-bpe -c {codes_file} --vocabulary {vocab_file}.L1 --vocabulary-threshold 50 < {test_file}.L1 > {test_file}.BPE.L1 + +ADVANCED FEATURES +----------------- + +On top of the basic BPE implementation, this repository supports: + +- BPE dropout (Provilkov, Emelianenko and Voita, 2019): https://arxiv.org/abs/1910.13267 + use the argument `--dropout 0.1` for `subword-nmt apply-bpe` to randomly drop out possible merges. + Doing this on the training corpus can improve quality of the final system; at test time, use BPE without dropout. 
+ In order to obtain reproducible results, argument `--seed` can be used to set the random seed. + + **Note:** In the original paper, the authors used BPE-Dropout on each new batch separately. You can copy the training corpus several times to get similar behavior to obtain multiple segmentations for the same sentence. + +- support for glossaries: + use the argument `--glossaries` for `subword-nmt apply-bpe` to provide a list of words and/or regular expressions + that should always be passed to the output without subword segmentation + +PUBLICATIONS +------------ + +The segmentation methods are described in: + +Rico Sennrich, Barry Haddow and Alexandra Birch (2016): + Neural Machine Translation of Rare Words with Subword Units + Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. + +HOW IMPLEMENTATION DIFFERS FROM Sennrich et al. (2016) +------------------------------------------------------ + +This repository implements the subword segmentation as described in Sennrich et al. (2016), +but since version 0.2, there is one core difference related to end-of-word tokens. + +In Sennrich et al. (2016), the end-of-word token `</w>` is initially represented as a separate token, which can be merged with other subwords over time: + +``` +u n d </w> +f u n d </w> +``` + +Since 0.2, end-of-word tokens are initially concatenated with the word-final character: + +``` +u n d</w> +f u n d</w> +``` + +The new representation ensures that when BPE codes are learned from the above examples and then applied to new text, it is clear that a subword unit `und` is unambiguously word-final, and `un` is unambiguously word-internal, preventing the production of up to two different subword units from each BPE merge operation. + +`apply_bpe.py` is backward-compatible and continues to accept old-style BPE files. 
New-style BPE files are identified by having the following first line: `#version: 0.2` + +ACKNOWLEDGMENTS +--------------- +This project has received funding from Samsung Electronics Polska sp. z o.o. - Samsung R&D Institute Poland, and from the European Union’s Horizon 2020 research and innovation programme under grant agreement 645452 (QT21). diff --git a/subword-nmt/apply_bpe.py b/subword-nmt/apply_bpe.py new file mode 100755 index 0000000000000000000000000000000000000000..25996c808d02643c45d0ee0a837b5b291f8aa4f8 --- /dev/null +++ b/subword-nmt/apply_bpe.py @@ -0,0 +1,448 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +"""Use operations learned with learn_bpe.py to encode a new text. +The text will not be smaller, but use only a fixed vocabulary, with rare words +encoded as variable-length sequences of subword units. + +Reference: +Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units. +Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 
+""" + +from __future__ import unicode_literals, division + +import sys +import os +import inspect +import codecs +import io +import argparse +import re +import warnings +import random +import tempfile +from multiprocessing import Pool, cpu_count + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +class BPE(object): + + def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None): + + codes.seek(0) + offset=1 + + # check version information + firstline = codes.readline() + if firstline.startswith('#version:'): + self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$','', firstline.split()[-1]).split(".")]) + offset += 1 + else: + self.version = (0, 1) + codes.seek(0) + + self.bpe_codes = [tuple(item.strip('\r\n ').split(' ')) for (n, item) in enumerate(codes.read().rstrip('\n').split('\n')) if (n < merges or merges == -1)] + + for i, item in enumerate(self.bpe_codes): + if len(item) != 2: + sys.stderr.write('Error: invalid line {0} in BPE codes file: {1}\n'.format(i+offset, ' '.join(item))) + sys.stderr.write('The line should exist of exactly two subword units, separated by whitespace\n') + sys.exit(1) + + # some hacking to deal with duplicates (only consider first instance) + self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))]) + + self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair,i in self.bpe_codes.items()]) + + self.separator = separator + + self.vocab = vocab + + self.glossaries = glossaries if glossaries else [] + + self.glossaries_regex = re.compile('^({})$'.format('|'.join(glossaries))) if glossaries else None + + self.cache = {} + + def process_lines(self, filename, outfile, dropout=0, num_workers=1): + + if sys.version_info < (3, 0): + print("Parallel mode is only supported in Python3.") + sys.exit(1) + + if num_workers == 1: + _process_lines(self, filename, outfile, dropout, 0, 0) + elif num_workers > 1: + with open(filename, encoding="utf-8") as 
f: + size = os.fstat(f.fileno()).st_size + chunk_size = int(size / num_workers) + offsets = [0 for _ in range(num_workers + 1)] + for i in range(1, num_workers): + f.seek(chunk_size * i) + pos = f.tell() + while True: + try: + line = f.readline() + break + except UnicodeDecodeError: + pos -= 1 + f.seek(pos) + offsets[i] = f.tell() + assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'" + res_files = [] + pool = Pool(processes=num_workers) + for i in range(num_workers): + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.close() + res_files.append(tmp) + pool.apply_async(_process_lines, (self, filename, tmp.name, dropout, offsets[i], offsets[i + 1])) + pool.close() + pool.join() + for i in range(num_workers): + with open(res_files[i].name, encoding="utf-8") as fi: + for line in fi: + outfile.write(line) + os.remove(res_files[i].name) + else: + raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers)) + + def process_line(self, line, dropout=0): + """segment line, dealing with leading and trailing whitespace""" + + out = "" + + leading_whitespace = len(line)-len(line.lstrip('\r\n ')) + if leading_whitespace: + out += line[:leading_whitespace] + + out += self.segment(line, dropout) + + trailing_whitespace = len(line)-len(line.rstrip('\r\n ')) + if trailing_whitespace and trailing_whitespace != len(line): + out += line[-trailing_whitespace:] + + return out + + def segment(self, sentence, dropout=0): + """segment single sentence (whitespace-tokenized string) with BPE encoding""" + segments = self.segment_tokens(sentence.strip('\r\n ').split(' '), dropout) + return ' '.join(segments) + + def segment_tokens(self, tokens, dropout=0): + """segment a sequence of tokens with BPE encoding""" + output = [] + for word in tokens: + # eliminate double spaces + if not word: + continue + new_word = [out for segment in self._isolate_glossaries(word) + for out in encode(segment, + self.bpe_codes, + self.bpe_codes_reverse, 
+ self.vocab, + self.separator, + self.version, + self.cache, + self.glossaries_regex, + dropout)] + + for item in new_word[:-1]: + output.append(item + self.separator) + output.append(new_word[-1]) + + return output + + def _isolate_glossaries(self, word): + word_segments = [word] + for gloss in self.glossaries: + word_segments = [out_segments for segment in word_segments + for out_segments in isolate_glossary(segment, gloss)] + return word_segments + +def _process_lines(bpe, filename, outfile, dropout, begin, end): + if isinstance(outfile, str): + fo = open(outfile, "w", encoding="utf-8") + else: + fo = outfile + with open(filename, encoding="utf-8") as f: + f.seek(begin) + line = f.readline() + while line: + pos = f.tell() + assert 0 <= pos < 1e20, "Bad new line separator, e.g. '\\r'" + if end > 0 and pos > end: + break + fo.write(bpe.process_line(line, dropout)) + line = f.readline() + if isinstance(outfile, str): + fo.close() + +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('apply-bpe', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, + metavar='PATH', + help="Input file (default: standard input).") + parser.add_argument( + '--codes', '-c', type=argparse.FileType('r'), metavar='PATH', + required=True, + help="File with BPE codes (created by learn_bpe.py).") + parser.add_argument( + '--merges', '-m', type=int, default=-1, + metavar='INT', + help="Use this many BPE operations (<= number of learned symbols)"+ + "default: Apply all the learned merge operations") + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, + metavar='PATH', + help="Output file (default: standard output)") + 
parser.add_argument( + '--separator', '-s', type=str, default='@@', metavar='STR', + help="Separator between non-final subword units (default: '%(default)s'))") + parser.add_argument( + '--vocabulary', type=argparse.FileType('r'), default=None, + metavar="PATH", + help="Vocabulary file (built with get_vocab.py). If provided, this script reverts any merge operations that produce an OOV.") + parser.add_argument( + '--vocabulary-threshold', type=int, default=None, + metavar="INT", + help="Vocabulary threshold. If vocabulary is provided, any word with frequency < threshold will be treated as OOV") + parser.add_argument( + '--dropout', type=float, default=0, + metavar="P", + help="Dropout BPE merge operations with probability P (Provilkov et al., 2019). Use this on training data only.") + parser.add_argument( + '--glossaries', type=str, nargs='+', default=None, + metavar="STR", + help="Glossaries. Words matching any of the words/regex provided in glossaries will not be affected "+ + "by the BPE (i.e. they will neither be broken into subwords, nor concatenated with other subwords. "+ + "Can be provided as a list of words/regex after the --glossaries argument. Enclose each regex in quotes.") + parser.add_argument( + '--seed', type=int, default=None, + metavar="S", + help="Random seed for the random number generators (e.g. for BPE dropout with --dropout).") + parser.add_argument( + '--num-workers', type=int, default=1, + help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. 
(default: %(default)s)") + + return parser + +def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache, glossaries_regex=None, dropout=0): + """Encode word based on list of BPE merge operations, which are applied consecutively + """ + + if not dropout and orig in cache: + return cache[orig] + + if glossaries_regex and glossaries_regex.match(orig): + cache[orig] = (orig,) + return (orig,) + + if len(orig) == 1: + return orig + + if version == (0, 1): + word = list(orig) + [''] + elif version == (0, 2): # more consistent handling of word-final segments + word = list(orig[:-1]) + [orig[-1] + ''] + else: + raise NotImplementedError + + while len(word) > 1: + + # get list of symbol pairs; optionally apply dropout + pairs = [(bpe_codes[pair],i,pair) for (i,pair) in enumerate(zip(word, word[1:])) if (not dropout or random.random() > dropout) and pair in bpe_codes] + + if not pairs: + break + + #get first merge operation in list of BPE codes + bigram = min(pairs)[2] + + # find start position of all pairs that we want to merge + positions = [i for (rank,i,pair) in pairs if pair == bigram] + + i = 0 + new_word = [] + bigram = ''.join(bigram) + for j in positions: + # merges are invalid if they start before current position. 
This can happen if there are overlapping pairs: (x x x -> xx x) + if j < i: + continue + new_word.extend(word[i:j]) # all symbols before merged pair + new_word.append(bigram) # merged pair + i = j+2 # continue after merged pair + new_word.extend(word[i:]) # add all symbols until end of word + word = new_word + + # don't print end-of-word symbols + if word[-1] == '': + word = word[:-1] + elif word[-1].endswith(''): + word[-1] = word[-1][:-4] + + word = tuple(word) + if vocab: + word = check_vocab_and_split(word, bpe_codes_reverse, vocab, separator) + + cache[orig] = word + return word + +def recursive_split(segment, bpe_codes, vocab, separator, final=False): + """Recursively split segment into smaller units (by reversing BPE merges) + until all units are either in-vocabulary, or cannot be split futher.""" + + try: + if final: + left, right = bpe_codes[segment + ''] + right = right[:-4] + else: + left, right = bpe_codes[segment] + except: + #sys.stderr.write('cannot split {0} further.\n'.format(segment)) + yield segment + return + + if left + separator in vocab: + yield left + else: + for item in recursive_split(left, bpe_codes, vocab, separator, False): + yield item + + if (final and right in vocab) or (not final and right + separator in vocab): + yield right + else: + for item in recursive_split(right, bpe_codes, vocab, separator, final): + yield item + +def check_vocab_and_split(orig, bpe_codes, vocab, separator): + """Check for each segment in word if it is in-vocabulary, + and segment OOV segments into smaller units by reversing the BPE merge operations""" + + out = [] + + for segment in orig[:-1]: + if segment + separator in vocab: + out.append(segment) + else: + #sys.stderr.write('OOV: {0}\n'.format(segment)) + for item in recursive_split(segment, bpe_codes, vocab, separator, False): + out.append(item) + + segment = orig[-1] + if segment in vocab: + out.append(segment) + else: + #sys.stderr.write('OOV: {0}\n'.format(segment)) + for item in 
recursive_split(segment, bpe_codes, vocab, separator, True): + out.append(item) + + return out + + +def read_vocabulary(vocab_file, threshold): + """read vocabulary file produced by get_vocab.py, and filter according to frequency threshold. + """ + + vocabulary = set() + + for line in vocab_file: + word, freq = line.strip('\r\n ').split(' ') + freq = int(freq) + if threshold == None or freq >= threshold: + vocabulary.add(word) + + return vocabulary + +def isolate_glossary(word, glossary): + """ + Isolate a glossary present inside a word. + + Returns a list of subwords. In which all 'glossary' glossaries are isolated + + For example, if 'USA' is the glossary and '1934USABUSA' the word, the return value is: + ['1934', 'USA', 'B', 'USA'] + """ + # regex equivalent of (if word == glossary or glossary not in word) + if re.match('^'+glossary+'$', word) or not re.search(glossary, word): + return [word] + else: + segments = re.split(r'({})'.format(glossary), word) + segments, ending = segments[:-1], segments[-1] + segments = list(filter(None, segments)) # Remove empty strings in regex group. + return segments + [ending.strip('\r\n ')] if ending != '' else segments + +if __name__ == '__main__': + + currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + newdir = os.path.join(currentdir, 'subword_nmt') + if os.path.isdir(newdir): + warnings.simplefilter('default') + warnings.warn( + "this script's location has moved to {0}. This symbolic link will be removed in a future version. 
Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir), + DeprecationWarning + ) + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True) + + parser = create_parser() + args = parser.parse_args() + + if args.num_workers <= 0: + args.num_workers = cpu_count() + + # read/write files as UTF-8 + args.codes = codecs.open(args.codes.name, encoding='utf-8') + if args.input.name != '': + args.input = codecs.open(args.input.name, encoding='utf-8') + if args.output.name != '': + args.output = codecs.open(args.output.name, 'w', encoding='utf-8') + if args.vocabulary: + args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8') + + if args.vocabulary: + vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold) + else: + vocabulary = None + + if sys.version_info < (3, 0): + args.separator = args.separator.decode('UTF-8') + if args.glossaries: + args.glossaries = [g.decode('UTF-8') for g in args.glossaries] + if args.num_workers > 1: + args.num_workers = 1 + warnings.warn("Parallel mode is only supported in Python3. Using 1 processor instead.") + + if args.seed is not None: + random.seed(args.seed) + + bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries) + + if args.input.name == '' or args.num_workers == 1: + if args.num_workers > 1: + warnings.warn("In parallel mode, the input cannot be STDIN. 
Using 1 processor instead.") + for line in args.input: + args.output.write(bpe.process_line(line, args.dropout)) + else: + bpe.process_lines(args.input.name, args.output, args.dropout, args.num_workers) diff --git a/subword-nmt/get_vocab.py b/subword-nmt/get_vocab.py new file mode 100755 index 0000000000000000000000000000000000000000..76eb55904a0bf46c32d140848bda384dad584ca6 --- /dev/null +++ b/subword-nmt/get_vocab.py @@ -0,0 +1,82 @@ +#! /usr/bin/env python +from __future__ import print_function + +import os +import sys +import inspect +import warnings +import argparse +import codecs + +from collections import Counter + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('get-vocab', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Generates vocabulary") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Generates vocabulary") + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, + metavar='PATH', + help="Input file (default: standard input).") + + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, + metavar='PATH', + help="Output file (default: standard output)") + + return parser + +def get_vocab(train_file, vocab_file): + + c = Counter() + + for line in train_file: + for word in line.strip('\r\n ').split(' '): + if word: + c[word] += 1 + + for key,f in sorted(c.items(), key=lambda x: x[1], reverse=True): + vocab_file.write(key+" "+ str(f) + "\n") + +if __name__ == "__main__": + + currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + newdir = os.path.join(currentdir, 'subword_nmt') + if os.path.isdir(newdir): + warnings.simplefilter('default') + warnings.warn( + "this script's location has moved to {0}. 
This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir), + DeprecationWarning + ) + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) + + parser = create_parser() + args = parser.parse_args() + + # read/write files as UTF-8 + if args.input.name != '': + args.input = codecs.open(args.input.name, encoding='utf-8') + if args.output.name != '': + args.output = codecs.open(args.output.name, 'w', encoding='utf-8') + + get_vocab(args.input, args.output) \ No newline at end of file diff --git a/subword-nmt/learn_bpe.py b/subword-nmt/learn_bpe.py new file mode 100755 index 0000000000000000000000000000000000000000..7b01f046fa6b3fd8ba64b7658c23b6f80a4e6ba3 --- /dev/null +++ b/subword-nmt/learn_bpe.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. +Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary +of a text to a configurable number of symbols, with only a small increase in the number of tokens. + +Reference: +Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. +Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. 
+""" + +from __future__ import unicode_literals + +import os +import sys +import inspect +import codecs +import re +import copy +import argparse +import warnings +import tempfile +from multiprocessing import Pool, cpu_count +from collections import defaultdict, Counter + +try: + from tqdm import tqdm +except ImportError: + def tqdm(iterator, *args, **kwargs): + return iterator + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('learn-bpe', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, + metavar='PATH', + help="Input text (default: standard input).") + + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, + metavar='PATH', + help="Output file for BPE codes (default: standard output)") + parser.add_argument( + '--symbols', '-s', type=int, default=10000, + help="Create this many new symbols (each representing a character n-gram) (default: %(default)s)") + parser.add_argument( + '--min-frequency', type=int, default=2, metavar='FREQ', + help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)') + parser.add_argument('--dict-input', action="store_true", + help="If set, input file is interpreted as a dictionary where each line contains a word-count pair") + parser.add_argument( + '--total-symbols', '-t', action="store_true", + help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode text).") + parser.add_argument( + '--num-workers', type=int, default=1, + help="Number of processors to process texts, 
only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)") + parser.add_argument( + '--verbose', '-v', action="store_true", + help="verbose mode.") + + return parser + +def get_vocabulary(fobj, is_dict=False, num_workers=1): + """Read text and return dictionary that encodes vocabulary + """ + vocab = Counter() + if is_dict: + for i, line in enumerate(fobj): + try: + word, count = line.strip('\r\n ').split(' ') + except: + print('Failed reading vocabulary file at line {0}: {1}'.format(i, line)) + sys.exit(1) + vocab[word] += int(count) + elif num_workers == 1 or fobj.name == '': + if num_workers > 1: + warnings.warn("In parallel mode, the input cannot be STDIN. Using 1 processor instead.") + for i, line in enumerate(fobj): + for word in line.strip('\r\n ').split(' '): + if word: + vocab[word] += 1 + elif num_workers > 1: + + if sys.version_info < (3, 0): + print("Parallel mode is only supported in Python3.") + sys.exit(1) + + with open(fobj.name, encoding="utf8") as f: + size = os.fstat(f.fileno()).st_size + chunk_size = int(size / num_workers) + offsets = [0 for _ in range(num_workers + 1)] + for i in range(1, num_workers): + f.seek(chunk_size * i) + pos = f.tell() + while True: + try: + line = f.readline() + break + except UnicodeDecodeError: + pos -= 1 + f.seek(pos) + offsets[i] = f.tell() + assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. 
'\\r'" + + vocab_files = [] + pool = Pool(processes=num_workers) + for i in range(num_workers): + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.close() + vocab_files.append(tmp) + pool.apply_async(_get_vocabulary, (fobj.name, tmp.name, offsets[i], offsets[i + 1])) + pool.close() + pool.join() + import pickle + for i in range(num_workers): + with open(vocab_files[i].name, 'rb') as f: + vocab += pickle.load(f) + os.remove(vocab_files[i].name) + else: + raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers)) + return vocab + +def _get_vocabulary(infile, outfile, begin, end): + import pickle + vocab = Counter() + with open(infile, encoding="utf8") as f: + f.seek(begin) + line = f.readline() + while line: + pos = f.tell() + assert 0 <= pos < 1e20, "Bad new line separator, e.g. '\\r'" + if end > 0 and pos > end: + break + for word in line.strip('\r\n ').split(' '): + if word: + vocab[word] += 1 + line = f.readline() + with open(outfile, 'wb') as f: + pickle.dump(vocab, f) + +def update_pair_statistics(pair, changed, stats, indices): + """Minimally update the indices and frequency of symbol pairs + + if we merge a pair of symbols, only pairs that overlap with occurrences + of this pair are affected, and need to be updated. 
+ """ + stats[pair] = 0 + indices[pair] = defaultdict(int) + first, second = pair + new_pair = first+second + for j, word, old_word, freq in changed: + + # find all instances of pair, and update frequency/indices around it + i = 0 + while True: + # find first symbol + try: + i = old_word.index(first, i) + except ValueError: + break + # if first symbol is followed by second symbol, we've found an occurrence of pair (old_word[i:i+2]) + if i < len(old_word)-1 and old_word[i+1] == second: + # assuming a symbol sequence "A B C", if "B C" is merged, reduce the frequency of "A B" + if i: + prev = old_word[i-1:i+1] + stats[prev] -= freq + indices[prev][j] -= 1 + if i < len(old_word)-2: + # assuming a symbol sequence "A B C B", if "B C" is merged, reduce the frequency of "C B". + # however, skip this if the sequence is A B C B C, because the frequency of "C B" will be reduced by the previous code block + if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second: + nex = old_word[i+1:i+3] + stats[nex] -= freq + indices[nex][j] -= 1 + i += 2 + else: + i += 1 + + i = 0 + while True: + try: + # find new pair + i = word.index(new_pair, i) + except ValueError: + break + # assuming a symbol sequence "A BC D", if "B C" is merged, increase the frequency of "A BC" + if i: + prev = word[i-1:i+1] + stats[prev] += freq + indices[prev][j] += 1 + # assuming a symbol sequence "A BC B", if "B C" is merged, increase the frequency of "BC B" + # however, if the sequence is A BC BC, skip this step because the count of "BC BC" will be incremented by the previous code block + if i < len(word)-1 and word[i+1] != new_pair: + nex = word[i:i+2] + stats[nex] += freq + indices[nex][j] += 1 + i += 1 + + +def get_pair_statistics(vocab): + """Count frequency of all symbol pairs, and create index""" + + # data structure of pair frequencies + stats = defaultdict(int) + + #index from pairs to words + indices = defaultdict(lambda: defaultdict(int)) + + for i, (word, freq) in 
enumerate(vocab): + prev_char = word[0] + for char in word[1:]: + stats[prev_char, char] += freq + indices[prev_char, char][i] += 1 + prev_char = char + + return stats, indices + + +def replace_pair(pair, vocab, indices): + """Replace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'""" + first, second = pair + pair_str = ''.join(pair) + pair_str = pair_str.replace('\\','\\\\') + changes = [] + pattern = re.compile(r'(?'); + # version numbering allows bckward compatibility + outfile.write('#version: 0.2\n') + + vocab = get_vocabulary(infile, is_dict, num_workers) + vocab = dict([(tuple(x[:-1])+(x[-1]+'',) ,y) for (x,y) in vocab.items()]) + sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True) + + stats, indices = get_pair_statistics(sorted_vocab) + big_stats = copy.deepcopy(stats) + + if total_symbols: + uniq_char_internal = set() + uniq_char_final = set() + for word in vocab: + for char in word[:-1]: + uniq_char_internal.add(char) + uniq_char_final.add(word[-1]) + sys.stderr.write('Number of word-internal characters: {0}\n'.format(len(uniq_char_internal))) + sys.stderr.write('Number of word-final characters: {0}\n'.format(len(uniq_char_final))) + sys.stderr.write('Reducing number of merge operations by {0}\n'.format(len(uniq_char_internal) + len(uniq_char_final))) + num_symbols -= len(uniq_char_internal) + len(uniq_char_final) + + # threshold is inspired by Zipfian assumption, but should only affect speed + threshold = max(stats.values()) / 10 + for i in tqdm(range(num_symbols)): + if stats: + most_frequent = max(stats, key=lambda x: (stats[x], x)) + + # we probably missed the best pair because of pruning; go back to full statistics + if not stats or (i and stats[most_frequent] < threshold): + prune_stats(stats, big_stats, threshold) + stats = copy.deepcopy(big_stats) + most_frequent = max(stats, key=lambda x: (stats[x], x)) + # threshold is inspired by Zipfian assumption, but should only affect speed + threshold = 
stats[most_frequent] * i/(i+10000.0) + prune_stats(stats, big_stats, threshold) + + if stats[most_frequent] < min_frequency: + sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency)) + break + + if verbose: + sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent])) + outfile.write('{0} {1}\n'.format(*most_frequent)) + changes = replace_pair(most_frequent, sorted_vocab, indices) + update_pair_statistics(most_frequent, changes, stats, indices) + stats[most_frequent] = 0 + if not i % 100: + prune_stats(stats, big_stats, threshold) + + +if __name__ == '__main__': + + currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + newdir = os.path.join(currentdir, 'subword_nmt') + if os.path.isdir(newdir): + warnings.simplefilter('default') + warnings.warn( + "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir), + DeprecationWarning + ) + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) + + parser = create_parser() + args = parser.parse_args() + + if args.num_workers <= 0: + args.num_workers = cpu_count() + + if sys.version_info < (3, 0) and args.num_workers > 1: + args.num_workers = 1 + warnings.warn("Parallel mode is only supported in Python3. 
Using 1 processor instead.") + + # read/write files as UTF-8 + if args.input.name != '': + args.input = codecs.open(args.input.name, encoding='utf-8') + if args.output.name != '': + args.output = codecs.open(args.output.name, 'w', encoding='utf-8') + + learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input, total_symbols=args.total_symbols, num_workers=args.num_workers) diff --git a/subword-nmt/learn_joint_bpe_and_vocab.py b/subword-nmt/learn_joint_bpe_and_vocab.py new file mode 100755 index 0000000000000000000000000000000000000000..d75ff3d3f687c4f4776cc0246b05e3f6765374b2 --- /dev/null +++ b/subword-nmt/learn_joint_bpe_and_vocab.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. +This script learns BPE jointly on a concatenation of a list of texts (typically the source and target side of a parallel corpus, +applies the learned operation to each and (optionally) returns the resulting vocabulary of each text. +The vocabulary can be used in apply_bpe.py to avoid producing symbols that are rare or OOV in a training text. + +Reference: +Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. +Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. +""" + +from __future__ import unicode_literals + +import sys +import os +import inspect +import codecs +import argparse +import tempfile +import warnings +from collections import Counter +from multiprocessing import cpu_count + +#hack to get imports working if running this as a script, or within a package +if __name__ == '__main__': + import learn_bpe + import apply_bpe +else: + from . import learn_bpe + from . 
import apply_bpe + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('learn-joint-bpe-and-vocab', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), required=True, nargs = '+', + metavar='PATH', + help="Input texts (multiple allowed).") + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), required=True, + metavar='PATH', + help="Output file for BPE codes.") + parser.add_argument( + '--symbols', '-s', type=int, default=10000, + help="Create this many new symbols (each representing a character n-gram) (default: %(default)s)") + parser.add_argument( + '--separator', type=str, default='@@', metavar='STR', + help="Separator between non-final subword units (default: '%(default)s')") + parser.add_argument( + '--write-vocabulary', type=argparse.FileType('w'), required=True, nargs = '+', default=None, + metavar='PATH', dest='vocab', + help='Write to these vocabulary files after applying BPE. One per input text. Used for filtering in apply_bpe.py') + parser.add_argument( + '--min-frequency', type=int, default=2, metavar='FREQ', + help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)') + parser.add_argument( + '--total-symbols', '-t', action="store_true", + help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode text).") + parser.add_argument( + '--num-workers', type=int, default=1, + help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. 
(default: %(default)s)") + parser.add_argument( + '--verbose', '-v', action="store_true", + help="verbose mode.") + + return parser + +def learn_joint_bpe_and_vocab(args): + + if args.vocab and len(args.input) != len(args.vocab): + sys.stderr.write('Error: number of input files and vocabulary files must match\n') + sys.exit(1) + + # read/write files as UTF-8 + args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input] + args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab] + + # get combined vocabulary of all input texts + full_vocab = Counter() + for f in args.input: + full_vocab += learn_bpe.get_vocabulary(f, num_workers=args.num_workers) + f.seek(0) + + vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()] + + # learn BPE on combined vocabulary + with codecs.open(args.output.name, 'w', encoding='UTF-8') as output: + learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True, total_symbols=args.total_symbols) + + with codecs.open(args.output.name, encoding='UTF-8') as codes: + bpe = apply_bpe.BPE(codes, separator=args.separator) + + # apply BPE to each training corpus and get vocabulary + for train_file, vocab_file in zip(args.input, args.vocab): + + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.close() + + tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8') + + train_file.seek(0) + bpe.process_lines(train_file.name, tmpout, num_workers=args.num_workers) + + tmpout.close() + tmpin = codecs.open(tmp.name, encoding='UTF-8') + + vocab = learn_bpe.get_vocabulary(tmpin, num_workers=args.num_workers) + tmpin.close() + os.remove(tmp.name) + + for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True): + vocab_file.write("{0} {1}\n".format(key, freq)) + vocab_file.close() + + +if __name__ == '__main__': + + currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + newdir = os.path.join(currentdir, 'subword_nmt') 
+ if os.path.isdir(newdir): + warnings.simplefilter('default') + warnings.warn( + "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir), + DeprecationWarning + ) + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) + + parser = create_parser() + args = parser.parse_args() + + if args.num_workers <= 0: + args.num_workers = cpu_count() + + if sys.version_info < (3, 0): + args.separator = args.separator.decode('UTF-8') + if args.num_workers > 1: + args.num_workers = 1 + warnings.warn("Parallel mode is only supported in Python3. 
Using 1 processor instead.") + + assert(len(args.input) == len(args.vocab)) + + learn_joint_bpe_and_vocab(args) diff --git a/subword-nmt/setup.py b/subword-nmt/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..23d16db1a28778604a7bfacccebe5f113cf332cd --- /dev/null +++ b/subword-nmt/setup.py @@ -0,0 +1,38 @@ +from setuptools import setup, find_packages +import unittest +import codecs + +def test_suite(): + test_loader = unittest.TestLoader() + test_suite = test_loader.discover('subword_nmt/tests', pattern='test_*.py') + + return test_suite + + +setup( + name='subword_nmt', + version='0.3.8', + description='Unsupervised Word Segmentation for Neural Machine Translation and Text Generation', + long_description=(codecs.open("README.md", encoding='utf-8').read() + + "\n\n" + codecs.open("CHANGELOG.md", encoding='utf-8').read()), + long_description_content_type="text/markdown", + url='https://github.com/rsennrich/subword-nmt', + author='Rico Sennrich', + license='MIT', + test_suite='setup.test_suite', + classifiers=[ + 'Intended Audience :: Developers', + 'Topic :: Text Processing', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 3', + ], + install_requires=['mock', + 'tqdm'], + packages=find_packages(), + entry_points={ + 'console_scripts': ['subword-nmt=subword_nmt.subword_nmt:main'], + }, + include_package_data=True +) diff --git a/subword-nmt/subword_nmt/__init__.py b/subword-nmt/subword_nmt/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/subword-nmt/subword_nmt/apply_bpe.py b/subword-nmt/subword_nmt/apply_bpe.py new file mode 100755 index 0000000000000000000000000000000000000000..25996c808d02643c45d0ee0a837b5b291f8aa4f8 --- /dev/null +++ b/subword-nmt/subword_nmt/apply_bpe.py @@ -0,0 +1,448 @@ +#!/usr/bin/env python +# 
-*- coding: utf-8 -*- +# Author: Rico Sennrich + +"""Use operations learned with learn_bpe.py to encode a new text. +The text will not be smaller, but use only a fixed vocabulary, with rare words +encoded as variable-length sequences of subword units. + +Reference: +Rico Sennrich, Barry Haddow and Alexandra Birch (2015). Neural Machine Translation of Rare Words with Subword Units. +Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. +""" + +from __future__ import unicode_literals, division + +import sys +import os +import inspect +import codecs +import io +import argparse +import re +import warnings +import random +import tempfile +from multiprocessing import Pool, cpu_count + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +class BPE(object): + + def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None): + + codes.seek(0) + offset=1 + + # check version information + firstline = codes.readline() + if firstline.startswith('#version:'): + self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$','', firstline.split()[-1]).split(".")]) + offset += 1 + else: + self.version = (0, 1) + codes.seek(0) + + self.bpe_codes = [tuple(item.strip('\r\n ').split(' ')) for (n, item) in enumerate(codes.read().rstrip('\n').split('\n')) if (n < merges or merges == -1)] + + for i, item in enumerate(self.bpe_codes): + if len(item) != 2: + sys.stderr.write('Error: invalid line {0} in BPE codes file: {1}\n'.format(i+offset, ' '.join(item))) + sys.stderr.write('The line should exist of exactly two subword units, separated by whitespace\n') + sys.exit(1) + + # some hacking to deal with duplicates (only consider first instance) + self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))]) + + self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair,i in self.bpe_codes.items()]) + + self.separator = separator + + self.vocab = vocab + 
+ self.glossaries = glossaries if glossaries else [] + + self.glossaries_regex = re.compile('^({})$'.format('|'.join(glossaries))) if glossaries else None + + self.cache = {} + + def process_lines(self, filename, outfile, dropout=0, num_workers=1): + + if sys.version_info < (3, 0): + print("Parallel mode is only supported in Python3.") + sys.exit(1) + + if num_workers == 1: + _process_lines(self, filename, outfile, dropout, 0, 0) + elif num_workers > 1: + with open(filename, encoding="utf-8") as f: + size = os.fstat(f.fileno()).st_size + chunk_size = int(size / num_workers) + offsets = [0 for _ in range(num_workers + 1)] + for i in range(1, num_workers): + f.seek(chunk_size * i) + pos = f.tell() + while True: + try: + line = f.readline() + break + except UnicodeDecodeError: + pos -= 1 + f.seek(pos) + offsets[i] = f.tell() + assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. '\\r'" + res_files = [] + pool = Pool(processes=num_workers) + for i in range(num_workers): + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.close() + res_files.append(tmp) + pool.apply_async(_process_lines, (self, filename, tmp.name, dropout, offsets[i], offsets[i + 1])) + pool.close() + pool.join() + for i in range(num_workers): + with open(res_files[i].name, encoding="utf-8") as fi: + for line in fi: + outfile.write(line) + os.remove(res_files[i].name) + else: + raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers)) + + def process_line(self, line, dropout=0): + """segment line, dealing with leading and trailing whitespace""" + + out = "" + + leading_whitespace = len(line)-len(line.lstrip('\r\n ')) + if leading_whitespace: + out += line[:leading_whitespace] + + out += self.segment(line, dropout) + + trailing_whitespace = len(line)-len(line.rstrip('\r\n ')) + if trailing_whitespace and trailing_whitespace != len(line): + out += line[-trailing_whitespace:] + + return out + + def segment(self, sentence, dropout=0): + """segment 
single sentence (whitespace-tokenized string) with BPE encoding""" + segments = self.segment_tokens(sentence.strip('\r\n ').split(' '), dropout) + return ' '.join(segments) + + def segment_tokens(self, tokens, dropout=0): + """segment a sequence of tokens with BPE encoding""" + output = [] + for word in tokens: + # eliminate double spaces + if not word: + continue + new_word = [out for segment in self._isolate_glossaries(word) + for out in encode(segment, + self.bpe_codes, + self.bpe_codes_reverse, + self.vocab, + self.separator, + self.version, + self.cache, + self.glossaries_regex, + dropout)] + + for item in new_word[:-1]: + output.append(item + self.separator) + output.append(new_word[-1]) + + return output + + def _isolate_glossaries(self, word): + word_segments = [word] + for gloss in self.glossaries: + word_segments = [out_segments for segment in word_segments + for out_segments in isolate_glossary(segment, gloss)] + return word_segments + +def _process_lines(bpe, filename, outfile, dropout, begin, end): + if isinstance(outfile, str): + fo = open(outfile, "w", encoding="utf-8") + else: + fo = outfile + with open(filename, encoding="utf-8") as f: + f.seek(begin) + line = f.readline() + while line: + pos = f.tell() + assert 0 <= pos < 1e20, "Bad new line separator, e.g. 
'\\r'" + if end > 0 and pos > end: + break + fo.write(bpe.process_line(line, dropout)) + line = f.readline() + if isinstance(outfile, str): + fo.close() + +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('apply-bpe', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, + metavar='PATH', + help="Input file (default: standard input).") + parser.add_argument( + '--codes', '-c', type=argparse.FileType('r'), metavar='PATH', + required=True, + help="File with BPE codes (created by learn_bpe.py).") + parser.add_argument( + '--merges', '-m', type=int, default=-1, + metavar='INT', + help="Use this many BPE operations (<= number of learned symbols)"+ + "default: Apply all the learned merge operations") + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, + metavar='PATH', + help="Output file (default: standard output)") + parser.add_argument( + '--separator', '-s', type=str, default='@@', metavar='STR', + help="Separator between non-final subword units (default: '%(default)s'))") + parser.add_argument( + '--vocabulary', type=argparse.FileType('r'), default=None, + metavar="PATH", + help="Vocabulary file (built with get_vocab.py). If provided, this script reverts any merge operations that produce an OOV.") + parser.add_argument( + '--vocabulary-threshold', type=int, default=None, + metavar="INT", + help="Vocabulary threshold. If vocabulary is provided, any word with frequency < threshold will be treated as OOV") + parser.add_argument( + '--dropout', type=float, default=0, + metavar="P", + help="Dropout BPE merge operations with probability P (Provilkov et al., 2019). 
Use this on training data only.") + parser.add_argument( + '--glossaries', type=str, nargs='+', default=None, + metavar="STR", + help="Glossaries. Words matching any of the words/regex provided in glossaries will not be affected "+ + "by the BPE (i.e. they will neither be broken into subwords, nor concatenated with other subwords. "+ + "Can be provided as a list of words/regex after the --glossaries argument. Enclose each regex in quotes.") + parser.add_argument( + '--seed', type=int, default=None, + metavar="S", + help="Random seed for the random number generators (e.g. for BPE dropout with --dropout).") + parser.add_argument( + '--num-workers', type=int, default=1, + help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)") + + return parser + +def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache, glossaries_regex=None, dropout=0): + """Encode word based on list of BPE merge operations, which are applied consecutively + """ + + if not dropout and orig in cache: + return cache[orig] + + if glossaries_regex and glossaries_regex.match(orig): + cache[orig] = (orig,) + return (orig,) + + if len(orig) == 1: + return orig + + if version == (0, 1): + word = list(orig) + [''] + elif version == (0, 2): # more consistent handling of word-final segments + word = list(orig[:-1]) + [orig[-1] + ''] + else: + raise NotImplementedError + + while len(word) > 1: + + # get list of symbol pairs; optionally apply dropout + pairs = [(bpe_codes[pair],i,pair) for (i,pair) in enumerate(zip(word, word[1:])) if (not dropout or random.random() > dropout) and pair in bpe_codes] + + if not pairs: + break + + #get first merge operation in list of BPE codes + bigram = min(pairs)[2] + + # find start position of all pairs that we want to merge + positions = [i for (rank,i,pair) in pairs if pair == bigram] + + i = 0 + new_word = [] + bigram = ''.join(bigram) + for j in positions: + # merges 
are invalid if they start before current position. This can happen if there are overlapping pairs: (x x x -> xx x) + if j < i: + continue + new_word.extend(word[i:j]) # all symbols before merged pair + new_word.append(bigram) # merged pair + i = j+2 # continue after merged pair + new_word.extend(word[i:]) # add all symbols until end of word + word = new_word + + # don't print end-of-word symbols + if word[-1] == '': + word = word[:-1] + elif word[-1].endswith(''): + word[-1] = word[-1][:-4] + + word = tuple(word) + if vocab: + word = check_vocab_and_split(word, bpe_codes_reverse, vocab, separator) + + cache[orig] = word + return word + +def recursive_split(segment, bpe_codes, vocab, separator, final=False): + """Recursively split segment into smaller units (by reversing BPE merges) + until all units are either in-vocabulary, or cannot be split futher.""" + + try: + if final: + left, right = bpe_codes[segment + ''] + right = right[:-4] + else: + left, right = bpe_codes[segment] + except: + #sys.stderr.write('cannot split {0} further.\n'.format(segment)) + yield segment + return + + if left + separator in vocab: + yield left + else: + for item in recursive_split(left, bpe_codes, vocab, separator, False): + yield item + + if (final and right in vocab) or (not final and right + separator in vocab): + yield right + else: + for item in recursive_split(right, bpe_codes, vocab, separator, final): + yield item + +def check_vocab_and_split(orig, bpe_codes, vocab, separator): + """Check for each segment in word if it is in-vocabulary, + and segment OOV segments into smaller units by reversing the BPE merge operations""" + + out = [] + + for segment in orig[:-1]: + if segment + separator in vocab: + out.append(segment) + else: + #sys.stderr.write('OOV: {0}\n'.format(segment)) + for item in recursive_split(segment, bpe_codes, vocab, separator, False): + out.append(item) + + segment = orig[-1] + if segment in vocab: + out.append(segment) + else: + #sys.stderr.write('OOV: 
{0}\n'.format(segment)) + for item in recursive_split(segment, bpe_codes, vocab, separator, True): + out.append(item) + + return out + + +def read_vocabulary(vocab_file, threshold): + """read vocabulary file produced by get_vocab.py, and filter according to frequency threshold. + """ + + vocabulary = set() + + for line in vocab_file: + word, freq = line.strip('\r\n ').split(' ') + freq = int(freq) + if threshold == None or freq >= threshold: + vocabulary.add(word) + + return vocabulary + +def isolate_glossary(word, glossary): + """ + Isolate a glossary present inside a word. + + Returns a list of subwords. In which all 'glossary' glossaries are isolated + + For example, if 'USA' is the glossary and '1934USABUSA' the word, the return value is: + ['1934', 'USA', 'B', 'USA'] + """ + # regex equivalent of (if word == glossary or glossary not in word) + if re.match('^'+glossary+'$', word) or not re.search(glossary, word): + return [word] + else: + segments = re.split(r'({})'.format(glossary), word) + segments, ending = segments[:-1], segments[-1] + segments = list(filter(None, segments)) # Remove empty strings in regex group. + return segments + [ending.strip('\r\n ')] if ending != '' else segments + +if __name__ == '__main__': + + currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + newdir = os.path.join(currentdir, 'subword_nmt') + if os.path.isdir(newdir): + warnings.simplefilter('default') + warnings.warn( + "this script's location has moved to {0}. This symbolic link will be removed in a future version. 
Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir), + DeprecationWarning + ) + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True) + + parser = create_parser() + args = parser.parse_args() + + if args.num_workers <= 0: + args.num_workers = cpu_count() + + # read/write files as UTF-8 + args.codes = codecs.open(args.codes.name, encoding='utf-8') + if args.input.name != '': + args.input = codecs.open(args.input.name, encoding='utf-8') + if args.output.name != '': + args.output = codecs.open(args.output.name, 'w', encoding='utf-8') + if args.vocabulary: + args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8') + + if args.vocabulary: + vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold) + else: + vocabulary = None + + if sys.version_info < (3, 0): + args.separator = args.separator.decode('UTF-8') + if args.glossaries: + args.glossaries = [g.decode('UTF-8') for g in args.glossaries] + if args.num_workers > 1: + args.num_workers = 1 + warnings.warn("Parallel mode is only supported in Python3. Using 1 processor instead.") + + if args.seed is not None: + random.seed(args.seed) + + bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries) + + if args.input.name == '' or args.num_workers == 1: + if args.num_workers > 1: + warnings.warn("In parallel mode, the input cannot be STDIN. 
Using 1 processor instead.") + for line in args.input: + args.output.write(bpe.process_line(line, args.dropout)) + else: + bpe.process_lines(args.input.name, args.output, args.dropout, args.num_workers) diff --git a/subword-nmt/subword_nmt/bpe_toy.py b/subword-nmt/subword_nmt/bpe_toy.py new file mode 100755 index 0000000000000000000000000000000000000000..0421b255861cb56eb40bf58a8225807cc396e968 --- /dev/null +++ b/subword-nmt/subword_nmt/bpe_toy.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. +Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary +of a text to a configurable number of symbols, with only a small increase in the number of tokens. +This is an (inefficient) toy implementation that shows the algorithm. For processing large datasets, +indexing and incremental updates can be used to speed up the implementation (see learn_bpe.py). + +Reference: +Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. +Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. +""" + + +import re +import sys +import collections + +def get_stats(vocab): + pairs = collections.defaultdict(int) + for word, freq in vocab.items(): + symbols = word.split() + for i in range(len(symbols)-1): + pairs[symbols[i],symbols[i+1]] += freq + return pairs + +def merge_vocab(pair, v_in): + v_out = {} + bigram_pattern = re.escape(' '.join(pair)) + p = re.compile(r'(?' : 5, 'l o w e r' : 2, + 'n e w e s t' : 6, 'w i d e s t' : 3} +num_merges = 15 +for i in range(num_merges): + pairs = get_stats(vocab) + try: + best = max(pairs, key=pairs.get) + except ValueError: + break + if pairs[best] < 2: + sys.stderr.write('no pair has frequency > 1. 
Stopping\n') + break + vocab = merge_vocab(best, vocab) + print(best) diff --git a/subword-nmt/subword_nmt/chrF.py b/subword-nmt/subword_nmt/chrF.py new file mode 100755 index 0000000000000000000000000000000000000000..3a35941d61b618a8b32d937b51f0d10071129bd6 --- /dev/null +++ b/subword-nmt/subword_nmt/chrF.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +"""Compute chrF3 for machine translation evaluation + +Reference: +Maja Popović (2015). chrF: character n-gram F-score for automatic MT evaluation. In Proceedings of the Tenth Workshop on Statistical Machine Translationn, pages 392–395, Lisbon, Portugal. +""" + +from __future__ import print_function, unicode_literals, division + +import sys +import codecs +import io +import argparse + +from collections import defaultdict + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +def create_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + + parser.add_argument( + '--ref', '-r', type=argparse.FileType('r'), required=True, + metavar='PATH', + help="Reference file") + parser.add_argument( + '--hyp', type=argparse.FileType('r'), metavar='PATH', + default=sys.stdin, + help="Hypothesis file (default: stdin).") + parser.add_argument( + '--beta', '-b', type=float, default=3, + metavar='FLOAT', + help="beta parameter (default: '%(default)s')") + parser.add_argument( + '--ngram', '-n', type=int, default=6, + metavar='INT', + help="ngram order (default: '%(default)s')") + parser.add_argument( + '--space', '-s', action='store_true', + help="take spaces into account (default: '%(default)s')") + parser.add_argument( + '--precision', action='store_true', + help="report precision (default: '%(default)s')") + parser.add_argument( + '--recall', action='store_true', + help="report recall (default: '%(default)s')") + + return parser + +def 
extract_ngrams(words, max_length=4, spaces=False): + + if not spaces: + words = ''.join(words.split()) + else: + words = words.strip() + + results = defaultdict(lambda: defaultdict(int)) + for length in range(max_length): + for start_pos in range(len(words)): + end_pos = start_pos + length + 1 + if end_pos <= len(words): + results[length][tuple(words[start_pos: end_pos])] += 1 + return results + + +def get_correct(ngrams_ref, ngrams_test, correct, total): + + for rank in ngrams_test: + for chain in ngrams_test[rank]: + total[rank] += ngrams_test[rank][chain] + if chain in ngrams_ref[rank]: + correct[rank] += min(ngrams_test[rank][chain], ngrams_ref[rank][chain]) + + return correct, total + + +def f1(correct, total_hyp, total_ref, max_length, beta=3, smooth=0): + + precision = 0 + recall = 0 + + for i in range(max_length): + if total_hyp[i] + smooth and total_ref[i] + smooth: + precision += (correct[i] + smooth) / (total_hyp[i] + smooth) + recall += (correct[i] + smooth) / (total_ref[i] + smooth) + + precision /= max_length + recall /= max_length + + return (1 + beta**2) * (precision*recall) / ((beta**2 * precision) + recall), precision, recall + +def main(args): + + correct = [0]*args.ngram + total = [0]*args.ngram + total_ref = [0]*args.ngram + for line in args.ref: + line2 = args.hyp.readline() + + ngrams_ref = extract_ngrams(line, max_length=args.ngram, spaces=args.space) + ngrams_test = extract_ngrams(line2, max_length=args.ngram, spaces=args.space) + + get_correct(ngrams_ref, ngrams_test, correct, total) + + for rank in ngrams_ref: + for chain in ngrams_ref[rank]: + total_ref[rank] += ngrams_ref[rank][chain] + + chrf, precision, recall = f1(correct, total, total_ref, args.ngram, args.beta) + + print('chrF3: {0:.4f}'.format(chrf)) + if args.precision: + print('chrPrec: {0:.4f}'.format(precision)) + if args.recall: + print('chrRec: {0:.4f}'.format(recall)) + +if __name__ == '__main__': + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = 
codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True) + + parser = create_parser() + args = parser.parse_args() + + main(args) diff --git a/subword-nmt/subword_nmt/get_vocab.py b/subword-nmt/subword_nmt/get_vocab.py new file mode 100755 index 0000000000000000000000000000000000000000..76eb55904a0bf46c32d140848bda384dad584ca6 --- /dev/null +++ b/subword-nmt/subword_nmt/get_vocab.py @@ -0,0 +1,82 @@ +#! /usr/bin/env python +from __future__ import print_function + +import os +import sys +import inspect +import warnings +import argparse +import codecs + +from collections import Counter + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('get-vocab', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Generates vocabulary") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Generates vocabulary") + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, + metavar='PATH', + help="Input file (default: standard input).") + + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, + metavar='PATH', + help="Output file (default: standard output)") + + return parser + +def get_vocab(train_file, vocab_file): + + c = Counter() + + for line in train_file: + for word in line.strip('\r\n ').split(' '): + if word: + c[word] += 1 + + for key,f in sorted(c.items(), key=lambda x: x[1], reverse=True): + vocab_file.write(key+" "+ str(f) + "\n") + +if __name__ == "__main__": + + 
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + newdir = os.path.join(currentdir, 'subword_nmt') + if os.path.isdir(newdir): + warnings.simplefilter('default') + warnings.warn( + "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir), + DeprecationWarning + ) + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) + + parser = create_parser() + args = parser.parse_args() + + # read/write files as UTF-8 + if args.input.name != '': + args.input = codecs.open(args.input.name, encoding='utf-8') + if args.output.name != '': + args.output = codecs.open(args.output.name, 'w', encoding='utf-8') + + get_vocab(args.input, args.output) \ No newline at end of file diff --git a/subword-nmt/subword_nmt/learn_bpe.py b/subword-nmt/subword_nmt/learn_bpe.py new file mode 100755 index 0000000000000000000000000000000000000000..7b01f046fa6b3fd8ba64b7658c23b6f80a4e6ba3 --- /dev/null +++ b/subword-nmt/subword_nmt/learn_bpe.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. +Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary +of a text to a configurable number of symbols, with only a small increase in the number of tokens. + +Reference: +Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 
+Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. +""" + +from __future__ import unicode_literals + +import os +import sys +import inspect +import codecs +import re +import copy +import argparse +import warnings +import tempfile +from multiprocessing import Pool, cpu_count +from collections import defaultdict, Counter + +try: + from tqdm import tqdm +except ImportError: + def tqdm(iterator, *args, **kwargs): + return iterator + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('learn-bpe', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, + metavar='PATH', + help="Input text (default: standard input).") + + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, + metavar='PATH', + help="Output file for BPE codes (default: standard output)") + parser.add_argument( + '--symbols', '-s', type=int, default=10000, + help="Create this many new symbols (each representing a character n-gram) (default: %(default)s)") + parser.add_argument( + '--min-frequency', type=int, default=2, metavar='FREQ', + help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)') + parser.add_argument('--dict-input', action="store_true", + help="If set, input file is interpreted as a dictionary where each line contains a word-count pair") + parser.add_argument( + '--total-symbols', '-t', action="store_true", + help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode 
text).") + parser.add_argument( + '--num-workers', type=int, default=1, + help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. (default: %(default)s)") + parser.add_argument( + '--verbose', '-v', action="store_true", + help="verbose mode.") + + return parser + +def get_vocabulary(fobj, is_dict=False, num_workers=1): + """Read text and return dictionary that encodes vocabulary + """ + vocab = Counter() + if is_dict: + for i, line in enumerate(fobj): + try: + word, count = line.strip('\r\n ').split(' ') + except: + print('Failed reading vocabulary file at line {0}: {1}'.format(i, line)) + sys.exit(1) + vocab[word] += int(count) + elif num_workers == 1 or fobj.name == '': + if num_workers > 1: + warnings.warn("In parallel mode, the input cannot be STDIN. Using 1 processor instead.") + for i, line in enumerate(fobj): + for word in line.strip('\r\n ').split(' '): + if word: + vocab[word] += 1 + elif num_workers > 1: + + if sys.version_info < (3, 0): + print("Parallel mode is only supported in Python3.") + sys.exit(1) + + with open(fobj.name, encoding="utf8") as f: + size = os.fstat(f.fileno()).st_size + chunk_size = int(size / num_workers) + offsets = [0 for _ in range(num_workers + 1)] + for i in range(1, num_workers): + f.seek(chunk_size * i) + pos = f.tell() + while True: + try: + line = f.readline() + break + except UnicodeDecodeError: + pos -= 1 + f.seek(pos) + offsets[i] = f.tell() + assert 0 <= offsets[i] < 1e20, "Bad new line separator, e.g. 
'\\r'" + + vocab_files = [] + pool = Pool(processes=num_workers) + for i in range(num_workers): + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.close() + vocab_files.append(tmp) + pool.apply_async(_get_vocabulary, (fobj.name, tmp.name, offsets[i], offsets[i + 1])) + pool.close() + pool.join() + import pickle + for i in range(num_workers): + with open(vocab_files[i].name, 'rb') as f: + vocab += pickle.load(f) + os.remove(vocab_files[i].name) + else: + raise ValueError('`num_workers` is expected to be a positive number, but got {}.'.format(num_workers)) + return vocab + +def _get_vocabulary(infile, outfile, begin, end): + import pickle + vocab = Counter() + with open(infile, encoding="utf8") as f: + f.seek(begin) + line = f.readline() + while line: + pos = f.tell() + assert 0 <= pos < 1e20, "Bad new line separator, e.g. '\\r'" + if end > 0 and pos > end: + break + for word in line.strip('\r\n ').split(' '): + if word: + vocab[word] += 1 + line = f.readline() + with open(outfile, 'wb') as f: + pickle.dump(vocab, f) + +def update_pair_statistics(pair, changed, stats, indices): + """Minimally update the indices and frequency of symbol pairs + + if we merge a pair of symbols, only pairs that overlap with occurrences + of this pair are affected, and need to be updated. 
+ """ + stats[pair] = 0 + indices[pair] = defaultdict(int) + first, second = pair + new_pair = first+second + for j, word, old_word, freq in changed: + + # find all instances of pair, and update frequency/indices around it + i = 0 + while True: + # find first symbol + try: + i = old_word.index(first, i) + except ValueError: + break + # if first symbol is followed by second symbol, we've found an occurrence of pair (old_word[i:i+2]) + if i < len(old_word)-1 and old_word[i+1] == second: + # assuming a symbol sequence "A B C", if "B C" is merged, reduce the frequency of "A B" + if i: + prev = old_word[i-1:i+1] + stats[prev] -= freq + indices[prev][j] -= 1 + if i < len(old_word)-2: + # assuming a symbol sequence "A B C B", if "B C" is merged, reduce the frequency of "C B". + # however, skip this if the sequence is A B C B C, because the frequency of "C B" will be reduced by the previous code block + if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second: + nex = old_word[i+1:i+3] + stats[nex] -= freq + indices[nex][j] -= 1 + i += 2 + else: + i += 1 + + i = 0 + while True: + try: + # find new pair + i = word.index(new_pair, i) + except ValueError: + break + # assuming a symbol sequence "A BC D", if "B C" is merged, increase the frequency of "A BC" + if i: + prev = word[i-1:i+1] + stats[prev] += freq + indices[prev][j] += 1 + # assuming a symbol sequence "A BC B", if "B C" is merged, increase the frequency of "BC B" + # however, if the sequence is A BC BC, skip this step because the count of "BC BC" will be incremented by the previous code block + if i < len(word)-1 and word[i+1] != new_pair: + nex = word[i:i+2] + stats[nex] += freq + indices[nex][j] += 1 + i += 1 + + +def get_pair_statistics(vocab): + """Count frequency of all symbol pairs, and create index""" + + # data structure of pair frequencies + stats = defaultdict(int) + + #index from pairs to words + indices = defaultdict(lambda: defaultdict(int)) + + for i, (word, freq) in 
enumerate(vocab): + prev_char = word[0] + for char in word[1:]: + stats[prev_char, char] += freq + indices[prev_char, char][i] += 1 + prev_char = char + + return stats, indices + + +def replace_pair(pair, vocab, indices): + """Replace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'""" + first, second = pair + pair_str = ''.join(pair) + pair_str = pair_str.replace('\\','\\\\') + changes = [] + pattern = re.compile(r'(?'); + # version numbering allows bckward compatibility + outfile.write('#version: 0.2\n') + + vocab = get_vocabulary(infile, is_dict, num_workers) + vocab = dict([(tuple(x[:-1])+(x[-1]+'',) ,y) for (x,y) in vocab.items()]) + sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True) + + stats, indices = get_pair_statistics(sorted_vocab) + big_stats = copy.deepcopy(stats) + + if total_symbols: + uniq_char_internal = set() + uniq_char_final = set() + for word in vocab: + for char in word[:-1]: + uniq_char_internal.add(char) + uniq_char_final.add(word[-1]) + sys.stderr.write('Number of word-internal characters: {0}\n'.format(len(uniq_char_internal))) + sys.stderr.write('Number of word-final characters: {0}\n'.format(len(uniq_char_final))) + sys.stderr.write('Reducing number of merge operations by {0}\n'.format(len(uniq_char_internal) + len(uniq_char_final))) + num_symbols -= len(uniq_char_internal) + len(uniq_char_final) + + # threshold is inspired by Zipfian assumption, but should only affect speed + threshold = max(stats.values()) / 10 + for i in tqdm(range(num_symbols)): + if stats: + most_frequent = max(stats, key=lambda x: (stats[x], x)) + + # we probably missed the best pair because of pruning; go back to full statistics + if not stats or (i and stats[most_frequent] < threshold): + prune_stats(stats, big_stats, threshold) + stats = copy.deepcopy(big_stats) + most_frequent = max(stats, key=lambda x: (stats[x], x)) + # threshold is inspired by Zipfian assumption, but should only affect speed + threshold = 
stats[most_frequent] * i/(i+10000.0) + prune_stats(stats, big_stats, threshold) + + if stats[most_frequent] < min_frequency: + sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency)) + break + + if verbose: + sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent])) + outfile.write('{0} {1}\n'.format(*most_frequent)) + changes = replace_pair(most_frequent, sorted_vocab, indices) + update_pair_statistics(most_frequent, changes, stats, indices) + stats[most_frequent] = 0 + if not i % 100: + prune_stats(stats, big_stats, threshold) + + +if __name__ == '__main__': + + currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + newdir = os.path.join(currentdir, 'subword_nmt') + if os.path.isdir(newdir): + warnings.simplefilter('default') + warnings.warn( + "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir), + DeprecationWarning + ) + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) + + parser = create_parser() + args = parser.parse_args() + + if args.num_workers <= 0: + args.num_workers = cpu_count() + + if sys.version_info < (3, 0) and args.num_workers > 1: + args.num_workers = 1 + warnings.warn("Parallel mode is only supported in Python3. 
Using 1 processor instead.") + + # read/write files as UTF-8 + if args.input.name != '': + args.input = codecs.open(args.input.name, encoding='utf-8') + if args.output.name != '': + args.output = codecs.open(args.output.name, 'w', encoding='utf-8') + + learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input, total_symbols=args.total_symbols, num_workers=args.num_workers) diff --git a/subword-nmt/subword_nmt/learn_joint_bpe_and_vocab.py b/subword-nmt/subword_nmt/learn_joint_bpe_and_vocab.py new file mode 100755 index 0000000000000000000000000000000000000000..d75ff3d3f687c4f4776cc0246b05e3f6765374b2 --- /dev/null +++ b/subword-nmt/subword_nmt/learn_joint_bpe_and_vocab.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. +This script learns BPE jointly on a concatenation of a list of texts (typically the source and target side of a parallel corpus, +applies the learned operation to each and (optionally) returns the resulting vocabulary of each text. +The vocabulary can be used in apply_bpe.py to avoid producing symbols that are rare or OOV in a training text. + +Reference: +Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. +Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. +""" + +from __future__ import unicode_literals + +import sys +import os +import inspect +import codecs +import argparse +import tempfile +import warnings +from collections import Counter +from multiprocessing import cpu_count + +#hack to get imports working if running this as a script, or within a package +if __name__ == '__main__': + import learn_bpe + import apply_bpe +else: + from . import learn_bpe + from . 
import apply_bpe + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('learn-joint-bpe-and-vocab', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), required=True, nargs = '+', + metavar='PATH', + help="Input texts (multiple allowed).") + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), required=True, + metavar='PATH', + help="Output file for BPE codes.") + parser.add_argument( + '--symbols', '-s', type=int, default=10000, + help="Create this many new symbols (each representing a character n-gram) (default: %(default)s)") + parser.add_argument( + '--separator', type=str, default='@@', metavar='STR', + help="Separator between non-final subword units (default: '%(default)s')") + parser.add_argument( + '--write-vocabulary', type=argparse.FileType('w'), required=True, nargs = '+', default=None, + metavar='PATH', dest='vocab', + help='Write to these vocabulary files after applying BPE. One per input text. Used for filtering in apply_bpe.py') + parser.add_argument( + '--min-frequency', type=int, default=2, metavar='FREQ', + help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)') + parser.add_argument( + '--total-symbols', '-t', action="store_true", + help="subtract number of characters from the symbols to be generated (so that '--symbols' becomes an estimate for the total number of symbols needed to encode text).") + parser.add_argument( + '--num-workers', type=int, default=1, + help="Number of processors to process texts, only supported in Python3. If -1, set `multiprocessing.cpu_count()`. 
(default: %(default)s)") + parser.add_argument( + '--verbose', '-v', action="store_true", + help="verbose mode.") + + return parser + +def learn_joint_bpe_and_vocab(args): + + if args.vocab and len(args.input) != len(args.vocab): + sys.stderr.write('Error: number of input files and vocabulary files must match\n') + sys.exit(1) + + # read/write files as UTF-8 + args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input] + args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab] + + # get combined vocabulary of all input texts + full_vocab = Counter() + for f in args.input: + full_vocab += learn_bpe.get_vocabulary(f, num_workers=args.num_workers) + f.seek(0) + + vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()] + + # learn BPE on combined vocabulary + with codecs.open(args.output.name, 'w', encoding='UTF-8') as output: + learn_bpe.learn_bpe(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True, total_symbols=args.total_symbols) + + with codecs.open(args.output.name, encoding='UTF-8') as codes: + bpe = apply_bpe.BPE(codes, separator=args.separator) + + # apply BPE to each training corpus and get vocabulary + for train_file, vocab_file in zip(args.input, args.vocab): + + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.close() + + tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8') + + train_file.seek(0) + bpe.process_lines(train_file.name, tmpout, num_workers=args.num_workers) + + tmpout.close() + tmpin = codecs.open(tmp.name, encoding='UTF-8') + + vocab = learn_bpe.get_vocabulary(tmpin, num_workers=args.num_workers) + tmpin.close() + os.remove(tmp.name) + + for key, freq in sorted(vocab.items(), key=lambda x: x[1], reverse=True): + vocab_file.write("{0} {1}\n".format(key, freq)) + vocab_file.close() + + +if __name__ == '__main__': + + currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) + newdir = os.path.join(currentdir, 'subword_nmt') 
+ if os.path.isdir(newdir): + warnings.simplefilter('default') + warnings.warn( + "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir), + DeprecationWarning + ) + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) + + parser = create_parser() + args = parser.parse_args() + + if args.num_workers <= 0: + args.num_workers = cpu_count() + + if sys.version_info < (3, 0): + args.separator = args.separator.decode('UTF-8') + if args.num_workers > 1: + args.num_workers = 1 + warnings.warn("Parallel mode is only supported in Python3. 
Using 1 processor instead.") + + assert(len(args.input) == len(args.vocab)) + + learn_joint_bpe_and_vocab(args) diff --git a/subword-nmt/subword_nmt/segment_char_ngrams.py b/subword-nmt/subword_nmt/segment_char_ngrams.py new file mode 100755 index 0000000000000000000000000000000000000000..8d94bc7a36eb3163271e95e167190d7423564308 --- /dev/null +++ b/subword-nmt/subword_nmt/segment_char_ngrams.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich + +from __future__ import unicode_literals, division + +import sys +import codecs +import argparse + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +def create_parser(subparsers=None): + + if subparsers: + parser = subparsers.add_parser('segment-char-ngrams', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="segment rare words into character n-grams") + else: + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="segment rare words into character n-grams") + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, + metavar='PATH', + help="Input file (default: standard input).") + parser.add_argument( + '--vocab', type=argparse.FileType('r'), metavar='PATH', + required=True, + help="Vocabulary file.") + parser.add_argument( + '--shortlist', type=int, metavar='INT', default=0, + help="do not segment INT most frequent words in vocabulary (default: '%(default)s')).") + parser.add_argument( + '-n', type=int, metavar='INT', default=2, + help="segment rare words into character n-grams of size INT (default: '%(default)s')).") + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, + metavar='PATH', + help="Output file (default: standard output)") + parser.add_argument( + '--separator', '-s', type=str, default='@@', metavar='STR', + help="Separator between non-final subword units (default: '%(default)s'))") + + return 
parser + +def segment_char_ngrams(args): + + vocab = [line.split()[0] for line in args.vocab if len(line.split()) == 2] + vocab = dict((y,x) for (x,y) in enumerate(vocab)) + + for line in args.input: + for word in line.split(): + if word not in vocab or vocab[word] > args.shortlist: + i = 0 + while i*args.n < len(word): + args.output.write(word[i*args.n:i*args.n+args.n]) + i += 1 + if i*args.n < len(word): + args.output.write(args.separator) + args.output.write(' ') + else: + args.output.write(word + ' ') + args.output.write('\n') + + +if __name__ == '__main__': + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) + + parser = create_parser() + args = parser.parse_args() + + if sys.version_info < (3, 0): + args.separator = args.separator.decode('UTF-8') + + # read/write files as UTF-8 + args.vocab = codecs.open(args.vocab.name, encoding='utf-8') + if args.input.name != '': + args.input = codecs.open(args.input.name, encoding='utf-8') + if args.output.name != '': + args.output = codecs.open(args.output.name, 'w', encoding='utf-8') + + segment_char_ngrams(args) \ No newline at end of file diff --git a/subword-nmt/subword_nmt/subword_nmt.py b/subword-nmt/subword_nmt/subword_nmt.py new file mode 100755 index 0000000000000000000000000000000000000000..29104f4d8029524a80d6fa649b69a8acec0b8abc --- /dev/null +++ b/subword-nmt/subword_nmt/subword_nmt.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import io +import sys +import codecs +import argparse + +from .learn_bpe import learn_bpe +from .apply_bpe import BPE, read_vocabulary +from .get_vocab import get_vocab +from .learn_joint_bpe_and_vocab import 
# hack for python2/3 compatibility
argparse.open = io.open


def _reopen_io_as_utf8(args):
    """Re-open path-based ``args.input`` / ``args.output`` as UTF-8 streams.

    Streams named after the standard streams are left untouched: they cannot
    be opened by name, and this module wraps them as UTF-8 at import time.
    """
    if args.input.name != '<stdin>':
        args.input = codecs.open(args.input.name, encoding='utf-8')
    if args.output.name != '<stdout>':
        args.output = codecs.open(args.output.name, 'w', encoding='utf-8')


def main():
    """Entry point of the ``subword-nmt`` command-line tool.

    Registers the ``learn-bpe``, ``apply-bpe``, ``get-vocab`` and
    ``learn-joint-bpe-and-vocab`` sub-commands, parses ``sys.argv`` and runs
    the selected command.

    Raises:
        Exception: if the parsed command is none of the registered ones
            (this includes running the tool without any command).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="subword-nmt: unsupervised word segmentation for neural machine translation and text generation ")
    subparsers = parser.add_subparsers(dest='command',
                                       help="""command to run. Run one of the commands with '-h' for more info.

learn-bpe: learn BPE merge operations on input text.
apply-bpe: apply given BPE operations to input text.
get-vocab: extract vocabulary and word frequencies from input text.
learn-joint-bpe-and-vocab: executes recommended workflow for joint BPE.""")

    # each factory registers its sub-command on ``subparsers``
    create_learn_bpe_parser(subparsers)
    create_apply_bpe_parser(subparsers)
    create_get_vocab_parser(subparsers)
    create_learn_joint_bpe_and_vocab_parser(subparsers)

    args = parser.parse_args()

    if args.command == 'learn-bpe':
        # read/write files as UTF-8
        _reopen_io_as_utf8(args)

        learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose,
                  is_dict=args.dict_input, total_symbols=args.total_symbols)
    elif args.command == 'apply-bpe':
        # read/write files as UTF-8
        args.codes = codecs.open(args.codes.name, encoding='utf-8')
        _reopen_io_as_utf8(args)

        if args.vocabulary:
            args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8')
            vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
        else:
            vocabulary = None

        if sys.version_info < (3, 0):
            args.separator = args.separator.decode('UTF-8')
            if args.glossaries:
                args.glossaries = [g.decode('UTF-8') for g in args.glossaries]

        bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)

        for line in args.input:
            args.output.write(bpe.process_line(line, args.dropout))

    elif args.command == 'get-vocab':
        _reopen_io_as_utf8(args)
        get_vocab(args.input, args.output)
    elif args.command == 'learn-joint-bpe-and-vocab':
        learn_joint_bpe_and_vocab(args)
        # NOTE(review): this Python 2 decode runs after the call above, so it
        # cannot influence it -- confirm whether it was meant to come first.
        if sys.version_info < (3, 0):
            args.separator = args.separator.decode('UTF-8')
    else:
        raise Exception('Invalid command provided')


# python 2/3 compatibility
# Runs at import time: the standard streams are replaced by UTF-8 codec
# wrappers before main() builds its parsers.
if sys.version_info < (3, 0):
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
else:
    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
    sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)
a/subword-nmt/subword_nmt/tests/data/bpe.ref b/subword-nmt/subword_nmt/tests/data/bpe.ref new file mode 100644 index 0000000000000000000000000000000000000000..4c9b1c8f9be525d2ce29e8f931f542523eec0328 --- /dev/null +++ b/subword-nmt/subword_nmt/tests/data/bpe.ref @@ -0,0 +1,1001 @@ +#version: 0.2 +t h +th e +i n +a n +e r +r e +o r +t i +a r +an d +e n +o f +o u +o n +t o +o n +i s +e d +in g +a l +i n +e r +i t +s t +e s +a t +o r +a t +r o +i c +o m +e s +i l +e n +o u +a s +a s +e l +u s +a n +e c +i s +o s +a c +ti on +y ou +o t +f or +w h +i t +a l +v e +p l +a p +s h +o l +d i +th e +q u +th at +e t +m a +ar e +al l +th is +c om +c h +r i +u n +en t +b e +b l +n o +a m +e v +c e +@ - +@- @ +f or +s i +u r +l o +it h +er s +t s +ou r +w ith +re s +h a +p ro +qu ot +quot ; +& quot; +e m +ti on +a d +l y +e t +b e +or d +c on +er e +i g +n e +a y +ro m +f rom +b u +n d +ap os +& apos +o w +i r +w or +b y +a tion +o p +&apos ; +f f +t r +l i +s u +y our +no t +the y +ic h +s p +c an +ou t +e x +e ar +l d +d e +v er +t a +g e +wh ich +d s +bl e +p ar +on e +a y +w il +in g +d at +t er +t er +ha ve +sh all +tion s +m an +it y +d e +wil l +p a +o d +& # +th er +c l +. . +.. . 
+u l +es s +0 0 +i f +a b +h e +ou ld +i r +c h +t h +r a +m er +1 2 +p u +A nd +un to +s it +res s +p e +h t +en ts +4 ; +12 4; +&# 124; +ing s +h ol +v er +m e +w e +s o +re e +m y +u p +k e +i d +at ed +us e +m ent +' s +es t +a r +P ress +ou n +h o +for e +f il +d ow +al l +at e +t ed +p er +h is +er e +as e +the ir +p or +I C +th ere +t o +is h +2 00 +r ou +m e +ec om +h i +as t +wor k +w as +sit es +f t +u m +in e +a ti +ri bu +or e +g l +c at +a ble +IC E +ICE cat +g i +am e +ac c +u d +st r +s o +pl e +mer ce +k s +g o +ev en +c re +y st +us t +or s +ic e +h as +ecom merce +c i +no w +a v +m ents +a d +us ing +s t +man y +ma y +k ing +ev er +ere fore +di st +y e +u t +ti me +s e +re n +os e +o ther +m ore +e st +s er +s el +re c +p h +lo c +l ic +in ce +en s +bu t +ar y +an t +G od +s yst +s om +l e +f ree +dist ribu +an s +a g +W ord +p ur +en t +d o +ar t +al so +w e +v i +s a +ri g +ne w +l and +b o +w ere +u c +n ing +m ig +i c +f ir +es e +em s +e l +d o +b r +as ed +ab out +E n +th ings +lic ens +it s +i m +g r +dat a +y e +up on +s ti +or d +in s +con t +w i +us ed +si on +p os +ou nd +l a +f e +es s +com m +L ord +1 9 +the m +th ese +on ly +is h +in cl +et c +el s +el l +c ol +c o +ac h +a m +a il +u l +th ou +ou r +n lo +in to +i es +hi m +dow nlo +di z +d er +al ly +ac e +Word Press +som e +s ince +re m +pe o +peo ple +pa in +os t +on s +n o +i ma +ho w +for ma +en d +ad ing +a re +S pain +O p +u s +por t +ou s +in ter +ha d +h ere +en ti +be en +ay s +ur e +t e +sh ould +ser v +p re +l ay +g re +ff er +b ased +ap art +a diz +C h +C adiz +w ould +w are +ver y +u p +syst ems +o st +loc ated +incl ud +hol d +gl ish +forma tion +f in +en d +d ev +ar k +Q u +Op en +En glish +wh o +u ro +t ing +su p +o re +n ess +in formation +g et +f i +ec t +b ec +ar d +an ds +an ce +E uro +u e +ord er +id ay +ic tion +ft ware +f ul +d is +at h +a tions +L u +wh en +w ay +t e +sh e +pur ch +on g +m ust +fir st +fil e +em b +e p +e di +an g +ye a +t ors +st ati +stati 
sti +re s +purch ase +m ost +m en +m an +l a +it e +i l +h erefore +fil es +f t +f a +an c +I n +w ell +ti c +s ec +par is +p res +o ff +l in +ima ge +iction ary +i z +h op +h el +h e +g h +f l +e d +com paris +a use +P S +A S +v al +statisti c +so ftware +she et +o k +o g +m is +j o +hop s +hol iday +h ear +go od +g o +f e +es hops +en ce +e i +downlo ading +distribu tors +di ffer +d ay +comparis on +an y +am il +a ge +a f +P s +P H +N A +AS Ps +6 8 +v ing +th y +su ch +pu bl +ord ing +l ine +i d +gre at +for m +f ul +ever y +el y +d et +d es +ch o +c oun +c ity +be hold +all ed +W herefore +PH P +P r +wor ld +wi th +wh at +w r +w at +tion al +si m +ren t +p r +ord s +o b +no w +mig ht +m u +f amil +e as +d ing +bec ause +ark X +arkX Press +acc ording +a u +Qu arkXPress +M edi +C om +0 0 +w s +us ers +ti es +th ing +se e +p ri +o m +o c +l l +k e +ic es +em ent +ec i +e p +e m +d uc +d er +ar i +am p +af ter +Medi a +' t +ver sion +v es +u res +u m +ta r +rig ht +rig h +par t +ow n +or y +o ver +o s +o k +mu ch +k now +in st +ig h +g en +ex c +differ ent +d en +ap p +ans a +al lo +S tar +Lu f +L NA +D LNA +1 9 +y p +w ords +v is +v en +u r +th ansa +si d +sel f +re n +pu ter +pl o +p ow +ot h +n i +licens e +li ke +l ear +k now +in ut +il e +f ore +et s +emb er +d ec +cont ent +com e +c alled +av ail +ar ound +an d +O ff +Luf thansa +F or +A l +w o +up dat +u t +u g +ti ve +ta ke +str uc +sid enti +s et +s e +s ame +rec ei +re ad +pro duc +pl ay +p dat +ou s +o l +n al +m at +ish ed +ir it +in ed +i um +h ot +g in +g ht +f un +com pl +c ur +avail able +a ir +W in +U pdat +wor ks +with out +un g +tr ans +th ose +th an +sp on +sp eci +pro c +pa ge +on al +o ds +ma de +m es +includ ed +in i +ig n +fe at +el l +ec ts +ear s +e w +e Star +dow s +be fore +b et +at or +an s +al s +Win dows +Updat eStar +F ra +ä sidenti +äsidenti n +ä ft +äft s +äfts ord +äftsord n +äftsordn ung +z ur +v id +um b +u plo +th rou +t yp +t wo +spon s +si ble +s m +rem ium +re p +re gi +r e 
+pow er +per s +p an +or ing +op en +o w +n ec +mig al +is t +ha ving +h ath +gi ven +ev er +et h +es ch +esch äftsordnung +en ter +e a +con ta +com man +ch il +c or +c ap +b oth +ati ve +apart ments +apart ment +ad a +S er +Pr äsidentin +PS D +H ot +G eschäftsordnung +Fra u +For migal +C al +2 . +1 1 +y ears +wh erefore +u st +throu gh +th en +t l +t en +sh al +shal t +s ou +res t +recei ve +r u +ot ter +mer ci +ma ke +m s +m o +la w +k et +j ust +ic k +g rou +fun c +fore ver +fin d +f ace +ear ch +e ds +e al +distribu tion +d ays +comman d +chil d +br ands +bl ess +be gin +am ong +am es +ac t +a in +a bl +T h +P remium +D e +wat ers +v o +u es +ti v +t y +t ur +sup port +spons oring +r on +r an +qu i +pl ug +par t +p as +otter y +n or +n er +n ed +m ine +l ast +it ed +inut e +in d +il li +ic ation +gen er +g es +g e +g al +famil y +f ol +f f +er y +er nal +el i +d ra +cho ose +child ren +c at +be ach +as es +Off ers +M inute +L e +L ast +G ods +G er +D ictionary +Cal a +B o +6 3 +1 5 +wr it +wh ile +w ar +val ue +v ed +v ari +u al +tr an +to ol +t ri +t en +st ing +s ed +s ay +re d +pl e +on g +ol d +n ers +n a +merci al +me di +m on +lo ok +l et +j ada +ic i +hel p +feat ures +en tr +en c +eas y +ear th +d on +con nec +ch ar +c ould +be ing +b ac +ar k +amp ; +a in +P y +H ost +A n +2 0 +& amp; +ye ar +w ing +w ant +w a +v ers +us er +ur ing +updat es +ti mes +t re +t ly +syst em +sp ea +sit e +sim pl +sa id +s k +s et +re v +re l +re f +pu t +pro g +pl ace +pe an +p ho +pho to +p at +oun t +ot e +or t +og y +ne y +ne es +ne eds +ne ed +n umb +n ame +lay ers +l l +k en +ic al +i a +ful l +fi ed +fe w +et y +est s +es si +dow n +do m +det ail +dat ab +d ictionary +con f +com mercial +c a +b re diff --git a/subword-nmt/subword_nmt/tests/data/corpus.bpe.ref.en b/subword-nmt/subword_nmt/tests/data/corpus.bpe.ref.en new file mode 100644 index 0000000000000000000000000000000000000000..dcc900c2f9e552d04b1151b5f8183b161e45e5a8 --- /dev/null +++ 
b/subword-nmt/subword_nmt/tests/data/corpus.bpe.ref.en @@ -0,0 +1,1015 @@ +ir@@ on c@@ ement is a read@@ y for use pa@@ st@@ e which is la@@ id as a fil@@ let by pu@@ t@@ ty k@@ ni@@ fe or f@@ ing@@ er in the m@@ ould ed@@ ges ( cor@@ ners ) of the st@@ e@@ el ing@@ o@@ t m@@ ould . +ir@@ on c@@ ement pro@@ t@@ ects the ing@@ o@@ t ag@@ ain@@ st the ho@@ t , ab@@ r@@ as@@ i@@ ve st@@ e@@ el c@@ a@@ sting proc@@ ess . +a fir@@ e re@@ st@@ ant re@@ pa@@ ir c@@ ement for fir@@ e pl@@ ac@@ es , o@@ v@@ en@@ s , open fi@@ re@@ pl@@ ac@@ es etc . +con@@ struc@@ tion and re@@ pa@@ ir of h@@ igh@@ w@@ ays and ... +an an@@ n@@ oun@@ c@@ ement must be commercial char@@ ac@@ ter . +go@@ ods and serv@@ ices ad@@ v@@ anc@@ ement through the P@@ .@@ O@@ .@@ Bo@@ x system is N@@ O@@ T A@@ L@@ L@@ O@@ W@@ E@@ D . +d@@ eli@@ ver@@ ies ( sp@@ am ) and other im@@ pro@@ p@@ er information d@@ el@@ et@@ ed . +trans@@ l@@ ator In@@ ter@@ n@@ et is a T@@ o@@ ol@@ b@@ ar for M@@ S In@@ ter@@ n@@ et E@@ x@@ pl@@ ore@@ r . +it allo@@ ws you to trans@@ l@@ ate in re@@ al time any we@@ b pas@@ ge from one l@@ ang@@ u@@ age to an@@ other . +you only have to sel@@ ect l@@ ang@@ u@@ ag@@ es and T@@ I do@@ es all the work for you ! a@@ ut@@ om@@ ati@@ c dictionary updates ..@@ .@@ . +this software is writ@@ ten in order to in@@ cre@@ ase your English ke@@ y@@ bo@@ ard typ@@ ing sp@@ e@@ ed , through te@@ ac@@ h@@ ing the b@@ as@@ ic@@ s of how to put your h@@ and on to the ke@@ y@@ bo@@ ard and gi@@ ve some tr@@ ain@@ ing ex@@ am@@ pl@@ es . +e@@ ach l@@ ess@@ on te@@ ac@@ h@@ es some ex@@ tr@@ a ke@@ y@@ s , and there is also a pr@@ ac@@ ti@@ ce , if it is ch@@ os@@ en , one can pr@@ ac@@ ti@@ ce the pre@@ vi@@ ous ke@@ y@@ s lear@@ ned through the pre@@ vi@@ ous l@@ ess@@ ons . the words ch@@ os@@ en in the pr@@ ac@@ ti@@ ce are m@@ ost@@ ly me@@ an@@ ing@@ ful and rel@@ at@@ es to the t@@ ou@@ gh ke@@ y@@ s ... 
+are you one of m@@ illi@@ ons out there who are tr@@ y@@ ing to lear@@ n fore@@ ign l@@ ang@@ u@@ age , but n@@ ever have en@@ ou@@ gh time ? +get V@@ T@@ e@@ ac@@ h@@ er , a s@@ cre@@ ens@@ a@@ ver that dis@@ pl@@ ays words and ph@@ r@@ ases you are tr@@ y@@ ing to lear@@ n and their trans@@ l@@ ation . +la@@ un@@ ch it d@@ uring your off@@ ice bre@@ a@@ k and ad@@ d new words to your v@@ oc@@ a@@ bu@@ l@@ ary while si@@ p@@ p@@ ing ho@@ t co@@ ff@@ e@@ e and ch@@ e@@ wing a s@@ and@@ w@@ ich . +this is a one time char@@ ge and you will n@@ ever be re@@ b@@ il@@ l@@ ed ! +you will receive di@@ rec@@ t acc@@ ess to a regi@@ str@@ ation co@@ de a@@ ut@@ om@@ ati@@ c@@ ally after you place your order . the enti@@ re proc@@ ess of regi@@ str@@ ation and cl@@ e@@ an@@ ing your system should take l@@ ess than 5 m@@ inut@@ es . +you will also receive a con@@ fir@@ ma@@ tion e@@ ma@@ il with your order information ( regi@@ str@@ ation co@@ de , order # , etc ) ... +the English @-@ Ger@@ man Pr@@ o Dictionary conta@@ ins over 5@@ 0@@ ,@@ 8@@ 1@@ 3 words and 2@@ 3@@ ,@@ 3@@ 4@@ 3 ar@@ ti@@ cl@@ es pres@@ en@@ ted in r@@ ich @-@ t@@ ex@@ t form@@ at ar@@ ti@@ cl@@ es . +the dictionary is a w@@ on@@ der@@ ful b@@ il@@ ing@@ ual ref@@ er@@ ence tool and can be used both by begin@@ ners and ad@@ v@@ anc@@ ed lear@@ ners ... +the M@@ S@@ D@@ ic@@ t English @-@ S@@ pan@@ ish Pr@@ o Dictionary conta@@ ins over 3@@ 8@@ ,@@ 00@@ 0 ent@@ ri@@ es in 19@@ ,@@ 8@@ 00 wor@@ d ar@@ ti@@ cl@@ es , pres@@ en@@ ted in r@@ ich @-@ t@@ ex@@ t form@@ at . +the dictionary is a w@@ on@@ der@@ ful b@@ il@@ ing@@ ual ref@@ er@@ ence tool and can be used both by begin@@ ners and ad@@ v@@ anc@@ ed lear@@ ners of English and S@@ pan@@ ish ... 
+p@@ oc@@ ket O@@ x@@ for@@ d English Dictionary F@@ ir@@ st P@@ u@@ bl@@ ished in 19@@ 2@@ 4 T@@ his is a re@@ is@@ su@@ e of the n@@ in@@ th edi@@ tion of the world 's l@@ ong@@ est @-@ est@@ abl@@ ished p@@ oc@@ ket English dictionary by O@@ x@@ for@@ d U@@ ni@@ ver@@ s@@ ity Press . +it is one of the new gener@@ ation O@@ x@@ for@@ d d@@ iction@@ ar@@ ies der@@ i@@ ved from the datab@@ ase of the h@@ igh@@ ly ac@@ cl@@ a@@ im@@ ed N@@ ew O@@ x@@ for@@ d Dictionary of English and is par@@ tic@@ ul@@ ar@@ ly user f@@ ri@@ end@@ ly with its el@@ e@@ g@@ ant open des@@ ign , with different el@@ em@@ ents st@@ art@@ ing on new lin@@ es ... +Word@@ B@@ an@@ k@@ er is a un@@ i@@ qu@@ e and f@@ u@@ n me@@ th@@ od of hel@@ p@@ ing you to lear@@ n a fore@@ ign l@@ ang@@ u@@ age . +ra@@ ther than bo@@ g you down with compl@@ ic@@ ated gr@@ am@@ ma@@ r it de@@ als only with bu@@ il@@ ding a v@@ oc@@ a@@ bu@@ l@@ ary . +tr@@ ou@@ ble m@@ em@@ or@@ is@@ ing new words or ph@@ r@@ ases ? +Word@@ B@@ an@@ k@@ er 's " V@@ is@@ ual C@@ l@@ ue " me@@ th@@ od of t@@ est@@ ing me@@ ans you lear@@ n without even re@@ al@@ is@@ ing it . +can be used by F@@ ren@@ ch people lear@@ ning English or English people lear@@ ning English ... +the English Pr@@ o Dictionary for Ser@@ ies 6@@ 0 S@@ m@@ art@@ ph@@ on@@ es is an ex@@ ten@@ si@@ ve dictionary and th@@ es@@ a@@ ur@@ us with over 9@@ 0@@ ,@@ 00@@ 0 words , ge@@ are@@ d to the needs of a wi@@ de ran@@ ge of us@@ er@@ s@@ - from the st@@ ud@@ ent at inter@@ medi@@ ate l@@ ev@@ el and ab@@ o@@ ve to the en@@ th@@ us@@ i@@ as@@ ti@@ c t@@ our@@ ist , or n@@ ative English spea@@ king b@@ us@@ in@@ ess pro@@ f@@ essi@@ onal ... +Word@@ B@@ an@@ k@@ er is a un@@ i@@ qu@@ e and f@@ u@@ n me@@ th@@ od of hel@@ p@@ ing you to lear@@ n a fore@@ ign l@@ ang@@ u@@ age . ra@@ ther than bo@@ g you down with compl@@ ic@@ ated gr@@ am@@ ma@@ r it de@@ als only with bu@@ il@@ ding a v@@ oc@@ a@@ bu@@ l@@ ary . 
+tr@@ ou@@ ble m@@ em@@ or@@ is@@ ing new words or ph@@ r@@ ases ? Word@@ B@@ an@@ k@@ er 's " V@@ is@@ ual C@@ l@@ ue " me@@ th@@ od of t@@ est@@ ing me@@ ans you lear@@ n without even re@@ al@@ is@@ ing it . +can be used by I@@ t@@ al@@ i@@ an people lear@@ ning English or English people lear@@ ning I@@ t@@ al@@ i@@ an ... +this line of L@@ ing@@ vo@@ S@@ o@@ ft English Al@@ b@@ an@@ i@@ an D@@ iction@@ ar@@ ies for Windows br@@ ings you acc@@ ur@@ ate and pro@@ m@@ p@@ t two @-@ way wor@@ d trans@@ la@@ tions , wr@@ app@@ ed in a user @-@ f@@ ri@@ end@@ ly inter@@ face with con@@ v@@ en@@ i@@ ent s@@ earch op@@ tions . +they are bu@@ il@@ t on updat@@ ed l@@ ing@@ u@@ is@@ ti@@ c datab@@ ases and come in f@@ our ver@@ si@@ ons des@@ ig@@ ned for different typ@@ es of users . L@@ ing@@ vo@@ S@@ o@@ ft Dictionary B@@ as@@ ic English Al@@ b@@ an@@ i@@ an is a com@@ p@@ act , f@@ ast and easy to use dictionary ... +this s@@ cre@@ en sa@@ ver dis@@ pl@@ ays be@@ au@@ ti@@ ful pa@@ in@@ t@@ ings of qu@@ ain@@ t English c@@ ot@@ ta@@ ges . +it ma@@ k@@ es a w@@ on@@ der@@ ful gi@@ ft for a family m@@ ember or f@@ ri@@ end . +it features a very st@@ able s@@ cre@@ en sa@@ ver en@@ g@@ ine and s@@ ever@@ al different user @-@ de@@ fin@@ able features . the user inter@@ face is at@@ tr@@ ac@@ tive , in@@ t@@ u@@ i@@ tive , and easy to use . +the Windows 9@@ 5 / 9@@ 8 / N@@ T D@@ es@@ k@@ to@@ p P@@ ro@@ per@@ ties di@@ al@@ o@@ g dis@@ pl@@ ays a li@@ ve pre@@ vi@@ ew of the s@@ cre@@ en sa@@ ver ... +the English Pr@@ o Dictionary for P@@ oc@@ ket M@@ S@@ D@@ ic@@ t V@@ i@@ e@@ w@@ er is an ex@@ ten@@ si@@ ve dictionary and tre@@ as@@ ures with over 9@@ 0@@ ,@@ 00@@ 0 words , ge@@ are@@ d to the needs of a wi@@ de ran@@ ge of us@@ er@@ s@@ - from the st@@ ud@@ ent at inter@@ medi@@ ate l@@ ev@@ el and ab@@ o@@ ve to the en@@ th@@ us@@ i@@ as@@ ti@@ c t@@ our@@ ist , or n@@ ative English spea@@ king b@@ us@@ in@@ ess pro@@ f@@ essi@@ onal ... 
+p@@ oc@@ ket O@@ x@@ for@@ d English Dictionary F@@ ir@@ st P@@ u@@ bl@@ ished in 19@@ 2@@ 4 T@@ his is a re@@ is@@ su@@ e of the n@@ in@@ th edi@@ tion of the world 's l@@ ong@@ est @-@ est@@ abl@@ ished p@@ oc@@ ket English dictionary by O@@ x@@ for@@ d U@@ ni@@ ver@@ s@@ ity Press . +it is one of the new gener@@ ation O@@ x@@ for@@ d d@@ iction@@ ar@@ ies der@@ i@@ ved from the datab@@ ase of the h@@ igh@@ ly ac@@ cl@@ a@@ im@@ ed N@@ ew O@@ x@@ for@@ d Dictionary of English and is par@@ tic@@ ul@@ ar@@ ly user f@@ ri@@ end@@ ly with its el@@ e@@ g@@ ant open des@@ ign , with different el@@ em@@ ents st@@ art@@ ing on new lin@@ es ... +p@@ oc@@ ket O@@ x@@ for@@ d English Dictionary F@@ ir@@ st P@@ u@@ bl@@ ished in 19@@ 2@@ 4 T@@ his is a P@@ oc@@ ket P@@ C re@@ is@@ su@@ e of the n@@ in@@ th edi@@ tion of the world 's l@@ ong@@ est @-@ est@@ abl@@ ished p@@ oc@@ ket English dictionary by O@@ x@@ for@@ d U@@ ni@@ ver@@ s@@ ity Press ... +the English P@@ h@@ r@@ ases Dictionary for P@@ oc@@ ket M@@ S@@ D@@ ic@@ t V@@ i@@ e@@ w@@ er includ@@ es vari@@ ety of ph@@ r@@ ases , col@@ loc@@ ations and comm@@ on i@@ di@@ om@@ s . +the datab@@ ase pro@@ vid@@ es 1@@ 1@@ ,@@ 1@@ 0@@ 7 de@@ fin@@ i@@ tions and over 9@@ ,@@ 8@@ 00 ph@@ r@@ ases . +the dictionary works as an ad@@ d @-@ on file for P@@ oc@@ ket M@@ S@@ D@@ ic@@ t V@@ i@@ e@@ w@@ er and is ful@@ ly com@@ pa@@ ti@@ ble with all the us@@ e@@ ful func@@ tion@@ al@@ i@@ ties of the vi@@ e@@ w@@ er . the ph@@ r@@ ases are ch@@ os@@ en among most comm@@ only used A@@ mer@@ ic@@ an and B@@ ri@@ ti@@ s@@ h English col@@ loc@@ ations and ph@@ r@@ ases ... +feat@@ uring 6@@ 5@@ ,@@ 00@@ 0 ent@@ ri@@ es , this is a f@@ ast and e@@ ff@@ ici@@ ent ap@@ pl@@ ication , which will pro@@ vi@@ de you with st@@ ud@@ y mat@@ er@@ i@@ al and im@@ medi@@ ate pr@@ ac@@ tic@@ al help when f@@ ac@@ ed with a comm@@ un@@ ication ch@@ all@@ en@@ ge re@@ qu@@ ir@@ ing an inst@@ ant res@@ p@@ on@@ se ... 
+the English @-@ Ger@@ man Pr@@ o Dictionary conta@@ ins over 5@@ 0@@ ,@@ 8@@ 1@@ 3 words and 2@@ 3@@ ,@@ 3@@ 4@@ 3 ar@@ ti@@ cl@@ es pres@@ en@@ ted in r@@ ich @-@ t@@ ex@@ t form@@ at ar@@ ti@@ cl@@ es . +the dictionary is a w@@ on@@ der@@ ful b@@ il@@ ing@@ ual ref@@ er@@ ence tool and can be used both by begin@@ ners and ad@@ v@@ anc@@ ed lear@@ ners ... +" the p@@ ot@@ enti@@ al produc@@ tiv@@ ity g@@ a@@ ins should be con@@ si@@ der@@ able . +to@@ day , QuarkXPress ® 8 has ti@@ g@@ h@@ ter in@@ te@@ gr@@ ation with P@@ hot@@ os@@ ho@@ p ® and I@@ ll@@ u@@ str@@ ator ® than ever before , and through st@@ and@@ ar@@ ds like H@@ T@@ M@@ L and C@@ S@@ S , QuarkXPress users can publ@@ ish ac@@ ro@@ s@@ s medi@@ a both in@@ de@@ p@@ end@@ ent@@ ly and al@@ ong@@ si@@ de A@@ do@@ be ® C@@ re@@ ative S@@ u@@ ite ® ap@@ pl@@ ic@@ ations like A@@ do@@ be F@@ l@@ as@@ h ® ( S@@ W@@ F ) and A@@ do@@ be D@@ re@@ am@@ we@@ a@@ ver ® . +here , you '@@ ll find out how C@@ re@@ ative S@@ u@@ ite users can get the b@@ est pos@@ sible inter@@ ac@@ tion with QuarkXPress . +you '@@ ll be s@@ ur@@ pr@@ is@@ ed how easy Qu@@ ark has made it to un@@ loc@@ k the full p@@ ot@@ enti@@ al of all your des@@ ign software . +QuarkXPress 8 is con@@ si@@ der@@ ed by many to have the b@@ est in@@ te@@ gr@@ ation with P@@ hot@@ os@@ ho@@ p 's PSD file form@@ at of any lay@@ out tool available to@@ day . +in this sec@@ tion we '@@ ll ex@@ pl@@ ain when you should use the PSD form@@ at for your ima@@ ges and how to get the most out of them . +for ex@@ am@@ ple , you may have m@@ ul@@ ti@@ ple layers in your PSD with different produc@@ t sh@@ ot@@ s , which will v@@ ary from publ@@ ication to publ@@ ication . +if you use PSD , you can s@@ w@@ it@@ ch those layers on or o@@ ff in QuarkXPress without having to sa@@ ve a se@@ par@@ ate T@@ I@@ F@@ F for e@@ ach publ@@ ication . 
+an@@ other qu@@ es@@ tion that might ti@@ p you in f@@ av@@ or of PSD is , " D@@ o I need to use a sp@@ o@@ t col@@ or with this image ? +" using sp@@ o@@ t col@@ ors in most image form@@ at@@ s is o@@ ft@@ en compl@@ ic@@ ated . +h@@ ow@@ ever , because of the way QuarkXPress sup@@ por@@ ts PSD ch@@ an@@ n@@ els , it 's simpl@@ er and more fl@@ ex@@ i@@ ble . +br@@ ing@@ ing the PSD files into QuarkXPress is the same as any other image . cre@@ ate a Bo@@ x and then use F@@ ile &@@ g@@ t@@ ; I@@ m@@ port ... or simpl@@ y dra@@ g and d@@ ro@@ p the image from your des@@ k@@ to@@ p , F@@ in@@ der or an ap@@ pl@@ ication like A@@ do@@ be B@@ ri@@ d@@ ge ® with or without cre@@ at@@ ing a bo@@ x first . +to acc@@ ess the speci@@ al features of PSD , open the PSD I@@ m@@ port p@@ al@@ et@@ te . ( w@@ in@@ do@@ w &@@ g@@ t@@ ; PSD I@@ m@@ port ) Y@@ ou '@@ ll inst@@ an@@ tly see th@@ umb@@ n@@ ail@@ s of the layers al@@ ong with their n@@ ames . +for ex@@ am@@ ple , you '@@ ve cre@@ ated a lay@@ er in P@@ hot@@ os@@ ho@@ p to gi@@ ve your image an an@@ ti@@ qu@@ ed look , but when you put it in your lay@@ out it se@@ ems s@@ w@@ amp@@ ed by the s@@ ur@@ r@@ oun@@ ding col@@ ors . one op@@ tion might be to re@@ d@@ u@@ ce the op@@ ac@@ ity of that lay@@ er by cl@@ ic@@ king on it and en@@ ter@@ ing a new op@@ ac@@ ity l@@ ev@@ el . +if you want to ad@@ d an ex@@ tr@@ a in@@ k or pl@@ ate to your ima@@ ges , you can set up a ch@@ an@@ ne@@ l to do that in P@@ hot@@ os@@ ho@@ p . for ex@@ am@@ ple , ma@@ y@@ be you in@@ t@@ end to v@@ ar@@ n@@ ish part of an image , or you want to use a sp@@ o@@ t col@@ or with@@ in your image . +QuarkXPress can re ­ ma@@ p any ch@@ an@@ ne@@ l right in the PSD I@@ m@@ port p@@ al@@ et@@ te - n@@ on ­ de@@ struc@@ tiv@@ ely . 
+so cl@@ ick on the ch@@ an@@ n@@ els di@@ vid@@ er of the PSD I@@ m@@ port P@@ al@@ et@@ te ; d@@ ou@@ ble cl@@ ick on the ch@@ an@@ ne@@ l in qu@@ es@@ tion , and you can p@@ ick any col@@ or from your pro@@ j@@ ect 's col@@ or p@@ al@@ et@@ te , ens@@ uring con@@ si@@ st@@ enc@@ y . +as pow@@ er@@ ful as the PSD support of QuarkXPress 8 is , it can 't man@@ i@@ p@@ ul@@ ate c@@ er@@ ta@@ in k@@ in@@ ds of layers , such as lay@@ er e@@ ff@@ ects &#@@ 9@@ 1@@ ; e@@ .@@ g@@ . +if you use one of these layers , the image will im@@ port and pr@@ in@@ t just f@@ ine , but you w@@ on 't get acc@@ ess to the lay@@ er cont@@ ro@@ l@@ s of the PSD I@@ m@@ port p@@ al@@ et@@ te . +if you need that func@@ tion@@ al@@ ity , you can el@@ im@@ in@@ ate those k@@ in@@ ds of layers from your PSD by con@@ ver@@ ting the lay@@ er e@@ ff@@ ects to st@@ and @-@ al@@ one layers or &apos@@ ; sm@@ ar@@ t ob@@ j@@ ects &apos@@ ; &#@@ 9@@ 1@@ ; right cl@@ ick on the L@@ ay@@ er in the P@@ hot@@ os@@ ho@@ p layers p@@ al@@ et@@ te &#@@ 9@@ 3@@ ; . +QuarkXPress sup@@ por@@ ts t@@ ex@@ t layers , most ad@@ j@@ ust@@ ment layers , and even 3@@ D layers includ@@ ing op@@ ac@@ ity and bl@@ end@@ ing mo@@ de cont@@ ro@@ l@@ s . +I@@ ll@@ u@@ str@@ ator is a great tool for cre@@ at@@ ing lo@@ go@@ s and v@@ ec@@ t@@ or il@@ l@@ ust@@ ra@@ tions . +tr@@ a@@ di@@ tion@@ ally , the rou@@ te into QuarkXPress has been to ex@@ port an E@@ P@@ S from I@@ ll@@ u@@ str@@ ator . +now , things are made much eas@@ i@@ er with the ar@@ ri@@ v@@ al of di@@ rec@@ t I@@ ll@@ u@@ str@@ ator .@@ a@@ i file im@@ port into QuarkXPress 8 . +simpl@@ y dra@@ g and d@@ ro@@ p or im@@ port your I@@ ll@@ u@@ str@@ ator n@@ ative file into your lay@@ out just as you would any gr@@ ap@@ h@@ ic@@ s file form@@ at . +QuarkXPress has a pow@@ er@@ ful tran@@ sp@@ a@@ ren@@ c@@ y en@@ g@@ ine , but it do@@ es@@ n 't support par@@ ti@@ ally tran@@ sp@@ a@@ rent ob@@ j@@ ects in P@@ D@@ F or .@@ a@@ i files y@@ et . 
+so if you are using som@@ e@@ thing like a d@@ ro@@ p sh@@ ad@@ ow in I@@ ll@@ u@@ str@@ ator and pl@@ an to place that over a n@@ on @-@ wh@@ ite bac@@ k@@ grou@@ nd or ob@@ j@@ ect , you might see un@@ des@@ ir@@ able res@@ ul@@ ts . +the good ne@@ ws is that the d@@ ro@@ p sh@@ ad@@ ow and tran@@ sp@@ a@@ ren@@ c@@ y features of QuarkXPress works on im@@ por@@ ted I@@ ll@@ u@@ str@@ ator files , so you can ap@@ pl@@ y a d@@ ro@@ p sh@@ ad@@ ow or ch@@ an@@ ge the op@@ ac@@ ity of your .@@ a@@ i file right in your lay@@ out inst@@ e@@ ad . +QuarkXPress is well e@@ qui@@ p@@ p@@ ed for dra@@ wing t@@ as@@ ks and in our ne@@ w@@ est re@@ le@@ ase we have st@@ and@@ ar@@ dis@@ ed many of our gr@@ ap@@ h@@ ic@@ s to@@ ol@@ s to work more like I@@ ll@@ u@@ str@@ ator , F@@ re@@ e@@ h@@ and and sim@@ il@@ ar to@@ ol@@ s . +wat@@ ch a vi@@ de@@ o on how to cre@@ ate com@@ p@@ ell@@ ing il@@ l@@ ust@@ ra@@ tions in QuarkXPress 8 . +page @-@ lay@@ out pro@@ f@@ essi@@ on@@ als can cre@@ ate r@@ ich F@@ l@@ as@@ h pro@@ j@@ ects - without com@@ pro@@ mis@@ ing des@@ ign - using the bu@@ il@@ t @-@ in F@@ l@@ as@@ h au@@ th@@ oring cap@@ ab@@ il@@ i@@ ties included in every edi@@ tion of QuarkXPress 8 . +wor@@ king in the same famil@@ i@@ ar pr@@ in@@ t en@@ v@@ ir@@ on@@ ment of QuarkXPress 8 , you can take ex@@ i@@ sting pr@@ in@@ t jo@@ b@@ s to F@@ l@@ as@@ h , or cre@@ ate new F@@ l@@ as@@ h pro@@ j@@ ects , in m@@ inut@@ es - no ad@@ di@@ tional purchase or co@@ ding re@@ qui@@ red ! +wat@@ ch a vi@@ de@@ o on how to cre@@ ate s@@ op@@ h@@ is@@ tic@@ ated F@@ l@@ as@@ h des@@ ig@@ n@@ s in QuarkXPress 8 . +* In@@ D@@ es@@ ign C@@ S 4 : inter@@ ac@@ tive el@@ em@@ ents such as h@@ y@@ per@@ lin@@ ks , page tran@@ si@@ tions , and bu@@ t@@ t@@ on ac@@ tions are not included in the X@@ F@@ L file . +dis@@ cl@@ a@@ im@@ er : this do@@ c@@ u@@ ment is based on publ@@ ic@@ ly available information and not based on h@@ ands @-@ on software ev@@ al@@ u@@ ation . 
+its content may be re@@ vis@@ ed at any time . +Qu@@ ark In@@ c@@ . acc@@ ep@@ ts no res@@ p@@ on@@ si@@ b@@ il@@ ity le@@ gal or o@@ th@@ er@@ w@@ is@@ e for the acc@@ ur@@ ac@@ y of this content . +QuarkXPress includ@@ es W@@ e@@ b lay@@ ou@@ ts that can cre@@ ate m@@ en@@ us and h@@ y@@ per@@ lin@@ ks , con@@ ver@@ t pr@@ in@@ t gr@@ ap@@ h@@ ic@@ s or f@@ anc@@ y t@@ ex@@ t tre@@ at@@ ments to W@@ e@@ b gr@@ ap@@ h@@ ic@@ s , and then wr@@ ite a st@@ and@@ ar@@ ds @-@ based H@@ T@@ M@@ L file with C@@ S@@ S that can be op@@ en@@ ed di@@ rec@@ tly in A@@ do@@ be D@@ re@@ am@@ we@@ a@@ ver . +QuarkXPress 8 can im@@ port P@@ D@@ F files up to version 1@@ .@@ 7 ( the de@@ fa@@ ul@@ t P@@ D@@ F version from the C@@ re@@ ative S@@ u@@ ite ap@@ pl@@ ic@@ ations when using the Press Qu@@ al@@ ity P@@ D@@ F set@@ ting is P@@ D@@ F 1@@ .@@ 4 . +" on@@ ce you ad@@ d a vari@@ ety of produc@@ tiv@@ ity g@@ a@@ ins , it b@@ ecom@@ es cl@@ e@@ ar that ... they can re@@ pres@@ ent th@@ ous@@ ands of d@@ ol@@ l@@ ar@@ s of re@@ tur@@ n on in@@ v@@ est@@ ment over the c@@ our@@ se of a year . +© 200@@ 9 Qu@@ ark In@@ c@@ . and Qu@@ ark Media H@@ ous@@ e S@@ à@@ r@@ l , S@@ w@@ it@@ z@@ er@@ land . +ar@@ b@@ it@@ r@@ ation is a for@@ m of al@@ ter@@ n@@ ative dis@@ pu@@ te res@@ ol@@ u@@ tion - speci@@ f@@ ic@@ ally , a le@@ gal al@@ ter@@ n@@ ative to li@@ ti@@ g@@ ation wh@@ ere@@ by the par@@ ties to a dis@@ pu@@ te ag@@ ree to su@@ b@@ m@@ it their res@@ p@@ ec@@ tive pos@@ i@@ tions ( through ag@@ re@@ ement or hear@@ ing ) to a ne@@ u@@ tr@@ al th@@ ir@@ d part@@ y ( the ar@@ b@@ it@@ r@@ ator ( s ) or ar@@ b@@ it@@ er ( s ) ) for res@@ ol@@ u@@ tion . +medi@@ ation is a proc@@ ess of al@@ ter@@ n@@ ative dis@@ pu@@ te res@@ ol@@ u@@ tion in which a ne@@ u@@ tr@@ al th@@ ir@@ d part@@ y , the medi@@ ator , as@@ si@@ st@@ s two or more par@@ ties in order to help them ne@@ go@@ ti@@ ate an ag@@ re@@ ement on a mat@@ ter of comm@@ on inter@@ est . 
+com@@ p@@ any L@@ a@@ w re@@ g@@ ul@@ at@@ es com@@ p@@ any forma@@ tions , di@@ rec@@ tors &apos@@ ; d@@ u@@ ties and sh@@ are@@ hol@@ der ag@@ re@@ em@@ ents and the inter@@ pre@@ t@@ ation of rel@@ ev@@ ant st@@ at@@ ut@@ ory or other law . +as a m@@ ember of the Euro@@ pean U@@ ni@@ on H@@ un@@ g@@ ary cont@@ in@@ ues to d@@ em@@ on@@ str@@ ate ec@@ on@@ om@@ ic g@@ ro@@ w@@ th . +many emb@@ as@@ si@@ es and trans@@ na@@ tional com@@ pan@@ ies located in the cap@@ it@@ al br@@ ing many ex@@ pat@@ ri@@ ate fore@@ ig@@ ners and their famil@@ ies to t@@ own , cre@@ at@@ ing d@@ em@@ and for pri@@ v@@ ate and inter@@ na@@ tional s@@ cho@@ ol@@ s . +est@@ abl@@ ished in 19@@ 9@@ 0 , the off@@ ice of H@@ a@@ i@@ de@@ g@@ g@@ er & P@@ art@@ ner in B@@ ud@@ ap@@ est has been pro@@ vid@@ ing a full ran@@ ge of le@@ gal serv@@ ices o@@ ffer@@ ing in@@ di@@ vid@@ ual ta@@ il@@ ore@@ d ad@@ v@@ ice . +ap@@ ar@@ t from being H@@ un@@ g@@ ary 's pr@@ in@@ ci@@ p@@ al p@@ ol@@ i@@ tic@@ al , commercial , ind@@ ust@@ ri@@ al and tran@@ sp@@ or@@ t@@ ation c@@ entr@@ e , the city of B@@ ud@@ ap@@ est bo@@ a@@ st@@ s sites , mon@@ um@@ ents and sp@@ as of wor@@ l@@ d@@ wi@@ de re@@ now@@ n . +this statistic is based on the 68@@ 19 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 19 O@@ c@@ t 200@@ 7 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 2@@ 8 free Open ICEcat users . +A@@ c@@ er pro@@ j@@ ec@@ t@@ or acc@@ ess@@ or@@ ies can op@@ ti@@ m@@ iz@@ e your A@@ c@@ er X@@ 1@@ 1@@ 6@@ 0 / X@@ 12@@ 6@@ 0 pro@@ j@@ ec@@ tors and ex@@ p@@ and the us@@ age and m@@ ob@@ il@@ ity of your produc@@ t . +this statistic is based on the 68@@ 19 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 0@@ 7 S@@ ep 200@@ 5 . 
+only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 2@@ 8 free Open ICEcat users . +this statistic is based on the 68@@ 2@@ 1 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 0@@ 8 J@@ u@@ n 200@@ 6 . +this statistic is based on the 68@@ 15 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 1@@ 4 S@@ ep 200@@ 7 . +A@@ K@@ V@@ I@@ S S@@ k@@ et@@ ch con@@ ver@@ ts di@@ g@@ it@@ al ph@@ ot@@ os to a@@ ma@@ z@@ ing p@@ en@@ ci@@ l sk@@ et@@ ch@@ es and wat@@ er@@ col@@ or dra@@ w@@ ings . +con@@ ver@@ t your photo into an o@@ il pa@@ in@@ ting with A@@ K@@ V@@ I@@ S A@@ r@@ t@@ W@@ or@@ k ! +A@@ K@@ V@@ I@@ S Ch@@ am@@ el@@ e@@ on is a f@@ u@@ n to use tool for photo col@@ la@@ ge cre@@ ation . +A@@ K@@ V@@ I@@ S En@@ h@@ anc@@ er is an image en@@ h@@ anc@@ ement prog@@ ra@@ m for rev@@ eal@@ ing detail@@ s on a p@@ ic@@ t@@ ure . +En@@ h@@ anc@@ er allo@@ ws det@@ ec@@ ting detail@@ s from un@@ der@@ ex@@ pos@@ ed , o@@ ver@@ ex@@ pos@@ ed and m@@ id t@@ one are@@ as of a photo . +S@@ m@@ art@@ M@@ as@@ k is an e@@ ff@@ ici@@ ent m@@ as@@ king tool that s@@ av@@ es you time on compl@@ e@@ x sel@@ ec@@ tions and is f@@ u@@ n to use . +A@@ K@@ V@@ I@@ S C@@ ol@@ or@@ i@@ age allo@@ ws col@@ or@@ iz@@ ing B & W ph@@ ot@@ os and re@@ pl@@ ac@@ ing col@@ ors in col@@ or ph@@ ot@@ os . +A@@ K@@ V@@ I@@ S N@@ o@@ is@@ e B@@ ust@@ er is software for no@@ is@@ e sup@@ pres@@ sion on di@@ g@@ it@@ al and s@@ c@@ an@@ ned ima@@ ges . +A@@ r@@ t@@ S@@ u@@ ite is an im@@ pres@@ si@@ ve col@@ l@@ ec@@ tion of ver@@ s@@ ati@@ l@@ e e@@ ff@@ ects for dec@@ or@@ ation of ph@@ ot@@ os . 
+there are two e@@ ff@@ ect grou@@ p@@ s : photo f@@ r@@ ames which let you gener@@ ate a f@@ r@@ ame for your photo , and ar@@ ti@@ sti@@ c e@@ ff@@ ects which allo@@ w ex@@ per@@ im@@ ent@@ ing with ph@@ ot@@ os , for ex@@ am@@ ple , con@@ ver@@ ting a photo into a B & W image ; re@@ pl@@ ac@@ ing col@@ ors on an image , ad@@ ding a t@@ ex@@ t@@ ure , etc . +A@@ K@@ V@@ I@@ S M@@ ag@@ ni@@ fi@@ er allo@@ ws re@@ si@@ z@@ ing ima@@ ges without l@@ os@@ s in qu@@ al@@ ity . +A@@ K@@ V@@ I@@ S R@@ et@@ ou@@ ch@@ er P@@ l@@ u@@ g @-@ in is an e@@ ff@@ ici@@ ent prog@@ ra@@ m for photo re@@ st@@ or@@ ation and photo re@@ t@@ ou@@ ch@@ ing . +R@@ et@@ ou@@ ch@@ er rem@@ o@@ ves d@@ ust , s@@ c@@ r@@ at@@ ch@@ es , st@@ a@@ ins and other de@@ f@@ ects that ap@@ pe@@ ar on d@@ a@@ ma@@ g@@ ed ph@@ ot@@ os . it re@@ con@@ struc@@ ts the l@@ ac@@ king par@@ ts of the photo using the information of the s@@ our@@ r@@ oun@@ ding are@@ as . +A@@ K@@ V@@ I@@ S A@@ r@@ t@@ S@@ u@@ ite is a col@@ l@@ ec@@ tion of e@@ ff@@ ects for photo dec@@ or@@ ation . +version 5@@ .@@ 0 off@@ ers a new inter@@ face and im@@ pro@@ ves com@@ pa@@ ti@@ b@@ il@@ ity on M@@ ac@@ in@@ t@@ os@@ h . +the st@@ and@@ al@@ one version pres@@ ents new be@@ au@@ ti@@ ful H@@ and P@@ ain@@ ted F@@ r@@ ames des@@ ig@@ ned by ar@@ ti@@ st@@ s . +the pers@@ on in the image has t@@ at@@ to@@ os all around the ch@@ est are@@ a . I '@@ ve tri@@ ed so many times to co@@ ver the t@@ at@@ to@@ os and c@@ ou@@ l@@ d@@ n 't . +I downlo@@ ad@@ ed a tri@@ al of R@@ et@@ ou@@ ch@@ er to see how it would work . one pas@@ s over the are@@ a with R@@ et@@ ou@@ ch@@ er and the t@@ at@@ to@@ os were g@@ one . +I was ab@@ s@@ ol@@ ut@@ ely sh@@ oc@@ k@@ ed at how f@@ ast and easy A@@ k@@ v@@ is R@@ et@@ ou@@ ch@@ er made it . you would n@@ ever know the y@@ oun@@ g l@@ ad@@ y in the image has a t@@ at@@ to@@ o . +there 's o@@ ft@@ en to@@ o much des@@ ign around , but n@@ ever en@@ ou@@ gh good des@@ ign . 
+this in@@ si@@ ght pro@@ m@@ p@@ ted the la@@ un@@ ch@@ ing in 19@@ 8@@ 3 of the D@@ es@@ ign P@@ l@@ us com@@ pe@@ ti@@ tion by M@@ ess@@ e F@@ ran@@ k@@ f@@ ur@@ t in co@@ op@@ er@@ ation with the " R@@ at f@@ ü@@ r For@@ m@@ ge@@ b@@ ung " ( D@@ es@@ ign C@@ oun@@ ci@@ l ) and the De@@ ut@@ s@@ ch@@ en In@@ d@@ ust@@ ri@@ e@@ - un@@ d H@@ and@@ el@@ sk@@ am@@ mer@@ ta@@ g / A@@ s@@ so@@ ci@@ ation of Ger@@ man Ch@@ am@@ b@@ ers of In@@ d@@ u@@ str@@ y and Com@@ merce ( D@@ I@@ H@@ K ) . +the D@@ es@@ ign P@@ l@@ us a@@ w@@ ard , pres@@ en@@ ted by a dist@@ ing@@ u@@ ished j@@ ur@@ y in con@@ j@@ un@@ c@@ tion with the A@@ m@@ b@@ i@@ ent@@ e inter@@ na@@ tional con@@ su@@ m@@ er go@@ ods tr@@ ad@@ e fa@@ ir in F@@ ran@@ k@@ f@@ ur@@ t , st@@ ands for produc@@ t des@@ ign which is not mer@@ ely an end in it@@ self but inst@@ e@@ ad very much ex@@ em@@ pl@@ ary and tre@@ nd @-@ set@@ ting . +holiday apartments | Hot@@ els | Host@@ els | C@@ amp@@ ing , D@@ or@@ m@@ is & B@@ un@@ g@@ al@@ ow@@ s | Last Minute Offers ! +b@@ ar@@ bec@@ ue and M@@ ate T@@ e@@ a are A@@ r@@ g@@ ent@@ ini@@ an in@@ v@@ en@@ tions with a so@@ ci@@ al func@@ tion . +many A@@ r@@ g@@ ent@@ ini@@ an famil@@ ies have in their own gr@@ il@@ l in the bac@@ k@@ y@@ ard of their h@@ om@@ es ( called par@@ r@@ il@@ l@@ as ) , wh@@ ere they me@@ et on the we@@ e@@ k@@ en@@ ds for a me@@ al to@@ ge@@ ther with f@@ ri@@ en@@ ds and rel@@ ati@@ ves . +the be@@ e@@ f here has different c@@ u@@ ts to that of Euro@@ p@@ e . +the " b@@ a@@ by b@@ i@@ fe " is ap@@ pro@@ x@@ im@@ at@@ ely 1 / 2@@ k@@ g of st@@ ea@@ k , which could n@@ our@@ ish a wh@@ ol@@ e family . +the " b@@ i@@ fe de ch@@ or@@ iz@@ o " is a g@@ ig@@ an@@ ti@@ c r@@ um@@ p st@@ ea@@ k , the " b@@ i@@ fe de l@@ om@@ o " a fil@@ et . 
+" as@@ ad@@ o de ti@@ r@@ a " is the part of the ri@@ b@@ s , the " b@@ i@@ fe de c@@ os@@ ti@@ l@@ la " is a T @-@ b@@ one S@@ te@@ a@@ k , the " mat@@ am@@ br@@ e " is a very th@@ in c@@ ut p@@ i@@ ec@@ e of me@@ at with l@@ ot@@ s of f@@ at . +like in Euro@@ p@@ e , me@@ at is pre@@ pa@@ red r@@ are , m@@ id r@@ are , d@@ one , well d@@ one etc . +the sa@@ u@@ ce which com@@ es with the me@@ at d@@ ish is called " ch@@ im@@ ic@@ h@@ ur@@ r@@ i . " h@@ ow@@ ever , because of the exc@@ ell@@ ent ta@@ st@@ e of the me@@ at , it can well be e@@ at@@ en without sa@@ u@@ ce . +a full pl@@ ate of m@@ i@@ x@@ ed gr@@ il@@ l conta@@ ins pr@@ ac@@ tic@@ ally all an@@ im@@ al in@@ n@@ ar@@ ds from vari@@ ous an@@ im@@ als . +the S@@ a@@ us@@ age ( ch@@ or@@ iz@@ o ) and the bl@@ ac@@ k pu@@ d@@ ding ( m@@ or@@ c@@ il@@ la ) are very p@@ op@@ ul@@ ar . +serv@@ ed with the me@@ at@@ s are v@@ e@@ g@@ et@@ abl@@ es ( s@@ we@@ et p@@ ot@@ at@@ o@@ es , pu@@ m@@ p@@ k@@ in pu@@ ree ) and m@@ i@@ x@@ ed s@@ al@@ ad@@ s . +in a re@@ st@@ a@@ ur@@ ant it is of c@@ our@@ se pos@@ sible to order other par@@ ts of me@@ at@@ s . +de@@ p@@ end@@ ing on the se@@ as@@ on and regi@@ on , it is pos@@ sible to e@@ at in A@@ r@@ g@@ ent@@ in@@ a very good f@@ o@@ od and me@@ at such as de@@ er , l@@ am@@ b , she@@ ep , go@@ at , f@@ ish , and other se@@ af@@ o@@ od . +speci@@ al d@@ el@@ ic@@ ac@@ ies are : K@@ ing p@@ ra@@ w@@ n@@ s ( c@@ ent@@ ol@@ la ) from T@@ i@@ er@@ r@@ a de F@@ u@@ e@@ go and wil@@ d bo@@ ar ( j@@ ab@@ al@@ í ) or tr@@ out in B@@ ar@@ il@@ o@@ ch@@ e . +speci@@ al d@@ ish@@ es , go@@ ing bac@@ k as f@@ ar as col@@ on@@ i@@ al times are : " pa@@ st@@ el de cho@@ cl@@ o " or the " h@@ um@@ it@@ a . 
" +more typ@@ ical are ch@@ ic@@ ken " p@@ ol@@ l@@ o " in its most vari@@ ed pre@@ par@@ tions , " m@@ il@@ an@@ es@@ a " ( V@@ i@@ en@@ n@@ ese es@@ c@@ al@@ op@@ e ) , " em@@ pan@@ ad@@ as " ( f@@ ri@@ ed pa@@ str@@ y sh@@ ell@@ s fil@@ l@@ ed with ch@@ op@@ p@@ ed be@@ e@@ f , ch@@ ic@@ ken , ma@@ iz@@ e or ch@@ e@@ ese and ha@@ m ) or " ch@@ or@@ i@@ p@@ an " sim@@ il@@ ar to a H@@ o@@ t D@@ o@@ g . +many I@@ t@@ al@@ i@@ an re@@ st@@ a@@ ur@@ an@@ ts are f@@ ound here which ser@@ ve p@@ iz@@ z@@ a and pa@@ st@@ a . +ne@@ x@@ t to f@@ res@@ h f@@ ru@@ it s@@ al@@ ad@@ s and good ice cre@@ am , a wi@@ de ran@@ ge of des@@ ser@@ ts are o@@ ffer@@ ed . +two A@@ r@@ g@@ ent@@ ine speci@@ al@@ i@@ ties are the " d@@ ul@@ ce de l@@ ec@@ he , " m@@ il@@ k @-@ c@@ ar@@ am@@ el m@@ ar@@ m@@ al@@ ad@@ e which is also used to as a bre@@ ad sp@@ re@@ ad and the " al@@ fa@@ j@@ ore@@ s " ( sm@@ all c@@ a@@ ke @-@ form@@ ed b@@ is@@ c@@ u@@ its with different fil@@ l@@ ings ) . +apartment for holiday l@@ ets in Cadiz . good con@@ di@@ tion , 2 ... +if you are s@@ ear@@ ch@@ ing for pl@@ ac@@ es to st@@ ay on the C@@ ost@@ a de@@ l ... +apartment in S@@ i@@ er@@ r@@ a N@@ ev@@ ada , cl@@ ose to the ch@@ air@@ li@@ f@@ ts . ... +st@@ u@@ di@@ o apartments | T@@ wo B@@ ed@@ ro@@ om A@@ part@@ ments | Th@@ ree or M@@ ore B@@ ed@@ ro@@ om A@@ part@@ ments | H@@ ol@@ iday H@@ ous@@ es | Hot@@ els | Host@@ els | Last Minute Offers ! +B@@ ar@@ c@@ el@@ on@@ a , Spain is a city located at the n@@ or@@ the@@ ast si@@ de of the I@@ b@@ er@@ i@@ an P@@ en@@ in@@ su@@ la , in the hear@@ t of C@@ at@@ al@@ on@@ ia and b@@ ord@@ er@@ ed by the M@@ ed@@ it@@ er@@ ran@@ e@@ an S@@ e@@ a to the e@@ ast . +the city of T@@ ar@@ ra@@ g@@ on@@ a li@@ es sou@@ th of it , L@@ le@@ id@@ a to the w@@ est , and G@@ ir@@ on@@ a to the n@@ or@@ th . in this sec@@ tion you will find out how to get there by many tran@@ sp@@ or@@ t@@ ation me@@ ans . 
+to get to B@@ ar@@ c@@ el@@ on@@ a by c@@ ar from M@@ ad@@ ri@@ d , you should take the N@@ ac@@ i@@ onal I@@ I m@@ ot@@ or@@ way . +from the co@@ ast , take the A @-@ 7 or the A@@ u@@ to@@ v@@ í@@ a de@@ l M@@ ed@@ it@@ er@@ r@@ á@@ ne@@ o . B@@ ar@@ c@@ el@@ on@@ a is about 6@@ 00 k@@ il@@ om@@ et@@ ers from M@@ ad@@ ri@@ d , at 3@@ 5@@ 0 from V@@ al@@ en@@ ci@@ a , 3@@ 00 from Z@@ ar@@ a@@ go@@ z@@ a , and at 1@@ 00@@ 0 from M@@ al@@ ag@@ a . +from other Euro@@ pean ci@@ ties , P@@ ar@@ is is 1@@ ,@@ 2@@ 00 k@@ il@@ om@@ et@@ ers a@@ way from B@@ ar@@ c@@ el@@ on@@ a , 1@@ ,@@ 0@@ 9@@ 8 from B@@ er@@ n@@ a , and 1@@ ,@@ 6@@ 00 from L@@ on@@ don . +there are bu@@ s rou@@ t@@ es from al@@ most every city in Spain to B@@ ar@@ c@@ el@@ on@@ a and as well from most ma@@ j@@ or Euro@@ pean ci@@ ties . +this is a numb@@ er to c@@ all to get up to dat@@ e information on the st@@ at@@ us of tr@@ a@@ ff@@ ic in B@@ ar@@ c@@ el@@ on@@ a Spain . +ph@@ one numb@@ er for Pr@@ at A@@ ir@@ port in B@@ ar@@ c@@ el@@ on@@ a , Spain . +this great lo@@ d@@ g@@ ing in Cala R@@ at@@ jada , M@@ all@@ or@@ ca is located in ... +there are many Cadiz apartments for rent around the city . ... +bo@@ ok@@ m@@ ark M@@ an@@ ag@@ er ( B@@ K@@ M for sh@@ ort ) is an open s@@ our@@ ce tool for man@@ ag@@ ing your bo@@ ok@@ m@@ ar@@ ks . +you '@@ ll find ever@@ y@@ thing about this tool on this W@@ e@@ b site . cur@@ rent ne@@ ws , new ver@@ si@@ ons and other information . +B@@ K@@ M con@@ si@@ st@@ s of two pro@@ j@@ ects : v@@ b@@ B@@ K@@ M and p@@ y@@ B@@ K@@ M . +v@@ b@@ B@@ K@@ M is a prog@@ ra@@ m dev@@ el@@ op@@ ed in V@@ is@@ ual B@@ as@@ ic which works on the M@@ ic@@ ro@@ so@@ ft Windows pl@@ at@@ for@@ m . +p@@ y@@ B@@ K@@ M is a tool dev@@ el@@ op@@ ed in Py@@ th@@ on r@@ un@@ ning on m@@ ul@@ ti@@ ple pl@@ at@@ form@@ s . +v@@ b@@ B@@ K@@ M is a st@@ able produc@@ t and has as many features as any other bo@@ ok@@ m@@ ark man@@ ag@@ ement software available . 
+but p@@ y@@ B@@ K@@ M is at very ear@@ ly st@@ ag@@ es and is not read@@ y for produc@@ tion use , y@@ et . +see the two pro@@ j@@ ect pa@@ ges ( v@@ b@@ B@@ K@@ M and p@@ y@@ B@@ K@@ M ) for detail@@ s . +holiday h@@ ous@@ es | Hot@@ els | G@@ u@@ es@@ th@@ ous@@ es ( B & B ) | Host@@ els | C@@ amp@@ sites | T@@ o@@ p ten things to do and see | Last Minute Offers ! +apartment to rent in N@@ er@@ j@@ a , An@@ d@@ al@@ us@@ ia . sp@@ ac@@ i@@ ous and ... +c@@ os@@ y apartment in the sk@@ i st@@ ation of S@@ I@@ er@@ r@@ a N@@ ev@@ ada , ... +there are many Cadiz apartments for rent around the city . ... +B@@ en@@ al@@ ma@@ d@@ en@@ a st@@ u@@ di@@ o apartment in a re@@ sidenti@@ al dev@@ el@@ op@@ ment ... +this char@@ ter fir@@ m based in Cadiz off@@ ers bo@@ at@@ s for h@@ ir@@ e from ... +apartments | H@@ ol@@ iday h@@ ous@@ es | Host@@ els | Ch@@ ar@@ ter / R@@ ent bo@@ at@@ s | Hot@@ els | Last Minute Offers ! +Cadiz , Spain is f@@ as@@ c@@ in@@ at@@ ing ; it 's the ol@@ d@@ est city in Euro@@ p@@ e . +in the w@@ in@@ ter you can en@@ jo@@ y the old -@@ s@@ u @-@ inter@@ i@@ or @-@ pu@@ e@@ bl@@ os @-@ bl@@ an@@ co@@ s , t@@ own , and in the su@@ m@@ m@@ er it 's ... +A@@ part@@ am@@ en@@ to@@ s , ch@@ al@@ ets , hot@@ el@@ es , c@@ as@@ as r@@ ur@@ al@@ es , and lo@@ d@@ g@@ ing of all k@@ in@@ ds . +C@@ ost@@ a de la Lu@@ z is the i@@ de@@ al co@@ ast to char@@ ter . +Cadiz , Spain is the most im@@ por@@ t@@ ant city in the pro@@ v@@ ince . +do you want to work for co@@ st@@ as@@ ur . +to e@@ at in Cadiz , Spain in the regi@@ on of An@@ d@@ al@@ us@@ ia is easy . +Cadiz , Spain is f@@ as@@ c@@ in@@ at@@ ing ; it 's the ol@@ d@@ est city in Euro@@ p@@ e . +al@@ most enti@@ re@@ ly s@@ ur@@ r@@ oun@@ d@@ ed by wat@@ er , the city app@@ ears is@@ ol@@ ated . it st@@ ands on a p@@ en@@ in@@ su@@ la j@@ ut@@ ting out into the b@@ ay , dra@@ ma@@ tic@@ ally de@@ fin@@ ing the s@@ ur@@ r@@ oun@@ ding l@@ and@@ s@@ cap@@ e . 
+this an@@ ci@@ ent city on the C@@ ost@@ a de la Lu@@ z in An@@ d@@ al@@ us@@ ia is ap@@ pro@@ x@@ im@@ at@@ ely 3@@ ,@@ 00@@ 0 years old . the P@@ ho@@ en@@ ici@@ ans , C@@ ar@@ th@@ ag@@ ini@@ ans , and R@@ om@@ ans have all set@@ tl@@ l@@ ed here at one p@@ o@@ in@@ t over the years . +ne@@ x@@ t were the C@@ ar@@ th@@ ag@@ ini@@ ans and then the R@@ om@@ ans who tur@@ ned it into a th@@ ri@@ ving port . d@@ uring the 1@@ 7@@ th c@@ ent@@ ur@@ y , tr@@ ading bet@@ we@@ en C@@ á@@ diz and the rest of the world in@@ cre@@ ased dra@@ ma@@ tic@@ ally ; the ec@@ on@@ om@@ y bo@@ om@@ ed . +Cadiz on the other h@@ and , is very re@@ la@@ x@@ ed and easy @-@ go@@ ing . even at n@@ ig@@ ht , you '@@ d fe@@ el sa@@ fe w@@ al@@ king around the city . +n@@ ever more than a few bl@@ oc@@ ks a@@ way from the co@@ a@@ st@@ line . +the n@@ ar@@ ro@@ w and co@@ b@@ bl@@ ed st@@ re@@ ets open out on@@ to pre@@ t@@ ty l@@ it@@ tl@@ e s@@ qu@@ are@@ s . people s@@ it ou@@ t@@ si@@ de in c@@ af@@ es all day l@@ ong en@@ jo@@ y@@ ing the he@@ at , and g@@ a@@ z@@ ing up at the M@@ o@@ or@@ ish ar@@ ch@@ it@@ ec@@ h@@ t@@ ure . +Cadiz ( or l@@ it@@ tl@@ e H@@ av@@ an@@ a ) as it 's o@@ ft@@ en called , has str@@ ong re@@ la@@ tions with C@@ u@@ b@@ a . +there has been a cont@@ in@@ u@@ ous f@@ lo@@ w of tr@@ a@@ ff@@ ic bet@@ we@@ en the two c@@ it@@ es over the years . +the two ci@@ ties even look very sim@@ il@@ il@@ ar . C@@ u@@ b@@ a n s@@ c@@ en@@ es from the l@@ at@@ est J@@ ames B@@ on@@ d fil@@ m ( D@@ i@@ e an@@ other day ) were sh@@ o@@ t here in Cadiz . +you can w@@ al@@ k around the old -@@ s@@ u @-@ inter@@ i@@ or @-@ pu@@ e@@ bl@@ os @-@ bl@@ an@@ co@@ s , t@@ own in about an h@@ our . +there are also some lo@@ v@@ ely par@@ ks you can vis@@ it with sp@@ ec@@ t@@ ac@@ ul@@ ar vi@@ e@@ ws out to the B@@ ay . 
+if you are lo@@ o@@ king to rent apartments in Cadiz , on the C@@ ost@@ a de la Lu@@ z of Spain , this ac@@ comm@@ o@@ d@@ ation is located on the beach pro@@ m@@ en@@ ad@@ e of the city of Cadiz , right ne@@ ar the beach . +there are many Cadiz apartments for rent around the city . +this par@@ tic@@ ul@@ ar one is ful@@ ly f@@ ur@@ n@@ ished and e@@ qui@@ p@@ p@@ ed for up to 4 people . +if you '@@ re lo@@ o@@ king for Cadiz fl@@ at@@ s in the sou@@ th@@ er@@ n regi@@ on of An@@ d@@ al@@ us@@ ia , Spain than we have the per@@ f@@ ect one for you . +this h@@ om@@ e is ful@@ ly f@@ ur@@ n@@ ished and e@@ qui@@ p@@ p@@ ed for your com@@ for@@ t . +Cadiz holiday ren@@ t@@ als for let in An@@ d@@ al@@ us@@ ia , Spain . +this lo@@ v@@ ely apartment is located in the new are@@ a of Cadiz , ne@@ ar the h@@ os@@ p@@ it@@ al . +this 2 @-@ st@@ ar hot@@ el is located in a tran@@ qu@@ il are@@ a , h@@ al@@ f @-@ way bet@@ we@@ en the entr@@ ance of the city of Cadiz and it 's old city c@@ enter . +the hot@@ el is well @-@ sit@@ u@@ ated on one of the city ? s ma@@ in b@@ ou@@ l@@ ev@@ ar@@ ds , just 1@@ 00 m@@ et@@ res from the well @-@ know@@ n bl@@ ue f@@ la@@ g V@@ ic@@ t@@ or@@ ia beach and cl@@ ose to the conf@@ er@@ ence c@@ entr@@ e . +this is a br@@ and new hot@@ el in Cadiz ? hi@@ st@@ or@@ ical city c@@ entr@@ e , right by the b@@ est sh@@ op@@ p@@ ing c@@ ent@@ res and hi@@ st@@ or@@ ical are@@ as . +located in the very hear@@ t of the th@@ ous@@ and @-@ year @-@ old city of Cadiz and a few m@@ inut@@ es a@@ way from the S@@ an@@ t@@ a M@@ ari@@ a de@@ l M@@ ar beach and C@@ on@@ f@@ er@@ ence C@@ entr@@ e . +the Hot@@ el H@@ os@@ pe@@ der@@ ia L@@ as C@@ or@@ t@@ es de C@@ á@@ diz is f@@ ound in the hi@@ st@@ or@@ ic c@@ entr@@ e of Cadiz , in a l@@ and@@ m@@ ark bu@@ il@@ ding typ@@ ical of loc@@ al ar@@ ch@@ it@@ ec@@ t@@ ure of the 19@@ th c@@ ent@@ ur@@ y . 
+the Host@@ el M@@ ir@@ ad@@ or in V@@ e@@ j@@ er de la F@@ ron@@ ter@@ a is located in a tran@@ qu@@ il are@@ a of this t@@ own in the pro@@ v@@ ince of C@@ ad@@ is . +it 's g@@ u@@ ests have 15 d@@ ou@@ ble ro@@ om@@ s and 4 tri@@ ple ro@@ om@@ s available to them . +this ho@@ st@@ el is located in one of the old t@@ own s@@ qu@@ are@@ s in R@@ ot@@ a . +a per@@ f@@ ect place to en@@ jo@@ y the c@@ enter of t@@ own and the C@@ os@@ ti@@ l@@ la beach which is only 5 m@@ inut@@ es w@@ al@@ king dist@@ ance . +Cadiz M@@ oun@@ ta@@ ins v@@ ac@@ ation h@@ om@@ es for rent in Spain . +this holiday ren@@ t@@ al is sit@@ u@@ ated in the pu@@ e@@ bl@@ o bl@@ anc@@ o called Al@@ c@@ al@@ a de lo@@ s G@@ a@@ z@@ ul@@ es and is sit@@ u@@ ated in the Cadiz M@@ oun@@ ta@@ ins . +this T@@ ari@@ f@@ a holiday l@@ et@@ ting in Spain has one be@@ d@@ ro@@ om , and is a n@@ ice cho@@ ice for your v@@ ac@@ ation in T@@ ari@@ f@@ a . +if you '@@ re lo@@ o@@ king for ac@@ com@@ o@@ d@@ ation in Cadiz , Spain this h@@ om@@ e for rent in Al@@ g@@ ec@@ ir@@ as is a great cho@@ ice . +with a cap@@ ac@@ ity for 2 @-@ 3 people , this S@@ pan@@ ish v@@ il@@ la for let C@@ an@@ os de M@@ ec@@ a is in the Cadiz P@@ ro@@ v@@ ince of Spain . +if you are s@@ ear@@ ch@@ ing for pl@@ ac@@ es to st@@ ay on the C@@ ost@@ a de@@ l ... +be@@ au@@ ti@@ ful apartment in Ch@@ ic@@ l@@ an@@ a , Cadiz for rent . +apartment for holiday l@@ ets in Cadiz . good con@@ di@@ tion , 2 ... +apartments | R@@ ur@@ al H@@ om@@ es | Hot@@ els | Host@@ els | H@@ ous@@ es | A@@ par@@ th@@ ot@@ els | Y@@ ac@@ ht Ch@@ ar@@ ter | Last Minute Offers ! +the n@@ or@@ th e@@ ast cor@@ ner of the is@@ land has f@@ our ch@@ amp@@ i@@ on@@ sh@@ i@@ p st@@ and@@ ard g@@ ol@@ f c@@ our@@ s@@ es . +they '@@ re located at C@@ an@@ y@@ am@@ el , C@@ ap@@ de@@ per@@ a , P@@ ul@@ a @-@ G@@ ol@@ f and S@@ on Ser@@ ver@@ a . 
+there 's an or@@ g@@ an@@ i@@ st@@ ation on the is@@ land called the M@@ all@@ or@@ ca G@@ ol@@ f C@@ on@@ nec@@ tion , who can or@@ g@@ an@@ is@@ e all you g@@ ol@@ f@@ ing needs - te@@ e @-@ o@@ ff times , tran@@ sp@@ ort to and from the c@@ our@@ s@@ es , dis@@ coun@@ ted gre@@ en fe@@ es etc . they are the off@@ ici@@ al T@@ U@@ I Th@@ om@@ s@@ on g@@ ol@@ f bo@@ o@@ king ag@@ enc@@ y for M@@ all@@ or@@ ca . +this holiday l@@ et@@ t@@ ings in M@@ a@@ j@@ or@@ ca Spain , is located in S@@ a ... +apartments for rent in Cala R@@ at@@ jada , M@@ a@@ j@@ or@@ ca . this ... +self cat@@ er@@ ing holiday in n@@ ice apartment of 9@@ 0@@ s@@ q@@ m in Cala ... +Cala R@@ at@@ jada is located on a sm@@ all ro@@ c@@ k@@ y p@@ en@@ in@@ su@@ la on the ex@@ tre@@ me n@@ or@@ th e@@ ast cor@@ ner of M@@ all@@ or@@ ca , ... +here you will find all k@@ in@@ ds of Cala R@@ at@@ jada ac@@ comm@@ o@@ d@@ ation and lo@@ d@@ g@@ ing , pl@@ ac@@ es to st@@ ay from to@@ p ... +there are pl@@ ent@@ y of things to do and see in R@@ at@@ jada . +Cala r@@ at@@ a@@ jada 's ma@@ in t@@ own beach is called " S@@ on M@@ ol@@ l . " +Cala R@@ at@@ jada is a f@@ an@@ t@@ as@@ ti@@ c place to pr@@ ac@@ ti@@ se ou@@ t@@ do@@ or sp@@ or@@ ts . +it go@@ es without s@@ ay@@ ing that e@@ at@@ ing and di@@ r@@ in@@ king are im@@ por@@ t@@ ant holiday pa@@ st@@ times while you are ... +in Cala R@@ at@@ jada you are s@@ ure to find a n@@ igh@@ t@@ cl@@ u@@ b you like . +Ch@@ ar@@ ter a y@@ ac@@ ht or bo@@ at in Cala R@@ at@@ jada for your hol@@ id@@ ays , it will be a dec@@ is@@ i@@ on you will not ... +there are many qu@@ es@@ tions you may need ans@@ w@@ ers to when pl@@ an@@ ning your hol@@ id@@ ays in the be@@ au@@ ti@@ ful ... +an easy way to get an o@@ ver@@ all look at Cala R@@ at@@ jada is with pan@@ or@@ am@@ ic ph@@ ot@@ os . +go@@ ing sh@@ op@@ p@@ ing in Cala R@@ at@@ jada is a great op@@ tion . 
+Cala R@@ at@@ jada off@@ ers some very li@@ v@@ ely n@@ igh@@ t@@ li@@ fe with pl@@ ent@@ y of b@@ ar@@ s , cl@@ u@@ b@@ s and dis@@ co@@ s st@@ ay@@ ing open well into the ear@@ ly h@@ our@@ s . +one of the most p@@ op@@ ul@@ ar dis@@ co@@ s is called " ph@@ y@@ s@@ ical . " it 's p@@ op@@ ul@@ ar with both loc@@ als and holiday ma@@ k@@ ers . +they pl@@ ay a wi@@ de vari@@ ety of m@@ us@@ ic and re@@ g@@ ul@@ ar@@ ly hold f@@ o@@ am par@@ ties and la@@ z@@ er sh@@ ow@@ s . +I h@@ ere@@ by ac@@ know@@ le@@ d@@ ge having full know@@ le@@ d@@ ge of the bo@@ o@@ king ter@@ ms . +compl@@ et@@ e g@@ u@@ i@@ de of 3@@ 2@@ 5@@ 2 c@@ amp@@ sites in F@@ r@@ ance . for e@@ ach c@@ am@@ p @-@ site conta@@ in@@ ing the cl@@ as@@ si@@ f@@ ication , the serv@@ ices , the h@@ ir@@ ings , the lo@@ d@@ g@@ ing and the sit@@ u@@ ation . +our m@@ ul@@ tic@@ r@@ it@@ er@@ i@@ on res@@ earch as@@ si@@ st@@ ant allo@@ ws you to ref@@ ine your res@@ earch while ans@@ w@@ er@@ ing sim@@ ple qu@@ es@@ tions . +at the hear@@ t of the mat@@ ter is the is@@ su@@ e of tr@@ ust : tr@@ ust in in@@ sti@@ t@@ u@@ tions , in coun@@ ter@@ par@@ ties , in the m@@ ark@@ et , not le@@ ast , in information . +the in@@ v@@ est@@ ment str@@ at@@ e@@ g@@ y of C@@ re@@ d@@ it S@@ u@@ is@@ se F@@ un@@ d ( Lu@@ x ) G@@ lo@@ b@@ al R@@ es@@ p@@ on@@ sible E@@ qui@@ ties f@@ oc@@ us@@ es on ap@@ pro@@ pri@@ ate cap@@ it@@ al g@@ ro@@ w@@ th in com@@ b@@ in@@ ation with en@@ v@@ ir@@ on@@ m@@ ent@@ al and so@@ ci@@ al con@@ si@@ der@@ ations . +c@@ op@@ y@@ right © 19@@ 9@@ 7 - 200@@ 9 C@@ R@@ E@@ D@@ I@@ T S@@ U@@ I@@ S@@ S@@ E G@@ R@@ O@@ U@@ P and / or its a@@ ff@@ il@@ i@@ at@@ es . +read@@ s an enti@@ re file into an ar@@ r@@ ay . +n@@ ote : cont@@ ex@@ t support was ad@@ d@@ ed with PHP 5@@ .@@ 0@@ .@@ 0 . for a des@@ c@@ ri@@ p@@ tion of cont@@ ex@@ ts , ref@@ er to S@@ tre@@ am F@@ un@@ c@@ tions . 
+n@@ ote : if PHP is not pro@@ per@@ ly rec@@ og@@ n@@ iz@@ ing the line end@@ ings when re@@ ading files ei@@ ther on or cre@@ ated by a M@@ ac@@ in@@ t@@ os@@ h com@@ puter , en@@ abl@@ ing the au@@ to _ det@@ ect _ line _ end@@ ings ru@@ n @-@ time conf@@ ig@@ ur@@ ation op@@ tion may help res@@ ol@@ ve the pro@@ bl@@ em . +/ / G@@ et a file into an ar@@ r@@ ay . +/ / A@@ no@@ ther ex@@ am@@ ple , let 's get a we@@ b page into a str@@ ing . see also file _ get _ cont@@ ents ( ) . +when using S@@ S@@ L , M@@ ic@@ ro@@ so@@ ft I@@ I@@ S will vi@@ ol@@ ate the pro@@ to@@ co@@ l by cl@@ os@@ ing the connec@@ tion without s@@ end@@ ing a cl@@ ose _ no@@ ti@@ f@@ y ind@@ ic@@ ator . PHP will re@@ port this as " S@@ S@@ L : f@@ at@@ al P@@ ro@@ to@@ co@@ l E@@ r@@ r@@ or " when you re@@ ach the end of the data . +to work around this , the value of er@@ r@@ or _ re@@ por@@ ting should be lo@@ w@@ er@@ ed to a l@@ ev@@ el that do@@ es not incl@@ u@@ de war@@ n@@ ings . +PHP 4@@ .@@ 3@@ .@@ 7 and h@@ igh@@ er can det@@ ect bu@@ g@@ g@@ y I@@ I@@ S ser@@ ver software when you open the st@@ re@@ am using the h@@ t@@ t@@ p@@ s : / / wr@@ app@@ er and will sup@@ p@@ ress the war@@ ning . when using f@@ so@@ c@@ k@@ open ( ) to cre@@ ate an s@@ s@@ l : / / so@@ c@@ ket , the dev@@ el@@ op@@ er is res@@ p@@ on@@ sible for det@@ ec@@ ting and sup@@ pres@@ s@@ ing this war@@ ning . +UpdateStar l@@ ets you st@@ ay up to dat@@ e and sec@@ ure with the pers@@ onal software inst@@ all@@ ations on your P@@ C . +the F@@ ree E@@ di@@ tion pro@@ vid@@ es b@@ as@@ ic pro@@ t@@ ec@@ tion ag@@ ain@@ st ou@@ t@@ dat@@ ed ma@@ j@@ or ver@@ si@@ ons of your software with li@@ m@@ ited features . +UpdateStar Premium d@@ eli@@ vers 20 times more updates . +it d@@ eli@@ vers all m@@ in@@ or and ma@@ j@@ or updates for your software set@@ up . the Premium s@@ av@@ es you so much time s@@ ear@@ ch@@ ing for all the ne@@ w@@ ly available updates every day . 
+go Premium r@@ is@@ k @-@ free with our un@@ con@@ di@@ tional 3@@ 0 day mon@@ e@@ y bac@@ k g@@ u@@ ar@@ an@@ te@@ e and let the Premium E@@ di@@ tion th@@ or@@ ou@@ g@@ h@@ ly d@@ eli@@ ver all of your P@@ C 's updates . +UpdateStar Premium is available st@@ and @-@ al@@ one and b@@ un@@ d@@ l@@ ed with other world @-@ cl@@ as@@ s software produc@@ ts from our pro@@ mo@@ tions we@@ b@@ page . cor@@ por@@ ate users use our v@@ ol@@ u@@ me licens@@ ing op@@ tions . +the UpdateStar c@@ li@@ ent off@@ ers acc@@ ess to our com@@ pre@@ h@@ en@@ si@@ ve datab@@ ase with more than 2@@ 5@@ 9@@ ,@@ 00@@ 0 software rec@@ og@@ n@@ iz@@ ed produc@@ ts . as our datab@@ ase is user @-@ d@@ ri@@ ven , sa@@ fe and man@@ ta@@ ined by the users it is con@@ st@@ an@@ tly g@@ ro@@ wing and cur@@ ren@@ tly the most compl@@ et@@ e software datab@@ ase around . +UpdateStar Premium d@@ eli@@ vers 20 times more updates and up@@ gr@@ ad@@ es , wh@@ ere@@ as the F@@ ree only d@@ eli@@ vers ma@@ j@@ or updates . +p@@ remium ad@@ ds im@@ por@@ t@@ ant features such as compl@@ et@@ e software ma@@ in@@ ten@@ ance , sec@@ ur@@ ity ad@@ vis@@ ory , f@@ re@@ qu@@ ent m@@ in@@ or up@@ gr@@ ad@@ e ver@@ si@@ ons , ex@@ por@@ ts and im@@ por@@ ts , s@@ ch@@ ed@@ ul@@ ing and more . +go Premium for only $ 2@@ 9@@ .@@ 9@@ 5 and st@@ o@@ p mis@@ s@@ ing all of your P@@ C 's software updates . +we off@@ er our c@@ ust@@ om@@ ers a 1@@ 00 % c@@ ust@@ om@@ er s@@ ati@@ s@@ f@@ ac@@ tion or mon@@ e@@ y @-@ bac@@ k @-@ g@@ u@@ ar@@ an@@ te@@ e . +UpdateStar Premium can be licens@@ ed st@@ and @-@ al@@ one , but also com@@ es b@@ un@@ d@@ l@@ ed with world @-@ cl@@ as@@ s software produc@@ ts . +for more information , ple@@ ase vis@@ it our pro@@ mo@@ tions we@@ b@@ page . +UpdateStar trans@@ ac@@ tions are proc@@ ess@@ ed vi@@ a our ecommerce part@@ ner cl@@ ever@@ b@@ ri@@ d@@ ge . 
+our st@@ ore sup@@ por@@ ts a vari@@ ety of p@@ ay@@ ment op@@ tions includ@@ ing cre@@ d@@ it c@@ ar@@ ds , ch@@ e@@ qu@@ es , and P@@ ay@@ P@@ al . +all trans@@ ac@@ tion comm@@ un@@ ication is enc@@ r@@ yp@@ ted and st@@ ore@@ d sec@@ u@@ re@@ ly . +ple@@ ase make s@@ ure to ch@@ ec@@ k our cur@@ rent UpdateStar S@@ u@@ ite pro@@ mo@@ tional off@@ ers to get the b@@ est de@@ al for you . +so@@ on after you have re@@ qu@@ est@@ ed your Premium tri@@ al license or purch@@ ased your Premium license you will receive an e@@ ma@@ il conta@@ in@@ ing inst@@ all@@ ation in@@ struc@@ tions and your pers@@ onal L@@ ic@@ ens@@ e K@@ e@@ y to regi@@ st@@ er UpdateStar . +if you are having pro@@ bl@@ ems with your order you can cont@@ act cl@@ ever@@ b@@ ri@@ d@@ ge c@@ ust@@ om@@ er serv@@ ice . +it has been t@@ est@@ ed on Windows 200@@ 0 , Windows X@@ P , and Windows V@@ i@@ st@@ a . +simpl@@ y d@@ ou@@ ble @-@ cl@@ ick the downlo@@ ad@@ ed file to inst@@ all it . +UpdateStar F@@ ree and UpdateStar Premium are included come with the same inst@@ all@@ er . +UpdateStar includ@@ es support for many l@@ ang@@ u@@ ag@@ es such as English , Ger@@ man , F@@ ren@@ ch , I@@ t@@ al@@ i@@ an , H@@ un@@ g@@ ari@@ an , R@@ us@@ si@@ an and many more . +you can choose your l@@ ang@@ u@@ age set@@ t@@ ings from with@@ in the prog@@ ra@@ m . +you can purchase your UpdateStar Premium up@@ gr@@ ad@@ e license ( 1 year license ) for only $ 19@@ .@@ 9@@ 5 di@@ rec@@ tly for@@ m our on@@ line sh@@ o@@ p or choose your f@@ av@@ or@@ ite UpdateStar pro@@ mo@@ tional off@@ er to get your f@@ av@@ or@@ ite de@@ al for your license re@@ ne@@ w@@ al . +if you are up@@ gr@@ ading from a pre@@ vi@@ ous version of UpdateStar , simpl@@ y inst@@ all the downlo@@ ad@@ ed version - your licens@@ ing information will be re@@ ta@@ ined and Premium features will be ac@@ tiv@@ ated . 
+if you first un@@ inst@@ alled your cur@@ rent UpdateStar , you '@@ ll need to re @-@ enter your license ke@@ y information to acc@@ ess the Premium features . +apartments | H@@ ous@@ es | A@@ par@@ th@@ ot@@ els | Hot@@ els | Last Minute Offers ! +self cat@@ er@@ ing apartment in C@@ on@@ il de la F@@ ron@@ ter@@ a for 6 ... +this P@@ u@@ er@@ to de S@@ an@@ t@@ a M@@ ari@@ a be@@ ac@@ h@@ f@@ ron@@ t fl@@ at in Cadiz , Spain ... +located in the G@@ ol@@ f c@@ our@@ se " U@@ r@@ b@@ an@@ iz@@ ac@@ i@@ on C@@ ig@@ ü@@ e@@ ñ@@ a V@@ I , " ... +holiday ch@@ al@@ et in C@@ on@@ il for 6 people in the are@@ a of E@@ l ... +A@@ t@@ t@@ ribu@@ tion - Y@@ ou must at@@ t@@ ribu@@ te the work in the man@@ ner speci@@ fied by the au@@ th@@ or or licens@@ or ( but not in any way that su@@ g@@ g@@ ests that they end@@ or@@ se you or your use of the work ) . +the page you c@@ ame from conta@@ ined em@@ be@@ d@@ d@@ ed licens@@ ing m@@ et@@ ad@@ at@@ a , includ@@ ing how the cre@@ ator w@@ ish@@ es to be at@@ t@@ ribu@@ ted for re @-@ use . +you can use the H@@ T@@ M@@ L here to c@@ ite the work . +do@@ ing so will also incl@@ u@@ de m@@ et@@ ad@@ at@@ a on your page so that o@@ th@@ ers can find the or@@ ig@@ in@@ al work as well . +N@@ on@@ commercial - Y@@ ou may not use this work for commercial pur@@ pos@@ es . +wa@@ i@@ ver - An@@ y of the ab@@ o@@ ve con@@ di@@ tions can be wa@@ i@@ ved if you get per@@ mis@@ sion from the c@@ op@@ y@@ right hol@@ der . 
+in ad@@ di@@ tion to the right of licens@@ ors to re@@ qu@@ est rem@@ o@@ v@@ al of their name from the work when used in a der@@ i@@ v@@ ative or col@@ l@@ ec@@ tive they don 't like , c@@ op@@ y@@ right la@@ ws in most j@@ ur@@ is@@ d@@ ic@@ tions around the world ( with the n@@ ot@@ able exc@@ ep@@ tion of the U@@ S exc@@ ep@@ t in very li@@ m@@ ited c@@ ir@@ c@@ um@@ st@@ anc@@ es ) gr@@ ant cre@@ at@@ ors " m@@ or@@ al righ@@ ts " which may pro@@ vi@@ de some re@@ d@@ ress if a der@@ i@@ v@@ ative work re@@ pres@@ ents a " der@@ og@@ at@@ ory tre@@ at@@ ment " of the licens@@ or 's work . +publ@@ ic@@ ity righ@@ ts allo@@ w in@@ di@@ vid@@ u@@ als to cont@@ ro@@ l how their vo@@ ice , image or li@@ k@@ en@@ ess is used for commercial pur@@ pos@@ es in publ@@ ic . if a C@@ C @-@ licens@@ ed work includ@@ es the vo@@ ice or image of an@@ y@@ one other than the licens@@ or , a user of the work may need to get per@@ mis@@ sion from those in@@ di@@ vid@@ u@@ als before using the work for commercial pur@@ pos@@ es . +it is simpl@@ y a h@@ and@@ y ref@@ er@@ ence for un@@ der@@ st@@ an@@ ding the Le@@ gal C@@ o@@ de ( the full license ) - it is a h@@ um@@ an @-@ read@@ able ex@@ pres@@ sion of some of its ke@@ y ter@@ ms . th@@ in@@ k of it as the user @-@ f@@ ri@@ end@@ ly inter@@ face to the Le@@ gal C@@ o@@ de b@@ en@@ e@@ ath . +A@@ t@@ t@@ ribu@@ tion - Y@@ ou must at@@ t@@ ribu@@ te the work in the man@@ ner speci@@ fied by the au@@ th@@ or or licens@@ or ( but not in any way that su@@ g@@ g@@ ests that they end@@ or@@ se you or your use of the work ) . +the page you c@@ ame from conta@@ ined em@@ be@@ d@@ d@@ ed licens@@ ing m@@ et@@ ad@@ at@@ a , includ@@ ing how the cre@@ ator w@@ ish@@ es to be at@@ t@@ ribu@@ ted for re @-@ use . +you can use the H@@ T@@ M@@ L here to c@@ ite the work . +do@@ ing so will also incl@@ u@@ de m@@ et@@ ad@@ at@@ a on your page so that o@@ th@@ ers can find the or@@ ig@@ in@@ al work as well . 
+sh@@ are A@@ like - I@@ f you al@@ ter , trans@@ for@@ m , or bu@@ il@@ d upon this work , you may distribu@@ te the res@@ ul@@ ting work only un@@ der the same , sim@@ il@@ ar or a com@@ pa@@ ti@@ ble license . +wa@@ i@@ ver - An@@ y of the ab@@ o@@ ve con@@ di@@ tions can be wa@@ i@@ ved if you get per@@ mis@@ sion from the c@@ op@@ y@@ right hol@@ der . +righ@@ ts other pers@@ ons may have ei@@ ther in the work it@@ self or in how the work is used , such as publ@@ ic@@ ity or pri@@ v@@ ac@@ y righ@@ ts . +no@@ ti@@ ce - F@@ or any re@@ use or distribution , you must make cl@@ e@@ ar to o@@ th@@ ers the license ter@@ ms of this work . +the b@@ est way to do this is with a lin@@ k to this we@@ b page . +C@@ C licens@@ es an@@ ti@@ ci@@ p@@ ate that a licens@@ or may want to wa@@ i@@ ve compl@@ i@@ ance with a speci@@ f@@ ic con@@ di@@ tion , such as at@@ t@@ ribu@@ tion . +all j@@ ur@@ is@@ d@@ ic@@ tions allo@@ w some li@@ m@@ ited us@@ es of c@@ op@@ y@@ righ@@ ted mat@@ er@@ i@@ al without per@@ mis@@ sion . +C@@ C licens@@ es do not a@@ ff@@ ect the righ@@ ts of users un@@ der those c@@ op@@ y@@ right li@@ m@@ it@@ ations and exc@@ ep@@ tions , such as fa@@ ir use and fa@@ ir de@@ al@@ ing wh@@ ere ap@@ pl@@ ic@@ able . +in ad@@ di@@ tion to the right of licens@@ ors to re@@ qu@@ est rem@@ o@@ v@@ al of their name from the work when used in a der@@ i@@ v@@ ative or col@@ l@@ ec@@ tive they don 't like , c@@ op@@ y@@ right la@@ ws in most j@@ ur@@ is@@ d@@ ic@@ tions around the world ( with the n@@ ot@@ able exc@@ ep@@ tion of the U@@ S exc@@ ep@@ t in very li@@ m@@ ited c@@ ir@@ c@@ um@@ st@@ anc@@ es ) gr@@ ant cre@@ at@@ ors " m@@ or@@ al righ@@ ts " which may pro@@ vi@@ de some re@@ d@@ ress if a der@@ i@@ v@@ ative work re@@ pres@@ ents a " der@@ og@@ at@@ ory tre@@ at@@ ment " of the licens@@ or 's work . 
+publ@@ ic@@ ity righ@@ ts allo@@ w in@@ di@@ vid@@ u@@ als to cont@@ ro@@ l how their vo@@ ice , image or li@@ k@@ en@@ ess is used for commercial pur@@ pos@@ es in publ@@ ic . +if a C@@ C @-@ licens@@ ed work includ@@ es the vo@@ ice or image of an@@ y@@ one other than the licens@@ or , a user of the work may need to get per@@ mis@@ sion from those in@@ di@@ vid@@ u@@ als before using the work for commercial pur@@ pos@@ es . +it is simpl@@ y a h@@ and@@ y ref@@ er@@ ence for un@@ der@@ st@@ an@@ ding the Le@@ gal C@@ o@@ de ( the full license ) - it is a h@@ um@@ an @-@ read@@ able ex@@ pres@@ sion of some of its ke@@ y ter@@ ms . th@@ in@@ k of it as the user @-@ f@@ ri@@ end@@ ly inter@@ face to the Le@@ gal C@@ o@@ de b@@ en@@ e@@ ath . +this De@@ ed it@@ self has no le@@ gal value , and its cont@@ ents do not ap@@ pe@@ ar in the ac@@ t@@ ual license . +cre@@ ative Com@@ m@@ ons is not a law fir@@ m and do@@ es not pro@@ vi@@ de le@@ gal serv@@ ices . +distribu@@ ting of , dis@@ play@@ ing of , or lin@@ king to this Com@@ m@@ ons De@@ ed do@@ es not cre@@ ate an at@@ t@@ or@@ ney @-@ c@@ li@@ ent re@@ la@@ tion@@ sh@@ i@@ p . +this is a h@@ um@@ an @-@ read@@ able su@@ m@@ m@@ ary of the Le@@ gal C@@ o@@ de ( the full license ) . +use this license for your own work . +a new version of this license is available . +you should use it for new works , and you may want to re@@ license ex@@ i@@ sting works un@@ der it . +no works are a@@ ut@@ om@@ ati@@ c@@ ally put un@@ der the new license , h@@ ow@@ ever . +r@@ un@@ ning at the S@@ pe@@ ed of L@@ ig@@ ht ... +all other el@@ em@@ ents © 200@@ 9 D@@ C Com@@ ic@@ s . +S@@ O@@ E and the S@@ O@@ E lo@@ go are regi@@ st@@ er@@ ed tr@@ ad@@ em@@ ar@@ ks of S@@ on@@ y O@@ n@@ line En@@ ter@@ ta@@ in@@ ment L@@ L@@ C . 
+“ P@@ lay@@ S@@ t@@ ation ” and “ P@@ S ” F@@ amil@@ y lo@@ go are regi@@ st@@ er@@ ed tr@@ ad@@ em@@ ar@@ ks and “ PS@@ 3 ” is a tr@@ ad@@ em@@ ark of S@@ on@@ y Com@@ puter En@@ ter@@ ta@@ in@@ ment In@@ c . all other tr@@ ad@@ em@@ ar@@ ks and tr@@ ad@@ e n@@ ames are the pro@@ per@@ ty of their res@@ p@@ ec@@ tive ow@@ ners . +the r@@ at@@ ings ic@@ on is a regi@@ st@@ er@@ ed tr@@ ad@@ em@@ ark of the En@@ ter@@ ta@@ in@@ ment S@@ o@@ ftware A@@ s@@ so@@ ci@@ ation . +D@@ C U@@ N@@ I@@ V@@ E@@ R@@ S@@ E and all rel@@ ated char@@ ac@@ t@@ ers and el@@ em@@ ents are tr@@ ad@@ em@@ ar@@ ks of and © D@@ C Com@@ ic@@ s . +and as we know , em@@ o@@ tions are good for b@@ us@@ in@@ ess . +in this sec@@ tion you will find information on our produc@@ ts and the lic@@ ence con@@ di@@ tions . +ple@@ ase sel@@ ect a produc@@ t from the gr@@ ap@@ hi@@ c on the le@@ ft . +this sec@@ tion hol@@ ds the most gener@@ al qu@@ es@@ tions about PHP : what it is and what it do@@ es . +can I ru@@ n s@@ ever@@ al ver@@ si@@ ons of PHP at the same time ? +what are the differ@@ enc@@ es bet@@ we@@ en PHP 3 and PHP 4 ? +what are the differ@@ enc@@ es bet@@ we@@ en PHP 4 and PHP 5 ? +I th@@ in@@ k I f@@ ound a bu@@ g ! who should I t@@ ell ? +much of its s@@ y@@ n@@ ta@@ x is b@@ or@@ ro@@ w@@ ed from C , J@@ av@@ a and P@@ er@@ l with a c@@ ou@@ ple of un@@ i@@ qu@@ e PHP @-@ speci@@ f@@ ic features th@@ ro@@ w@@ n in . +the go@@ al of the l@@ ang@@ u@@ age is to allo@@ w we@@ b dev@@ el@@ op@@ ers to wr@@ ite d@@ y@@ n@@ am@@ ic@@ ally gener@@ ated pa@@ ges qu@@ ic@@ k@@ ly . +this conf@@ us@@ es many people because the first wor@@ d of the ac@@ ron@@ y@@ m is the ac@@ ron@@ y@@ m . +this typ@@ e of ac@@ ron@@ y@@ m is called a rec@@ ur@@ si@@ ve ac@@ ron@@ y@@ m . +for more information , the c@@ u@@ ri@@ ous can vis@@ it " F@@ ree O@@ n @-@ L@@ ine Dictionary of Com@@ pu@@ ting or the " W@@ i@@ k@@ i@@ pe@@ di@@ a entr@@ y on rec@@ ur@@ si@@ ve ac@@ ron@@ y@@ ms . 
+PHP / F@@ I 2.@@ 0 is an ear@@ ly and no l@@ ong@@ er sup@@ por@@ ted version of PHP . +PHP 3 is the su@@ c@@ c@@ ess@@ or to PHP / F@@ I 2.@@ 0 and is a lo@@ t n@@ ic@@ er . +PHP 5 is the cur@@ rent gener@@ ation of PHP , which us@@ es the " Z@@ end en@@ g@@ ine 2 which , among other things , off@@ ers many ad@@ di@@ tional O@@ O@@ P features . +ple@@ ase see the " W@@ h@@ at 's new in PHP 4 o@@ ver@@ vi@@ ew for a detail@@ ed ex@@ pl@@ an@@ ation of these features and more . +while PHP 5 was pur@@ pos@@ ely des@@ ig@@ ned to be as com@@ pa@@ ti@@ ble as pos@@ sible with pre@@ vi@@ ous ver@@ si@@ ons , there are some si@@ g@@ ni@@ f@@ ic@@ ant ch@@ ang@@ es . +for more detail@@ ed information , ple@@ ase vi@@ ew the sec@@ tion on M@@ ig@@ r@@ at@@ ing from PHP 4 to PHP 5 and the sec@@ tion on B@@ ac@@ k@@ war@@ ds In@@ com@@ pa@@ ti@@ ble Ch@@ ang@@ es . +you should go to the PHP B@@ u@@ g D@@ at@@ ab@@ ase and make s@@ ure the bu@@ g is@@ n 't a know@@ n bu@@ g . +if you don 't see it in the datab@@ ase , use the re@@ por@@ ting for@@ m to re@@ port the bu@@ g . +it is im@@ por@@ t@@ ant to use the bu@@ g datab@@ ase inst@@ e@@ ad of just s@@ end@@ ing an e@@ ma@@ il to one of the ma@@ il@@ ing li@@ st@@ s because the bu@@ g will have a tr@@ ac@@ king numb@@ er as@@ si@@ g@@ ned and it will then be pos@@ sible for you to go bac@@ k l@@ at@@ er and ch@@ ec@@ k on the st@@ at@@ us of the bu@@ g . +ta@@ g and the cor@@ res@@ p@@ on@@ d@@ ant H@@ T@@ T@@ P content typ@@ e . +n@@ ote : n@@ ote that J@@ P@@ C and J@@ P@@ 2 are cap@@ able of having com@@ p@@ on@@ ents with different b@@ it de@@ p@@ th@@ s . +in this c@@ ase , the value for " b@@ its " is the h@@ igh@@ est b@@ it de@@ p@@ th en@@ coun@@ ter@@ ed . +also , J@@ P@@ 2 files may conta@@ in m@@ ul@@ ti@@ ple J@@ P@@ E@@ G 200@@ 0 co@@ de@@ st@@ re@@ am@@ s . 
+in this c@@ ase , ge@@ ti@@ ma@@ g@@ es@@ iz@@ e ( ) re@@ tur@@ n@@ s the val@@ ues for the first co@@ de@@ st@@ re@@ am it en@@ coun@@ t@@ ers in the ro@@ o@@ t of the file . +n@@ ote : the information about ic@@ ons are re@@ tri@@ ev@@ ed from the ic@@ on with the h@@ igh@@ est b@@ it@@ r@@ ate . +it can ref@@ er@@ ence a loc@@ al file or ( conf@@ ig@@ ur@@ ation per@@ m@@ it@@ ting ) a rem@@ ote file using one of the sup@@ por@@ ted st@@ re@@ am@@ s . +this op@@ tional par@@ am@@ et@@ er allo@@ ws you to ex@@ tr@@ act some ex@@ t@@ end@@ ed information from the image file . +cur@@ ren@@ tly , this will re@@ tur@@ n the different J@@ P@@ G A@@ P@@ P m@@ ark@@ ers as an as@@ so@@ ci@@ ative ar@@ r@@ ay . +some pro@@ gr@@ am@@ s use these A@@ P@@ P m@@ ark@@ ers to emb@@ ed t@@ ex@@ t information in ima@@ ges . +a very comm@@ on one is to emb@@ ed " I@@ P@@ T@@ C information in the A@@ P@@ P@@ 1@@ 3 m@@ ark@@ er . +you can use the i@@ p@@ t@@ c@@ par@@ se ( ) func@@ tion to par@@ se the b@@ in@@ ary A@@ P@@ P@@ 1@@ 3 m@@ ark@@ er into som@@ e@@ thing read@@ able . +re@@ tur@@ n@@ s an ar@@ r@@ ay with 7 el@@ em@@ ents . +in@@ de@@ x 0 and 1 conta@@ ins res@@ p@@ ec@@ tiv@@ ely the w@@ id@@ th and the he@@ ig@@ ht of the image . +n@@ ote : some form@@ at@@ s may conta@@ in no image or may conta@@ in m@@ ul@@ ti@@ ple ima@@ ges . +in these c@@ ases , ge@@ ti@@ ma@@ g@@ es@@ iz@@ e ( ) might not be able to pro@@ per@@ ly det@@ er@@ mine the image si@@ z@@ e@@ . ge@@ ti@@ ma@@ g@@ es@@ iz@@ e ( ) will re@@ tur@@ n z@@ er@@ o for w@@ id@@ th and he@@ ig@@ ht in these c@@ ases . +in@@ de@@ x 2 is one of the I@@ M@@ A@@ G@@ E@@ T@@ Y@@ P@@ E _ X@@ X@@ X con@@ st@@ an@@ ts ind@@ ic@@ at@@ ing the typ@@ e of the image . +in@@ de@@ x 3 is a t@@ ex@@ t str@@ ing with the cor@@ rec@@ t he@@ ig@@ ht = " y@@ y@@ y " w@@ id@@ th = " x@@ x@@ x " str@@ ing that can be used di@@ rec@@ tly in an I@@ M@@ G ta@@ g . 
+ch@@ an@@ n@@ els will be 3 for R@@ G@@ B p@@ ic@@ t@@ ures and 4 for C@@ M@@ Y@@ K p@@ ic@@ t@@ ures . +support for J@@ P@@ C , J@@ P@@ 2 , J@@ P@@ X , J@@ B@@ 2 , X@@ B@@ M , and W@@ B@@ M@@ P bec@@ ame available . +n@@ ote : this func@@ tion do@@ es not re@@ qu@@ ir@@ e the G@@ D image li@@ br@@ ary . +vi@@ de@@ o for Windows A@@ V@@ I , Qu@@ ic@@ k@@ time M@@ O@@ V , M@@ P@@ E@@ G M@@ P@@ G , Windows Media V@@ i@@ de@@ o W@@ M@@ V or AS@@ F , etc . ) , you will find the ge@@ ti@@ d@@ 3 li@@ br@@ ary to be in@@ dis@@ p@@ en@@ sible . +now every De@@ pos@@ it@@ files user can uplo@@ ad his file even more eas@@ i@@ er , sa@@ f@@ er and at any time ! +there is available a new version of De@@ pos@@ it@@ files U@@ plo@@ ad@@ er 1@@ .@@ 3@@ .@@ 15 with m@@ ul@@ ti@@ uplo@@ ad func@@ tion ! +it allo@@ ws to uplo@@ ad up to 1@@ 0 files sim@@ ul@@ t@@ an@@ e@@ ous@@ ly ! +- uplo@@ ading proc@@ ess is both easy and pl@@ eas@@ ent ! +* I@@ n order not to lo@@ ose your files we re@@ comm@@ end to uplo@@ ad these into your acc@@ ount . +go to " Op@@ tions " in the m@@ en@@ u in the " A@@ c@@ coun@@ t " ta@@ g and typ@@ e in your lo@@ g@@ in and pas@@ s@@ wor@@ d ( if you don 't have an acc@@ ount with De@@ pos@@ it@@ files , you can regi@@ st@@ er one here - this is t@@ ot@@ ally free ) . +* T@@ he prog@@ ra@@ m a@@ ut@@ om@@ ati@@ c@@ ally s@@ pl@@ its b@@ i@@ g files ( more then 1@@ 00 M@@ b ) on@@ to sm@@ all@@ er par@@ ts with help of ar@@ ch@@ i@@ ver so that you could ad@@ d the files b@@ ig@@ g@@ er then 1@@ 00 M@@ b to the qu@@ e@@ ue for uplo@@ ad . +to s@@ w@@ it@@ ch this op@@ tion on , ple@@ ase ind@@ ic@@ ate the rou@@ te to the Win@@ ra@@ r ar@@ ch@@ i@@ ver on your com@@ puter . +to do this ple@@ ase ch@@ ec@@ k the bo@@ x with " com@@ p@@ ress files b@@ ig@@ g@@ er then ... " and ind@@ ic@@ ate the rou@@ te to the fol@@ der with ar@@ ch@@ i@@ ver . +the ar@@ ch@@ i@@ ver can be downlo@@ ad@@ ed here . 
+ex@@ port the lin@@ ks to the uplo@@ ad@@ ed fil@@ es@@ . you can set up y@@ our@@ self the form@@ at of the lin@@ ks which is more con@@ v@@ ini@@ ent ! +- Y@@ ou can choose the way to ex@@ port the lin@@ ks to uplo@@ ad@@ ed files : ei@@ ther to c@@ op@@ y to c@@ li@@ p@@ bo@@ ard or to sa@@ ve it as t@@ ex@@ t file . to do this , ple@@ ase go to " Op@@ tions " in the m@@ en@@ u and in the ta@@ g " M@@ ain " choose " c@@ li@@ p@@ bo@@ ard " or " n@@ ot@@ e@@ pa@@ d . " +- Y@@ ou can choose the form@@ at of the lin@@ ks to uplo@@ ad@@ ed files . +any lin@@ k form@@ at @-@ choose any that 's con@@ v@@ ini@@ ent for you ! +if you want to uplo@@ ad a file b@@ ig@@ g@@ er then 1@@ 00@@ M@@ b - just ad@@ d this to the uplo@@ ad qu@@ e@@ ue and the prog@@ ra@@ m will off@@ er s@@ pl@@ it@@ ting the file into sm@@ all@@ er files and then will ad@@ d these to the uplo@@ ad qu@@ e@@ ue ! +p@@ ay at@@ ten@@ tion that for this func@@ tion to func@@ tion pro@@ per@@ ly , you need to go to " Op@@ tions , " choose su@@ b@@ m@@ en@@ u " M@@ ain " and ind@@ ic@@ ate the rou@@ te to the ar@@ ch@@ i@@ ver prog@@ ra@@ m , m@@ en@@ tion@@ ing the m@@ in@@ im@@ um si@@ z@@ e of the file and m@@ ark the op@@ tion if you w@@ ish to ac@@ tiv@@ ate it . +also ple@@ ase p@@ ay at@@ ten@@ tion to such op@@ tions as : m@@ in@@ im@@ iz@@ e to tr@@ ay , tran@@ sp@@ a@@ ren@@ c@@ y of the prog@@ ra@@ m 's pan@@ el , op@@ tion of st@@ art@@ ing and st@@ op@@ p@@ ing uplo@@ ading according to the set@@ t@@ ings you make , etc . +for@@ um@@ s about S@@ an J@@ u@@ an de lo@@ s T@@ er@@ r@@ er@@ os - Al@@ mer@@ í@@ a . +for@@ um@@ s about C@@ on@@ il de la F@@ ron@@ ter@@ a - C@@ á@@ diz . +for@@ um@@ s about H@@ or@@ t@@ a - I@@ s@@ la F@@ a@@ i@@ al - A@@ z@@ ore@@ s . +for@@ um@@ s about E@@ l M@@ oc@@ an@@ al - E@@ l H@@ i@@ er@@ r@@ o . +for@@ um@@ s about L@@ a R@@ est@@ ing@@ a - E@@ l H@@ i@@ er@@ r@@ o . 
+for@@ um@@ s about T@@ en - be@@ l , ur@@ b@@ an@@ iz@@ ac@@ i@@ on - T@@ en@@ er@@ i@@ fe . +for@@ um@@ s about P@@ lay@@ a s@@ on b@@ ou - I@@ b@@ iz@@ a . +is a str@@ ing ob@@ j@@ ect that can have z@@ er@@ o or more val@@ ues , e@@ ach of which must be ch@@ os@@ en from a l@@ ist of allo@@ w@@ ed val@@ ues speci@@ fied when the ta@@ ble is cre@@ ated . +M@@ ember val@@ ues in the ta@@ ble de@@ fin@@ i@@ tion when a ta@@ ble is cre@@ ated . +col@@ um@@ n are dis@@ play@@ ed using the l@@ et@@ ter@@ c@@ ase that was used in the col@@ um@@ n de@@ fin@@ i@@ tion . +col@@ um@@ n@@ s can be as@@ si@@ g@@ ned a char@@ ac@@ ter set and col@@ l@@ ation . +for b@@ in@@ ary or c@@ ase @-@ s@@ en@@ si@@ tive col@@ la@@ tions , l@@ et@@ ter@@ c@@ ase is ta@@ ken into acc@@ ount when as@@ si@@ g@@ ning val@@ ues to the col@@ um@@ n . +val@@ ues n@@ u@@ mer@@ ic@@ ally , with the lo@@ w @-@ order b@@ it of the st@@ ore@@ d value cor@@ res@@ p@@ on@@ ding to the first set m@@ ember . +value in a n@@ u@@ mer@@ ic cont@@ ex@@ t , the value re@@ tri@@ ev@@ ed has b@@ its set cor@@ res@@ p@@ on@@ ding to the set m@@ emb@@ ers that make up the col@@ um@@ n value . +col@@ um@@ n , the b@@ its that are set in the b@@ in@@ ary re@@ pres@@ ent@@ ation of the numb@@ er det@@ er@@ mine the set m@@ emb@@ ers in the col@@ um@@ n value . +el@@ ement , it do@@ es not mat@@ ter what order the el@@ em@@ ents are li@@ st@@ ed in when you in@@ ser@@ t the value . +it also do@@ es not mat@@ ter how many times a given el@@ ement is li@@ st@@ ed in the value . +when the value is re@@ tri@@ ev@@ ed l@@ at@@ er , e@@ ach el@@ ement in the value app@@ ears on@@ ce , with el@@ em@@ ents li@@ st@@ ed according to the order in which they were speci@@ fied at ta@@ ble cre@@ ation time . +an@@ y@@ wh@@ ere , even as a su@@ b@@ str@@ ing of an@@ other set m@@ ember . +the first of these st@@ at@@ em@@ ents lo@@ o@@ ks for val@@ ues conta@@ in@@ ing the first set m@@ ember . 
+the sec@@ on@@ d lo@@ o@@ ks for an ex@@ act mat@@ ch . +be c@@ are@@ ful with comparis@@ ons of the sec@@ on@@ d typ@@ e . +you should speci@@ f@@ y the val@@ ues in the same order they are li@@ st@@ ed in the col@@ um@@ n de@@ fin@@ i@@ tion . +my st@@ art@@ l@@ ing com@@ ic D@@ i@@ e z@@ we@@ i l@@ us@@ ti@@ g@@ en R@@ a@@ ver with the two j@@ ol@@ ly ra@@ vers A@@ ci@@ d & E has been publ@@ ished in the a@@ w@@ es@@ om@@ e bo@@ ok S@@ ha@@ ke Y@@ our T@@ ree # 3 . +I was dra@@ wing this st@@ ory with M@@ ic@@ ro@@ so@@ ft P@@ ain@@ t@@ br@@ us@@ h . +ex@@ am@@ ine the or@@ ig@@ in@@ al dra@@ w@@ ings in full si@@ z@@ e ! +there is also some H@@ u@@ pe@@ l P@@ u@@ pe@@ l in the bo@@ ok ! +one be@@ d@@ ro@@ om A@@ part@@ ments and S@@ t@@ u@@ di@@ os | 2 B@@ ed@@ ro@@ om A@@ part@@ ments | 3 or more B@@ ed@@ ro@@ om@@ s | S@@ pan@@ ish V@@ il@@ l@@ as | Hot@@ els | Host@@ els | Last Minute Offers ! +this is the ma@@ in loc@@ ation for k@@ it@@ es@@ ur@@ f in this part of Cadiz pro@@ v@@ ince . +beach with g@@ ol@@ den s@@ ands , located in the re@@ sidenti@@ al are@@ a ne@@ x@@ t to the C@@ as@@ ti@@ ll@@ o ( c@@ a@@ st@@ l@@ e ) of S@@ an ... +this beach is sit@@ u@@ ated ne@@ x@@ t to the ru@@ ins of the C@@ as@@ ti@@ ll@@ o de S@@ an@@ t@@ a C@@ at@@ al@@ in@@ a , of s@@ av@@ age as@@ p@@ ect , ... +beach on the port of f@@ ine g@@ ol@@ den s@@ ands and s@@ em@@ i @-@ ur@@ b@@ an char@@ a@@ k@@ ter with exc@@ ell@@ ent serv@@ ices and ... +the E@@ l L@@ ev@@ an@@ te beach , also know@@ n as L@@ os T@@ or@@ u@@ ñ@@ os is a beach is@@ ol@@ ated from the ur@@ b@@ an c@@ entr@@ e of the ... +s@@ em@@ i @-@ ur@@ b@@ an , very b@@ us@@ y , l@@ ar@@ ge beach of g@@ ol@@ den s@@ ands . +this place has many vis@@ it@@ ors . here you can en@@ jo@@ y n@@ au@@ ti@@ c sp@@ or@@ ts , f@@ ish@@ ing , w@@ in@@ d @-@ s@@ ur@@ f , k@@ ay@@ a@@ k etc . +located in Ch@@ ic@@ l@@ an@@ a de la F@@ ron@@ ter@@ a , this holiday h@@ om@@ e for ... 
+located in C@@ on@@ il de la F@@ ron@@ ter@@ a , An@@ d@@ al@@ us@@ ia , this holiday ... +the T@@ o@@ p 7@@ 0 F@@ in@@ al@@ i@@ st@@ s sh@@ or@@ tl@@ ist has been sel@@ ec@@ ted from the ent@@ ri@@ es su@@ b@@ m@@ it@@ ted to the e@@ L@@ ear@@ ning A@@ war@@ ds in 200@@ 8 . +if you are among the sh@@ or@@ t@@ li@@ st@@ ed pro@@ j@@ ect , you can incl@@ u@@ de a lo@@ go &apos@@ ; T@@ O@@ P 7@@ 0 &apos@@ ; on your we@@ b@@ site . +in the sp@@ irit of ne@@ w@@ ness , let 's take a look at the only dec@@ k in the T@@ o@@ p 8 of T@@ ur@@ in that made use of the new A@@ ren@@ a G@@ r@@ and M@@ el@@ e@@ e c@@ ar@@ ds . +b@@ as@@ ic@@ ally , to have tr@@ ue cont@@ ro@@ l over the t@@ em@@ p@@ o of the g@@ ame is like being a p@@ ol@@ ic@@ em@@ an cont@@ ro@@ ll@@ ing tr@@ a@@ ff@@ ic . +if you are able to su@@ c@@ c@@ ess@@ ful@@ ly put up a st@@ o@@ p si@@ g@@ n for your op@@ p@@ on@@ ent 's pl@@ ans and / or con@@ si@@ st@@ ent@@ ly make a gre@@ en l@@ ig@@ ht for y@@ our@@ self , you have ac@@ hi@@ ev@@ ed t@@ em@@ p@@ o ad@@ v@@ an@@ ta@@ ge . +we just go@@ t a h@@ u@@ ge in@@ fl@@ u@@ x of new play@@ abl@@ es ( the new set is simpl@@ y a@@ w@@ es@@ om@@ e ! +) , and ever@@ y@@ one is s@@ c@@ r@@ am@@ bl@@ ing to det@@ er@@ mine the b@@ est way to use them . +apartments | Hot@@ els | Host@@ els | C@@ amp@@ ings | Th@@ ing to do | Last Minute Offers ! +holiday l@@ et@@ t@@ ings available on the C@@ ost@@ a de@@ l S@@ ol of the ... +this fl@@ at for rent in B@@ en@@ al@@ ma@@ d@@ en@@ a , M@@ al@@ ag@@ a is located in the ... 
+the Euro@@ L@@ in@@ u@@ x Al@@ li@@ ance for a F@@ ree In@@ formation In@@ f@@ ra@@ struc@@ t@@ ure is an open co@@ al@@ i@@ tion of commercial com@@ pan@@ ies and n@@ on @-@ pro@@ f@@ it as@@ so@@ ci@@ ations un@@ ited to pro@@ m@@ ote and pro@@ t@@ ect a v@@ ig@@ ou@@ rou@@ s Euro@@ pean S@@ o@@ ftware C@@ ul@@ t@@ ure based on c@@ op@@ y@@ right , open st@@ and@@ ar@@ ds , open com@@ pe@@ ti@@ tion and open s@@ our@@ ce software such as L@@ in@@ u@@ x . +cor@@ por@@ ate m@@ emb@@ ers or spons@@ ors of Euro@@ L@@ in@@ u@@ x dev@@ el@@ o@@ p or sel@@ l software un@@ der free , s@@ em@@ i @-@ free and n@@ on @-@ free licens@@ es for op@@ er@@ at@@ ing systems such as G@@ N@@ U / L@@ in@@ u@@ x , M@@ ac@@ O@@ S or M@@ S Windows . +for the last few years the Euro@@ pean P@@ at@@ ent Off@@ ice ( E@@ P@@ O ) has , con@@ tr@@ ary to the l@@ et@@ ter and sp@@ irit of the ex@@ i@@ sting law , gr@@ an@@ ted more than 3@@ 00@@ 00 pat@@ ents on r@@ ul@@ es of or@@ g@@ an@@ is@@ ation and c@@ al@@ c@@ ul@@ ation cl@@ a@@ im@@ ed in ter@@ ms of gener@@ al @-@ pur@@ p@@ ose com@@ pu@@ ting e@@ qui@@ p@@ ment , called " pro@@ gr@@ am@@ s for com@@ pu@@ t@@ ers " in the law of 19@@ 7@@ 3 and " com@@ puter @-@ im@@ pl@@ em@@ en@@ ted in@@ v@@ en@@ tions " in E@@ P@@ O N@@ e@@ w@@ spea@@ k since 200@@ 0 . +Euro@@ p@@ e 's pat@@ ent mo@@ v@@ ement is pres@@ s@@ ing to le@@ gi@@ ti@@ m@@ ate this pr@@ ac@@ ti@@ se by writ@@ ing a new law . +al@@ th@@ ou@@ gh the pat@@ ent mo@@ v@@ ement has l@@ ost ma@@ j@@ or b@@ at@@ tl@@ es in N@@ o@@ v@@ ember 200@@ 0 and S@@ ep@@ t@@ ember 200@@ 3 , Euro@@ p@@ e 's pro@@ gr@@ am@@ m@@ ers and ci@@ ti@@ z@@ en@@ s are sti@@ ll f@@ ac@@ ing con@@ si@@ der@@ able r@@ is@@ ks . +here you find the b@@ as@@ ic do@@ c@@ um@@ ent@@ ation , st@@ art@@ ing from the l@@ at@@ est ne@@ ws and a sh@@ ort o@@ ver@@ vi@@ ew . 
+the pat@@ ent mo@@ v@@ ement has d@@ uring s@@ ever@@ al dec@@ ad@@ es w@@ on the support of l@@ ar@@ ge cor@@ por@@ ations and go@@ ver@@ n@@ ments for its ex@@ pan@@ si@@ on@@ ist c@@ ause . +y@@ et F@@ F@@ I@@ I , Euro@@ lin@@ u@@ x and o@@ th@@ ers have dev@@ ot@@ ed the@@ m@@ sel@@ ves to this work with con@@ si@@ der@@ able su@@ c@@ c@@ ess . +sti@@ ll , we cont@@ in@@ ue to have more t@@ as@@ ks than free h@@ ands . +here we t@@ ell you how you can help us mo@@ ve for@@ w@@ ard more qu@@ ic@@ k@@ ly . +a datab@@ ase of the mon@@ op@@ ol@@ ies on pro@@ gr@@ am@@ m@@ ing pro@@ bl@@ ems , which the Euro@@ pean P@@ at@@ ent Off@@ ice has gr@@ an@@ ted ag@@ ain@@ st the l@@ et@@ ter and sp@@ irit of the ex@@ i@@ sting la@@ ws , and about which it is un@@ su@@ ff@@ ici@@ ent@@ ly in@@ form@@ ing the publ@@ ic , d@@ eli@@ ver@@ ing only ch@@ un@@ ks of gr@@ ap@@ h@@ ical data h@@ id@@ den be@@ h@@ in@@ d in@@ put m@@ as@@ ks . +the F@@ F@@ I@@ I software pat@@ ent wor@@ k@@ grou@@ p is tr@@ y@@ ing to s@@ ing@@ l@@ e out the software pat@@ ents , make them bet@@ ter acc@@ es@@ sible and sh@@ ow their e@@ ff@@ ects on software dev@@ el@@ op@@ ment . +d@@ uring the last few years , the Euro@@ pean P@@ at@@ ent Off@@ ice ( E@@ P@@ O ) has gr@@ an@@ ted s@@ ever@@ al 1@@ 00@@ 00 pat@@ ents on com@@ puter @-@ im@@ pl@@ em@@ en@@ ted r@@ ul@@ es of or@@ g@@ an@@ is@@ ation and c@@ al@@ c@@ ul@@ ation , i@@ .@@ e@@ . pro@@ gr@@ am@@ s for com@@ pu@@ t@@ ers &#@@ 9@@ 1@@ ; as such &#@@ 9@@ 3@@ ; . +we are syst@@ e@@ ma@@ tic@@ ally col@@ l@@ ec@@ ting these pat@@ ents and re@@ publ@@ ish@@ ing them in a more acc@@ es@@ sible for@@ m . +having been app@@ o@@ in@@ ted as the ex@@ cl@@ us@@ i@@ ve distribu@@ t@@ or for the S@@ u@@ fi@@ x br@@ and in the U@@ K , S@@ hi@@ man@@ o is now able to off@@ er one of the most com@@ pre@@ h@@ en@@ si@@ ve mon@@ o , b@@ ra@@ id & ho@@ ok@@ lin@@ k r@@ ang@@ es in the m@@ ark@@ et . 
+ag@@ ed 2@@ 9 and from W@@ est B@@ ay , D@@ or@@ set , D@@ a@@ ve st@@ ar@@ ted f@@ ish@@ ing at the t@@ end@@ er age of 5 and jo@@ ined the W@@ est B@@ ay S@@ e@@ a An@@ gl@@ ing C@@ l@@ u@@ b when he was 1@@ 4 , go@@ ing on to w@@ in pre@@ t@@ ty much every t@@ ro@@ ph@@ y there was . +" S@@ hi@@ man@@ o are ple@@ ased to an@@ n@@ oun@@ ce the si@@ g@@ ning of A@@ le@@ x B@@ on@@ es , to b@@ ol@@ st@@ er our al@@ read@@ y im@@ pres@@ si@@ ve mat@@ ch line @-@ up . +S@@ hi@@ man@@ o is ple@@ ased to a an@@ n@@ oun@@ ce the app@@ o@@ in@@ t@@ ment of D@@ ar@@ r@@ an G@@ ou@@ l@@ der to their con@@ su@@ l@@ t@@ ant te@@ am . +S@@ hi@@ man@@ o is p@@ rou@@ d to an@@ n@@ oun@@ ce avail@@ ab@@ il@@ ity of its new cat@@ al@@ og@@ ues for 200@@ 9 . +S@@ e@@ on k@@ at@@ õ@@ go@@ or@@ i@@ an om@@ ma@@ q j@@ ä@@ r@@ g@@ m@@ ä@@ d@@ se@@ q 19 le@@ he@@ k@@ ü@@ l@@ ge ( k@@ ok@@ k@@ o 19 ) . +S@@ E@@ O le@@ ht om vi@@ im@@ ä@@ te mu@@ ud@@ õ@@ t 0@@ 7 : 5@@ 9 , 2@@ 5@@ . le@@ he@@ k@@ u@@ u 200@@ 9 . +h@@ ous@@ es | A@@ part@@ ments | Hot@@ els | Th@@ ings to do | Last Minute Offers ! +Formigal also know@@ n as For@@ n@@ ig@@ al in A@@ ra@@ g@@ on is a sm@@ all t@@ own in the pro@@ v@@ ince of H@@ u@@ es@@ ca of N@@ or@@ th@@ er@@ n ... +if you '@@ re lo@@ o@@ king for things to do in Formigal , Py@@ re@@ nees in Spain the ma@@ in at@@ tr@@ ac@@ tion here ... +the c@@ li@@ m@@ ate in Formigal is that of the Py@@ re@@ nees in A@@ ra@@ g@@ on , Spain , col@@ d in the w@@ in@@ ter and m@@ il@@ d ... +we are am@@ ong@@ st one of the ma@@ in sk@@ i res@@ or@@ ts in the I@@ b@@ er@@ i@@ an P@@ en@@ n@@ in@@ su@@ la and the Py@@ r@@ nees . +b@@ el@@ ow these lin@@ es we off@@ er you some information of inter@@ est in Formigal , Py@@ re@@ nees , Spain . +to spea@@ k of n@@ at@@ ure in Formigal is to spea@@ k of the T@@ en@@ a V@@ all@@ e@@ y , the Py@@ re@@ nees and in@@ cre@@ di@@ ble are@@ as ... 
+Formigal is a sk@@ i res@@ ort located in a sm@@ all m@@ un@@ ici@@ p@@ al@@ ity of about 2@@ 00 people . +there are vari@@ ous sp@@ or@@ ts you can pr@@ ac@@ ti@@ ce in Formigal , Spain . +if you want to get a great me@@ al in Formigal , it will not be di@@ ff@@ ic@@ ul@@ t as there are vari@@ ous ... +Formigal , Spain in the Py@@ r@@ nees b@@ el@@ ong@@ s to the pro@@ v@@ ince of A@@ ra@@ g@@ on and is located in the ... +S@@ all@@ ent de G@@ all@@ e@@ go is a sm@@ all t@@ own in the pro@@ v@@ ince of H@@ u@@ es@@ ca in the Py@@ re@@ nees in Spain that has ... +Formigal is located in S@@ all@@ ent de G@@ all@@ e@@ go , in the Py@@ re@@ nees of Spain . +Formigal also know@@ n as For@@ n@@ ig@@ al in A@@ ra@@ g@@ on is a sm@@ all t@@ own in the pro@@ v@@ ince of H@@ u@@ es@@ ca of N@@ or@@ th@@ er@@ n Spain in the hear@@ t of the Py@@ re@@ nees . +it is part of the loc@@ al@@ ity of S@@ all@@ ent de G@@ all@@ e@@ go and has a p@@ op@@ ul@@ ation of about 2@@ 00 in@@ ha@@ b@@ it@@ an@@ ts , al@@ th@@ ou@@ gh d@@ uring the sk@@ i se@@ as@@ on this am@@ ount is m@@ ul@@ ti@@ pl@@ i@@ ed by 5 . +Formigal is located at only a few k@@ il@@ om@@ et@@ ers from the F@@ ren@@ ch b@@ order , located at 9@@ 0 k@@ il@@ om@@ et@@ ers from H@@ u@@ es@@ ca . +it is located in the be@@ au@@ ti@@ ful T@@ en@@ a V@@ all@@ e@@ y and is one of the most im@@ por@@ t@@ ant and b@@ ig@@ g@@ est sk@@ i res@@ or@@ ts in all of Spain . +the cap@@ ac@@ ity of this sk@@ i res@@ ort is for ap@@ pro@@ x@@ im@@ at@@ ely 3@@ 0@@ ,@@ 00@@ 0 people . +A@@ r@@ am@@ on Formigal is the S@@ k@@ i res@@ ort in Formigal , and this res@@ ort is located bet@@ we@@ en 1@@ 5@@ 00 @-@ 2@@ 2@@ 5@@ 0 m@@ et@@ ers ab@@ o@@ ve se@@ a l@@ ev@@ el , and has more than 1@@ 00 k@@ il@@ om@@ et@@ ers of sk@@ i@@ able s@@ lo@@ p@@ es . +this res@@ ort has all typ@@ es of e@@ qui@@ p@@ ment such as sk@@ i li@@ f@@ ts , and con@@ v@@ e@@ y@@ er b@@ el@@ ts to get around . 
+it has the cap@@ ac@@ ity to allo@@ w 2@@ 5@@ ,@@ 00@@ 0 people sk@@ i e@@ ach h@@ our , and has serv@@ ices for mat@@ er@@ i@@ al ren@@ t@@ al , re@@ st@@ a@@ ur@@ an@@ ts , ac@@ comm@@ o@@ d@@ ation , ac@@ comm@@ o@@ d@@ ation such as hot@@ els and apartments , he@@ al@@ th cl@@ in@@ ic@@ s , and a vari@@ ety of s@@ lo@@ p@@ es speci@@ al for s@@ now@@ bo@@ ar@@ ding , s@@ l@@ al@@ om and t@@ u@@ b@@ ers . +located in the hi@@ st@@ or@@ ic city c@@ enter of la V@@ il@@ la de S@@ all@@ ent , hot@@ el B@@ al@@ a@@ it@@ us is a tr@@ a@@ di@@ tional m@@ oun@@ ta@@ in lo@@ d@@ g@@ ing . the ou@@ t@@ si@@ de is cl@@ as@@ si@@ c in st@@ y@@ l@@ e with an ar@@ ch of st@@ one . +sit@@ u@@ ated in H@@ u@@ es@@ ca the hot@@ el is located in an are@@ a of n@@ at@@ ur@@ al be@@ a@@ ut@@ y , the hot@@ el off@@ ers an am@@ b@@ i@@ ence of pe@@ ace and tran@@ qu@@ il@@ l@@ ity in the hear@@ t of the Py@@ re@@ nees . +it is located in an exc@@ ep@@ tion loc@@ ation . very co@@ z@@ y h@@ om@@ e and well e@@ qui@@ p@@ p@@ ed with all of the nec@@ essi@@ ties for your hol@@ id@@ ays . +this holiday apartment in Formigal , Py@@ re@@ nees in n@@ or@@ th@@ er@@ n Spain is located right by the s@@ lo@@ p@@ es and very c@@ ent@@ ri@@ c . +ne@@ ar the su@@ per@@ m@@ ark@@ ets , b@@ ar@@ s , re@@ st@@ a@@ ur@@ an@@ ts and much much more ! +this fl@@ at for rent in B@@ en@@ al@@ ma@@ d@@ en@@ a , M@@ al@@ ag@@ a is located in the ... +self cat@@ er@@ ing holiday in n@@ ice apartment of 9@@ 0@@ s@@ q@@ m in Cala ... +hot@@ els | H@@ ol@@ iday apartments | Host@@ els | R@@ ent a C@@ ar | Last Minute Offers ! +re@@ al est@@ ate , con@@ struc@@ tions , trans@@ f@@ ers et@@ c@@ . in F@@ u@@ en@@ gi@@ ro@@ la . +if you are s@@ ear@@ ch@@ ing for pl@@ ac@@ es to st@@ ay on the C@@ ost@@ a de@@ l ... +these F@@ u@@ en@@ gi@@ ro@@ la apartment ren@@ t@@ als in M@@ al@@ ag@@ a , Spain are ... +this st@@ u@@ di@@ o for rent in T@@ or@@ rem@@ ol@@ in@@ os , M@@ al@@ ag@@ a is only a very ... 
+on@@ ce res@@ er@@ v@@ ations have been con@@ fir@@ m@@ ed we k@@ ind@@ ly as@@ k you for a down p@@ ay@@ ment of € 3@@ 00 , -@@ - p@@ er ro@@ om to be rem@@ it@@ ted to our acc@@ ount with R@@ a@@ i@@ ff@@ e@@ is@@ en@@ b@@ an@@ k E@@ h@@ r@@ w@@ al@@ d . +in c@@ ase of p@@ rem@@ at@@ ure de@@ part@@ ure or l@@ ate ar@@ ri@@ v@@ al we will char@@ ge ro@@ om r@@ at@@ es as bo@@ ok@@ ed . +we acc@@ ep@@ t c@@ as@@ h , tr@@ av@@ ell@@ ers ch@@ e@@ qu@@ es , E@@ C@@ - , V@@ is@@ a@@ - or M@@ a@@ st@@ er@@ c@@ ard as well as ad@@ v@@ ance b@@ an@@ k trans@@ f@@ er . +we would like to ad@@ vis@@ e you that it is only pos@@ sible to bo@@ ok in@@ di@@ vid@@ ual ro@@ om cat@@ e@@ g@@ or@@ ies . res@@ er@@ v@@ ations of speci@@ f@@ ic ro@@ om@@ s or f@@ lo@@ ors are not acc@@ ep@@ ted . +ro@@ om@@ s are available for g@@ u@@ ests from 3 p@@ .@@ m@@ . +g@@ u@@ ests are as@@ k@@ ed to v@@ ac@@ ate ro@@ om@@ s by 11 a@@ .@@ m@@ . +with your res@@ er@@ v@@ ation you can take out ho@@ g@@ ast holiday in@@ s@@ ur@@ ance to co@@ ver you for c@@ anc@@ ell@@ ation and other un@@ f@@ o@@ res@@ e@@ en ev@@ ent@@ u@@ al@@ i@@ ties . +this statistic is based on the 68@@ 15 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 0@@ 1 M@@ ar 200@@ 8 . +the ex@@ act sp@@ e@@ ed v@@ ar@@ ies de@@ p@@ end@@ ing on the system conf@@ ig@@ ur@@ ation , software prog@@ ra@@ m , and do@@ c@@ u@@ ment compl@@ ex@@ ity . +time ne@@ ed@@ ed before pr@@ in@@ ting after power s@@ w@@ it@@ ch@@ ed on ; ex@@ pres@@ sed in sec@@ on@@ ds . +si@@ z@@ e of p@@ all@@ ets ( W@@ id@@ th x D@@ ep@@ th x H@@ e@@ ig@@ ht ) in m@@ illi@@ m@@ et@@ er . +all pers@@ onal data is enc@@ r@@ yp@@ ted and will be proc@@ ess@@ ed in a sec@@ ure way . +H@@ R@@ O@@ S ta@@ k@@ es the pri@@ v@@ ac@@ y of your pers@@ onal data very ser@@ i@@ ous . +your pers@@ onal information will only be used to proc@@ ess your bo@@ o@@ king . 
+for more information , re@@ ad our pri@@ v@@ ac@@ y st@@ at@@ ement . +H@@ R@@ O@@ S will not char@@ ge you any res@@ er@@ v@@ ation fe@@ es for ma@@ king this bo@@ o@@ king , nor char@@ ge your cre@@ d@@ it c@@ ard . +you will simpl@@ y p@@ ay for your st@@ ay at the hot@@ el . +c@@ anc@@ ell@@ ation is free of char@@ ge ; pro@@ vid@@ ed you ad@@ here to the no@@ ti@@ f@@ ication per@@ i@@ od st@@ ated in the hot@@ el c@@ anc@@ ell@@ ation p@@ ol@@ ic@@ y ( see " Hot@@ el P@@ ol@@ ic@@ ies " b@@ el@@ ow ) . +more information can be f@@ ound in our ter@@ ms and con@@ di@@ tions . +see your off@@ ers here to@@ o ? regi@@ st@@ er on@@ line ( free ) ! +this statistic is based on the 68@@ 6@@ 2 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 1@@ 7 S@@ ep 200@@ 5 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 6@@ 8 free Open ICEcat users . +this statistic is based on the 68@@ 6@@ 5 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 1@@ 4 J@@ ul 200@@ 5 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 7@@ 1 free Open ICEcat users . +this statistic is based on the 68@@ 2@@ 7 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 2@@ 3 J@@ ul 200@@ 6 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 3@@ 6 free Open ICEcat users . +this statistic is based on the 68@@ 7@@ 0 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 2@@ 4 S@@ ep 200@@ 5 . 
+only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 7@@ 6 free Open ICEcat users . +this statistic is based on the 68@@ 2@@ 8 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 2@@ 4 S@@ ep 200@@ 5 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 3@@ 6 free Open ICEcat users . +this statistic is based on the 68@@ 6@@ 3 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 2@@ 4 O@@ c@@ t 200@@ 6 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 6@@ 9 free Open ICEcat users . +this statistic is based on the 68@@ 19 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 0@@ 2 N@@ o@@ v 200@@ 6 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 2@@ 8 free Open ICEcat users . +this statistic is based on the 68@@ 5@@ 9 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 15 N@@ o@@ v 200@@ 6 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 6@@ 5 free Open ICEcat users . +this statistic is based on the 68@@ 6@@ 1 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 20 M@@ ar 200@@ 8 . +this statistic is based on the 68@@ 6@@ 2 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 11 M@@ ar 200@@ 7 . 
+only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 6@@ 8 free Open ICEcat users . +this statistic is based on the 68@@ 6@@ 4 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 15 J@@ ul 200@@ 5 . +ICEcat : cre@@ at@@ ing the world 's l@@ ar@@ g@@ est open cat@@ al@@ og@@ ue with 2@@ 3@@ 00@@ 00@@ 0 produc@@ ts , 6@@ 0@@ 4@@ 8@@ 6@@ 5 data @-@ she@@ ets , 4@@ 00@@ 1 brands . +work t@@ ow@@ ar@@ ds the h@@ ar@@ mon@@ is@@ ation of ter@@ m@@ in@@ ol@@ ogy in the fi@@ el@@ d of a@@ ud@@ it . +from its very begin@@ ning E@@ U@@ R@@ O@@ S@@ A@@ I has been ac@@ tive in or@@ g@@ an@@ iz@@ ing a f@@ ru@@ it@@ ful and m@@ ut@@ u@@ ally b@@ en@@ e@@ f@@ ici@@ al co@@ op@@ er@@ ation in the fi@@ el@@ d of publ@@ ic a@@ ud@@ it among all the coun@@ tri@@ es of the pre@@ vi@@ ous@@ ly di@@ vid@@ ed Euro@@ p@@ e , pri@@ or@@ i@@ ti@@ z@@ ing support for the est@@ abl@@ ish@@ ment of in@@ de@@ p@@ end@@ ent publ@@ ic a@@ ud@@ it bo@@ di@@ es in C@@ entr@@ al and E@@ a@@ st@@ er@@ n Euro@@ pean tran@@ si@@ tion coun@@ tri@@ es . +d@@ ri@@ vers and Updat@@ es includ@@ es h@@ ar@@ d@@ ware d@@ ri@@ vers ( e@@ .@@ g@@ . , N@@ I @-@ D@@ A@@ Q , N@@ I @-@ 4@@ 8@@ 8@@ .@@ 2 , N@@ I @-@ I@@ M@@ A@@ Q , etc . ) , fir@@ m@@ ware updates , and ap@@ pl@@ ication software updates ( e@@ .@@ g@@ . , N@@ I L@@ ab@@ V@@ I@@ E@@ W and N@@ I M@@ eas@@ u@@ re@@ ment S@@ t@@ u@@ di@@ o ) . +to ob@@ ta@@ in in@@ str@@ u@@ ment d@@ ri@@ vers for N@@ I produc@@ ts ( e@@ .@@ g@@ . , N@@ I @-@ S@@ C@@ O@@ P@@ E , N@@ I @-@ D@@ M@@ M , etc . +) or for th@@ ir@@ d @-@ part@@ y produc@@ ts ( e@@ .@@ g@@ . , H@@ P 3@@ 4@@ 4@@ 0@@ 1@@ A D@@ ig@@ it@@ al M@@ ul@@ ti@@ m@@ et@@ er ) , vis@@ it the In@@ str@@ u@@ ment D@@ ri@@ ver N@@ et@@ work . 
+my P@@ ro@@ file | R@@ S@@ S | P@@ ri@@ v@@ ac@@ y | Le@@ gal | C@@ on@@ t@@ act N@@ I © 200@@ 9 N@@ a@@ tional In@@ str@@ um@@ ents C@@ or@@ por@@ ation . +it 's the last mon@@ th of the year ... time to look bac@@ k on this year and time to em@@ br@@ ace the new one . +for the K@@ in@@ do te@@ am it was a very exc@@ it@@ ing one and we are b@@ us@@ y wor@@ king in the ar@@ ms of the M@@ y@@ H@@ er@@ it@@ age family . +you can re@@ ad the an@@ n@@ oun@@ c@@ ement here ( p@@ d@@ f ) , or get the compl@@ et@@ e F@@ A@@ Q here , but I w@@ an@@ ted to gi@@ ve you some more bac@@ k@@ grou@@ nd on wh@@ y K@@ in@@ do and M@@ y@@ H@@ er@@ it@@ age have te@@ am@@ ed up . +the first time we m@@ et G@@ il@@ ad , the f@@ oun@@ der and C@@ E@@ O of M@@ y@@ h@@ er@@ it@@ age , was in ear@@ ly 200@@ 7 - a few we@@ e@@ ks before we re@@ le@@ ased the first publ@@ ic version of K@@ in@@ do . +G@@ are@@ th and me were in@@ v@@ ited to have l@@ un@@ ch with him in S@@ o@@ h@@ o in c@@ entr@@ al L@@ on@@ don , and w@@ ent there with the ob@@ j@@ ec@@ tive of lear@@ ning ever@@ y@@ thing we could about the " en@@ em@@ y . " +this pro@@ ved to be pre@@ t@@ ty na@@ i@@ ve , since G@@ il@@ ad is much to@@ o n@@ ice to be called an " en@@ em@@ y . " +but there we were : a S@@ we@@ de , a S@@ ou@@ th A@@ f@@ r@@ ic@@ an and an I@@ s@@ ra@@ el@@ i , all with very different pro@@ f@@ essi@@ onal bac@@ k@@ grou@@ nd and li@@ fe st@@ or@@ ies , t@@ al@@ king about the f@@ ut@@ ure of famil@@ ies on@@ line from very different per@@ sp@@ ec@@ ti@@ ves . +I di@@ d@@ n 't ex@@ p@@ ect this , but we f@@ ound that we had much in comm@@ on . +we sh@@ are@@ d the same i@@ de@@ as and vis@@ i@@ on for what we w@@ an@@ ted to ac@@ hi@@ e@@ ve with our b@@ us@@ in@@ ess@@ es , even th@@ ou@@ gh our ap@@ pro@@ ach was f@@ ar from sim@@ il@@ ar . +K@@ in@@ do had set out to bu@@ il@@ d a site that would help you inter@@ act with the family that is around you here and now . 
+we were tr@@ y@@ ing to come up with to@@ ol@@ s to help you sh@@ are information and comm@@ un@@ ic@@ ate with the people that mat@@ ter most to you right now . +M@@ y@@ H@@ er@@ it@@ age on the other h@@ and had dev@@ el@@ op@@ ed a@@ ma@@ z@@ ing t@@ ec@@ h@@ n@@ ol@@ o@@ gi@@ es to help you find out ever@@ y@@ thing about your family 's hi@@ st@@ ory , and had sp@@ ent years per@@ f@@ ec@@ ting these t@@ ec@@ h@@ n@@ ol@@ o@@ gi@@ es . +ul@@ ti@@ mat@@ ely th@@ ou@@ gh , what inter@@ est@@ ed us both was the op@@ por@@ t@@ un@@ ity to help famil@@ ies dis@@ co@@ ver more about who they are and their p@@ ast , and use the we@@ b to br@@ ing them cl@@ os@@ er to@@ ge@@ ther . +as G@@ are@@ th and I tr@@ av@@ ell@@ ed bac@@ k on the t@@ u@@ be , we t@@ al@@ k@@ ed about how n@@ ice it would be to be able to off@@ er our own users the same to@@ ol@@ s as M@@ y@@ H@@ er@@ it@@ age al@@ read@@ y had . +what re@@ ally go@@ t us exc@@ ited was their S@@ m@@ art@@ M@@ at@@ ch@@ ing T@@ ec@@ h@@ n@@ ol@@ ogy , which mat@@ ch@@ es people in your family t@@ ree with 2@@ 5@@ 0 M@@ illi@@ on other n@@ ames , and su@@ g@@ g@@ ests who you might be rel@@ ated to@@ o . +d@@ uring this su@@ m@@ m@@ er , we '@@ ve been th@@ in@@ king l@@ ong and h@@ ard about the f@@ ut@@ ure of K@@ in@@ do , and what the b@@ est op@@ tion would be for ta@@ king K@@ in@@ do to the ne@@ x@@ t l@@ ev@@ el . +the more time we sp@@ ent with G@@ il@@ ad as well as the rest of the te@@ am in I@@ s@@ ra@@ el ( not to m@@ en@@ tion the very l@@ ou@@ d ro@@ ost@@ er that r@@ un@@ s around in their c@@ am@@ pu@@ s ) , the more con@@ v@@ in@@ c@@ ed we all bec@@ ame - we '@@ ll be bet@@ ter o@@ ff to@@ ge@@ ther . 
+so we jo@@ in the M@@ y@@ H@@ er@@ it@@ age family because we sh@@ are the same vis@@ i@@ on and val@@ ues ( as famil@@ ies should ) , and because we th@@ in@@ k that we can bu@@ il@@ d an a@@ ma@@ z@@ ing produc@@ t to@@ ge@@ ther - br@@ ing@@ ing re@@ al b@@ en@@ e@@ f@@ its to famil@@ ies around the world . +this is what we '@@ re pl@@ an@@ ning to do over the ne@@ x@@ t years . +as you see from the sm@@ il@@ es in the p@@ ic@@ t@@ ure b@@ el@@ ow , this is a h@@ app@@ y day for us here in the P@@ ut@@ ney Off@@ ices . +this years S@@ we@@ d@@ ish G@@ en@@ eal@@ ogy D@@ ays the y@@ ear@@ ly conf@@ er@@ ence of the S@@ we@@ d@@ ish F@@ e@@ der@@ ation of G@@ en@@ eal@@ og@@ ical S@@ o@@ ci@@ e@@ ties was hel@@ d in the city of M@@ al@@ m@@ ö in the pro@@ v@@ ince of S@@ k@@ å@@ n@@ e in sou@@ th@@ er@@ n S@@ we@@ den . +over 1@@ 00 ex@@ i@@ b@@ it@@ ors c@@ ro@@ w@@ d@@ ed in the con@@ v@@ en@@ tion c@@ enter at Euro@@ p@@ ap@@ or@@ ten in M@@ al@@ m@@ ö . ever@@ y@@ thing from gen@@ eal@@ og@@ ical so@@ ci@@ e@@ ties , ar@@ ch@@ i@@ ves to com@@ pan@@ ies o@@ ffer@@ ing or@@ ig@@ in@@ al s@@ our@@ c@@ es on@@ line . +in the me@@ an@@ while , you can sti@@ ll work on your pro@@ files and uplo@@ ad the ph@@ ot@@ os from the we@@ e@@ k@@ end 's family re@@ un@@ i@@ on . +updat@@ e : now ever@@ y@@ thing is fi@@ x@@ ed ! it was the datab@@ ase ac@@ ting a b@@ it we@@ ir@@ d but S@@ te@@ ph@@ en has d@@ one a great jo@@ b to fi@@ x it . +ever@@ y@@ one h@@ its a br@@ ick w@@ all in gen@@ eal@@ ogy , wh@@ ere the b@@ ir@@ th and de@@ ath rec@@ ords just are@@ n 't easy to find . in that c@@ ase , tr@@ y some different rec@@ ords . +de@@ eds o@@ ft@@ en conta@@ in information on who s@@ old what to wh@@ om ; who in@@ h@@ er@@ ited what from wh@@ om ; or how some land was di@@ vid@@ ed among a family . hi@@ st@@ or@@ ic ta@@ x rec@@ ords on land also som@@ e@@ times have inter@@ est@@ ing information , such as c@@ o @-@ ow@@ n@@ er@@ sh@@ i@@ p . 
+just as the s@@ qu@@ ir@@ rel@@ s and different k@@ in@@ ds of b@@ ir@@ ds col@@ l@@ ect n@@ u@@ ts and se@@ eds , som@@ e@@ one in the K@@ in@@ do off@@ ice se@@ ems to be do@@ ing some re@@ ally s@@ ol@@ id for@@ w@@ ard pl@@ an@@ ning . +j@@ ud@@ g@@ ing from this p@@ ic@@ t@@ ure - it 's go@@ ing to be a lo@@ o@@ ong w@@ in@@ ter . +as I p@@ o@@ in@@ t out in my bo@@ ok , G@@ en@@ eal@@ ogy O@@ n@@ line 8@@ th E@@ di@@ tion , you can do a lo@@ t of good res@@ earch using s@@ earch en@@ gin@@ es and Bo@@ ol@@ e@@ an op@@ er@@ at@@ ors ( A@@ N@@ D , N@@ O@@ T , O@@ R , and pa@@ ren@@ th@@ es@@ es . ) R@@ ec@@ ent@@ ly , Y@@ a@@ ho@@ o ! +ad@@ v@@ anc@@ ed G@@ o@@ o@@ gl@@ e sti@@ ll acc@@ ep@@ ts the most p@@ op@@ ul@@ ar Bo@@ ol@@ e@@ an ter@@ ms , and E@@ x@@ al@@ e@@ ad even sup@@ por@@ ts the N@@ E@@ A@@ R op@@ er@@ ator , which re@@ ally hel@@ p@@ s with comm@@ on s@@ ur@@ n@@ ames , but L@@ i@@ ve S@@ earch is now the only ma@@ j@@ or s@@ earch en@@ g@@ ine with full Bo@@ ol@@ e@@ an support . +for more detail@@ s on how to do the Bo@@ ol@@ e@@ an Bo@@ o@@ gi@@ e for gen@@ eal@@ ogy , re@@ ad G@@ en@@ eal@@ ogy O@@ n@@ line 8@@ th E@@ di@@ tion . +most begin@@ ners to gen@@ eal@@ ogy are f@@ u@@ z@@ z@@ y about c@@ ous@@ in@@ sh@@ i@@ p . for ex@@ am@@ ple , while many un@@ der@@ st@@ and first c@@ ous@@ ins are people with comm@@ on gr@@ and@@ pa@@ ren@@ ts , many fol@@ ks conf@@ use first c@@ ous@@ in on@@ ce rem@@ o@@ ved and sec@@ on@@ d c@@ ous@@ in . +then , there are famil@@ ies like mine : my m@@ other 's si@@ bl@@ ings were b@@ or@@ n from 19@@ 11 to 19@@ 3@@ 2 . that me@@ ans h@@ er y@@ oun@@ g@@ est b@@ ro@@ ther " M@@ i@@ ke " w@@ ent to s@@ cho@@ ol with the ol@@ d@@ est si@@ st@@ er 's ol@@ d@@ est chil@@ d , his ni@@ ec@@ e " K@@ are@@ n . 
" +as it tur@@ ned out , U@@ n@@ cl@@ e " M@@ i@@ ke " and ni@@ ec@@ e " K@@ are@@ n " m@@ ar@@ ri@@ ed two people who were si@@ bl@@ ings , " M@@ ic@@ hel@@ l@@ e " and " K@@ ev@@ in , " ma@@ king their children both first c@@ ous@@ ins and first c@@ ous@@ ins on@@ ce rem@@ o@@ ved . +sh@@ ow@@ ing de@@ gre@@ es of re@@ la@@ tion@@ sh@@ i@@ p by bl@@ o@@ od . +-@@ first it do@@ es not re@@ place p@@ ap@@ er do@@ c@@ um@@ ent@@ ation since it do@@ es not name comm@@ on anc@@ est@@ ors , it is more a tool to pro@@ ve lin@@ e@@ age . +s@@ ever@@ al com@@ pan@@ ies , like F@@ amil@@ y T@@ ree D@@ NA , can as@@ s@@ ist you to con@@ fir@@ m your family t@@ ree or allo@@ w you to det@@ er@@ mine wh@@ e@@ ther two people are rel@@ ated if you gi@@ ve them D@@ NA of som@@ e@@ one . +- I@@ t 's l@@ ess re@@ ff@@ ec@@ tive for f@@ em@@ al@@ es to use D@@ NA than for m@@ al@@ es because th@@ an@@ ks to Y @-@ D@@ NA , we can det@@ er@@ mine pat@@ ernal and mat@@ ernal lin@@ es , wh@@ ere@@ as for f@@ em@@ al@@ es it is just mat@@ ernal . +here are a few su@@ g@@ g@@ es@@ tions to get you st@@ ar@@ ted : in@@ t@@ ro@@ duc@@ tion to G@@ en@@ eal@@ ogy from N@@ a@@ tional G@@ en@@ eal@@ og@@ ical S@@ o@@ ci@@ ety ( U@@ .@@ S@@ . ) M@@ ost gen@@ eal@@ o@@ gi@@ st@@ s take this c@@ our@@ se first . it is very A@@ mer@@ ic@@ an @-@ c@@ ent@@ ri@@ c , but the t@@ ec@@ h@@ ni@@ qu@@ es can be ap@@ pl@@ i@@ ed to any coun@@ tr@@ y 's v@@ it@@ al statisti@@ c@@ s . +m@@ emb@@ ers of the N@@ a@@ tional G@@ en@@ eal@@ og@@ ical S@@ o@@ ci@@ ety ( N@@ G@@ S ) receive a t@@ u@@ i@@ tion dis@@ coun@@ t . family H@@ i@@ st@@ ory P@@ er@@ s@@ onal En@@ r@@ ic@@ h@@ ment C@@ l@@ as@@ s@@ es L@@ ear@@ n from B@@ rig@@ ha@@ m Y@@ oun@@ g U@@ ni@@ ver@@ s@@ ity about res@@ earch in the U@@ n@@ ited S@@ t@@ at@@ es , F@@ r@@ ance and Ger@@ many . 
+Lufthansa ow@@ es its or@@ ig@@ ins to " De@@ ut@@ s@@ ch@@ e Lu@@ ft H@@ ansa A@@ k@@ ti@@ en@@ g@@ es@@ ell@@ s@@ ch@@ a@@ ft " ( ren@@ am@@ ed " Lufthansa " in 19@@ 3@@ 3 ) , which is form@@ ed from a mer@@ g@@ er bet@@ we@@ en " De@@ ut@@ s@@ ch@@ e A@@ er@@ o L@@ lo@@ y@@ d " ( D@@ A@@ L ) and " J@@ un@@ k@@ ers Lu@@ ft@@ ver@@ ke@@ h@@ r " on J@@ an@@ u@@ ary 6 . +the new air@@ line in@@ h@@ er@@ its its c@@ ran@@ e lo@@ go , des@@ ig@@ ned by " De@@ ut@@ s@@ ch@@ e Lu@@ ft @-@ R@@ e@@ ed@@ ere@@ i " in 19@@ 19 , from D@@ A@@ L , the bl@@ ue @-@ and @-@ y@@ el@@ lo@@ w h@@ ous@@ e col@@ our@@ s from J@@ un@@ k@@ ers . +it comm@@ enc@@ es s@@ ch@@ ed@@ ul@@ ed fl@@ igh@@ ts on A@@ pri@@ l 6 with a f@@ le@@ et of 1@@ 6@@ 2 air@@ c@@ ra@@ ft , of 1@@ 8 different typ@@ es . +a fl@@ y@@ ing ex@@ pe@@ di@@ tion to Ch@@ in@@ a is the ev@@ ent of the year . +fol@@ lo@@ wing its ac@@ qu@@ is@@ i@@ tion of sh@@ are@@ s in 19@@ 2@@ 6 in the Ger@@ man @-@ R@@ us@@ si@@ an " D@@ er@@ el@@ u@@ ft " air@@ line , which was f@@ oun@@ d@@ ed in 19@@ 2@@ 1 , Lufthansa is in@@ fl@@ u@@ enti@@ al in the f@@ oun@@ ding of the S@@ pan@@ ish I@@ b@@ er@@ ia , the B@@ ra@@ z@@ il@@ i@@ an " S@@ y@@ n@@ d@@ ic@@ a@@ to C@@ on@@ d@@ or " and the Ch@@ in@@ ese " E@@ ur@@ as@@ ia " air@@ lin@@ es . +Lufthansa op@@ en@@ s the first tr@@ ans @-@ oc@@ e@@ an@@ ic , s@@ ch@@ ed@@ ul@@ ed air@@ ma@@ il serv@@ ice ac@@ ro@@ s@@ s the S@@ ou@@ th A@@ tl@@ an@@ ti@@ c . +bet@@ we@@ en 19@@ 3@@ 6 and 19@@ 3@@ 8 , it also ex@@ per@@ i@@ ments with s@@ ch@@ ed@@ ul@@ ed a@@ ir serv@@ ices ac@@ ro@@ s@@ s the N@@ or@@ th A@@ tl@@ an@@ ti@@ c . +after su@@ b@@ st@@ an@@ ti@@ al ex@@ pan@@ sion of the rou@@ te n@@ et@@ work in 19@@ 3@@ 9 — includ@@ ing fl@@ igh@@ ts to B@@ ang@@ k@@ ok and S@@ an@@ ti@@ ag@@ o de Ch@@ ile — war@@ time a@@ ir serv@@ ices , exc@@ ep@@ t for a few Euro@@ pean coun@@ tri@@ es , are s@@ us@@ p@@ end@@ ed . 
+all fl@@ igh@@ ts are dis@@ cont@@ in@@ u@@ ed in 19@@ 4@@ 5 and Lufthansa go@@ es into recei@@ ver@@ sh@@ i@@ p and is fin@@ ally w@@ ound up and struc@@ k from the B@@ er@@ l@@ in commercial regi@@ st@@ er in 19@@ 6@@ 5 . +the F@@ e@@ der@@ al T@@ ran@@ sp@@ ort M@@ ini@@ st@@ er s@@ ets up a wor@@ king comm@@ it@@ te@@ e in 19@@ 5@@ 1 to pre@@ p@@ are for the res@@ um@@ p@@ tion of a@@ ir tr@@ a@@ ff@@ ic in p@@ ost@@ w@@ ar Ger@@ many and entr@@ ust@@ s the jo@@ b of im@@ pl@@ em@@ ent@@ ation to " B@@ ü@@ r@@ o B@@ ong@@ ers , " the off@@ ice he@@ ad@@ ed by H@@ ans M@@ . B@@ ong@@ ers , the tr@@ a@@ ff@@ ic ch@@ i@@ e@@ f of the old Lufthansa in C@@ ol@@ og@@ n@@ e . a new com@@ p@@ any to ru@@ n a@@ ir serv@@ ices and n@@ am@@ ed " A@@ k@@ ti@@ en@@ g@@ es@@ ell@@ s@@ ch@@ a@@ ft f@@ ü@@ r Lu@@ ft@@ ver@@ ke@@ h@@ r@@ s@@ be@@ d@@ ar@@ f " ( Luf@@ ta@@ g ) is f@@ oun@@ d@@ ed in C@@ ol@@ og@@ n@@ e on J@@ an@@ u@@ ary 6 , 19@@ 5@@ 3 . +Lufthansa ent@@ ers the j@@ et age , ini@@ ti@@ ally on l@@ ong @-@ ha@@ ul rou@@ t@@ es , with the ar@@ ri@@ v@@ al in the f@@ le@@ et of the Bo@@ e@@ ing 7@@ 0@@ 7 . +the last of the pro@@ p@@ ell@@ er @-@ d@@ ri@@ ven air@@ c@@ ra@@ ft , a V@@ ic@@ k@@ ers V@@ is@@ coun@@ t , is re@@ ti@@ red in 19@@ 7@@ 1 . +con@@ version to j@@ et air@@ c@@ ra@@ ft cont@@ in@@ ues with the st@@ ar@@ t of fl@@ igh@@ ts on medi@@ um @-@ ha@@ ul rou@@ t@@ es with the Bo@@ e@@ ing 7@@ 2@@ 7 and , on sh@@ ort @-@ ha@@ ul , with the Bo@@ e@@ ing 7@@ 3@@ 7 , the city j@@ et l@@ ar@@ g@@ ely fa@@ th@@ er@@ ed by Lufthansa . +the wi@@ de @-@ bo@@ d@@ y er@@ a be@@ g@@ ins at Lufthansa with the d@@ eli@@ very of its first Bo@@ e@@ ing 7@@ 4@@ 7 j@@ umb@@ o j@@ et in 19@@ 7@@ 0 , l@@ at@@ er to be jo@@ ined by the M@@ c@@ D@@ on@@ n@@ ell @-@ D@@ ou@@ gl@@ as D@@ C@@ 1@@ 0 and the A@@ 3@@ 00 , the first of the j@@ ets from the ne@@ w@@ ly f@@ oun@@ d@@ ed Euro@@ pean air@@ c@@ ra@@ ft@@ ma@@ k@@ er . 
+Lufthansa res@@ um@@ es fl@@ igh@@ ts to B@@ er@@ l@@ in 4@@ 5 years after the end of W@@ or@@ ld W@@ ar T@@ wo fol@@ lo@@ wing Ger@@ many 's re@@ un@@ i@@ f@@ ication . +Lufthansa ma@@ st@@ ers its wor@@ st @-@ ever ec@@ on@@ om@@ ic c@@ r@@ is@@ is with a s@@ we@@ ep@@ ing re@@ ha@@ b@@ il@@ it@@ ation pro@@ gr@@ am@@ me . +the air@@ line , l@@ ar@@ g@@ ely ow@@ ned by the st@@ ate , is pri@@ v@@ ati@@ sed st@@ ep by st@@ ep . +its M@@ R@@ O , c@@ ar@@ go and I@@ T b@@ us@@ in@@ ess@@ es are sp@@ u@@ n o@@ ff as in@@ de@@ p@@ end@@ ent com@@ pan@@ ies . +Lufthansa , A@@ ir C@@ an@@ ada , S@@ A@@ S , T@@ ha@@ i A@@ ir@@ w@@ ays un@@ d U@@ n@@ ited A@@ ir@@ lin@@ es cre@@ ate the " Star Al@@ li@@ ance , " the world 's first m@@ ul@@ ti@@ l@@ at@@ er@@ al air@@ line grou@@ p@@ ing , l@@ at@@ er to be jo@@ ined by other c@@ ar@@ ri@@ ers . +the Lufthansa A@@ vi@@ ation G@@ rou@@ p e@@ qui@@ p@@ s it@@ self for the new m@@ il@@ l@@ en@@ ium , tr@@ ain@@ ing its f@@ oc@@ us on in@@ no@@ v@@ ation and qu@@ al@@ ity . +pl@@ ac@@ ement of ord@@ ers for 15 A@@ ir@@ bu@@ s A@@ 3@@ 8@@ 0 me@@ g@@ al@@ in@@ ers char@@ ts the air@@ line 's p@@ ath into the f@@ ut@@ ure . +even d@@ uring times of c@@ r@@ is@@ is in the av@@ i@@ ation ind@@ u@@ str@@ y , Lufthansa re@@ ma@@ ins on the as@@ c@@ ent : with the " F@@ ut@@ ure Euro@@ pean O@@ per@@ ations " pro@@ gr@@ am@@ me , the air@@ line re@@ or@@ g@@ an@@ is@@ es its regi@@ onal m@@ ark@@ ets , while g@@ ain@@ ing new part@@ ner air@@ lin@@ es to ex@@ p@@ and the Star Al@@ li@@ ance g@@ lo@@ b@@ al rou@@ te n@@ et@@ work . +pas@@ s@@ en@@ g@@ ers en@@ jo@@ y gre@@ at@@ er com@@ for@@ t in a compl@@ et@@ ely rev@@ amp@@ ed B@@ us@@ in@@ ess C@@ l@@ as@@ s with f@@ ast b@@ ro@@ ad@@ b@@ and In@@ ter@@ n@@ et connec@@ tiv@@ ity in the air@@ c@@ ra@@ ft c@@ ab@@ in . 
+Lufthansa cre@@ at@@ es new per@@ sp@@ ec@@ ti@@ ves for Ger@@ many 's f@@ ut@@ ure as a b@@ us@@ in@@ ess loc@@ ation : Lufthansa ord@@ ers 20 Bo@@ e@@ ing 7@@ 4@@ 7 @-@ 8@@ s and is the la@@ un@@ ch c@@ ust@@ om@@ er for the air@@ c@@ ra@@ ft . +pre@@ par@@ ations for the A@@ 3@@ 8@@ 0 incl@@ u@@ de rou@@ te pro@@ ving with A@@ ir@@ bu@@ s , a new A@@ 3@@ 8@@ 0 ma@@ in@@ ten@@ ance h@@ ang@@ ar and a new ter@@ m@@ in@@ al are@@ a in F@@ ran@@ k@@ f@@ ur@@ t . +the Lufthansa A@@ vi@@ ation C@@ enter b@@ ecom@@ es an ar@@ ch@@ it@@ ec@@ tur@@ al fl@@ ag@@ sh@@ i@@ p . +Lufthansa C@@ ar@@ go f@@ oun@@ ds the c@@ ar@@ go air@@ line A@@ er@@ o@@ L@@ o@@ gi@@ c with D@@ H@@ L E@@ x@@ p@@ ress . +f@@ ur@@ ther part@@ ners st@@ ren@@ g@@ then the Star Al@@ li@@ ance , which now en@@ com@@ pas@@ s@@ es 2@@ 1 m@@ emb@@ ers . +ad@@ just set@@ t@@ ings for the connec@@ tion to dev@@ ices that support DLNA . +en@@ able connec@@ tion to dev@@ ices that support DLNA . +dis@@ able connec@@ tion to dev@@ ices that support DLNA . +DLNA ( D@@ ig@@ it@@ al L@@ i@@ ving N@@ et@@ work Al@@ li@@ ance ) is a st@@ and@@ ard that en@@ abl@@ es di@@ g@@ it@@ al dev@@ ices such as pers@@ onal com@@ pu@@ t@@ ers , di@@ g@@ it@@ al vi@@ de@@ o rec@@ ord@@ ers , and T@@ V@@ s to be connec@@ ted on a n@@ et@@ work and to sh@@ are data that is on other connec@@ ted , DLNA @-@ com@@ pa@@ ti@@ ble dev@@ ices . +" serv@@ ers " distribu@@ te medi@@ a such as image , m@@ us@@ ic , or vi@@ de@@ o files , and " c@@ li@@ ents " receive and pl@@ ay the medi@@ a . +using a PS@@ 3 ™ system as a c@@ li@@ ent , you can dis@@ pl@@ ay ima@@ ges , or pl@@ ay m@@ us@@ ic or vi@@ de@@ o files that are st@@ ore@@ d on a dev@@ ice with DLNA Media Ser@@ ver func@@ tion@@ al@@ ity over a n@@ et@@ work . +con@@ n@@ ect the PS@@ 3 ™ system and DLNA Media Ser@@ ver using a wi@@ red or wi@@ rel@@ ess connec@@ tion . +set up the DLNA Media Ser@@ ver so that it can be used by the PS@@ 3 ™ system . 
+the fol@@ lo@@ wing dev@@ ices can be used as DLNA Media Ser@@ vers . +en@@ able the DLNA Media Ser@@ ver func@@ tion of the connec@@ ted dev@@ ice to make its content available for sh@@ are@@ d acc@@ ess . +the set@@ up me@@ th@@ od v@@ ar@@ ies de@@ p@@ end@@ ing on the connec@@ ted dev@@ ice . +for detail@@ s , ref@@ er to the in@@ struc@@ tions sup@@ pl@@ i@@ ed with the dev@@ ice . +a M@@ ic@@ ro@@ so@@ ft ® Windows ® pers@@ onal com@@ puter can be used as a DLNA Media Ser@@ ver by using Windows Media ® P@@ lay@@ er 11 func@@ tions . +from the l@@ ist of dev@@ ices un@@ der the &#@@ 9@@ 1@@ ; S@@ h@@ are medi@@ a &#@@ 9@@ 3@@ ; ch@@ ec@@ k@@ bo@@ x , sel@@ ect the dev@@ ices that you want to sh@@ are data with , and then sel@@ ect &#@@ 9@@ 1@@ ; Al@@ lo@@ w &#@@ 9@@ 3@@ ; . +set@@ up for the DLNA Media Ser@@ ver is compl@@ et@@ ed . +Windows Media ® P@@ lay@@ er 11 is not inst@@ alled by de@@ fa@@ ul@@ t on a M@@ ic@@ ro@@ so@@ ft ® Windows ® pers@@ onal com@@ puter . +downlo@@ ad the inst@@ all@@ er from the M@@ ic@@ ro@@ so@@ ft ® W@@ e@@ b site to inst@@ all Windows Media ® P@@ lay@@ er 11 . +for detail@@ s about how to use Windows Media ® P@@ lay@@ er 11 , ref@@ er to the Windows Media ® P@@ lay@@ er 11 H@@ el@@ p feat@@ ure . +in some c@@ ases , or@@ ig@@ in@@ al DLNA Media Ser@@ ver software may be inst@@ alled on the pers@@ onal com@@ puter . +for detail@@ s , ref@@ er to the in@@ struc@@ tions sup@@ pl@@ i@@ ed with the com@@ puter . +all available fol@@ d@@ ers and files that can be play@@ ed by the PS@@ 3 ™ system will be dis@@ play@@ ed . +sel@@ ect the file that you want to pl@@ ay . +the PS@@ 3 ™ system must be connec@@ ted to a n@@ et@@ work . +the fol@@ der n@@ ames that are dis@@ play@@ ed v@@ ary de@@ p@@ end@@ ing on the DLNA Media Ser@@ ver . +de@@ p@@ end@@ ing on the DLNA Media Ser@@ ver , some files may not be play@@ able or op@@ er@@ ations that can be per@@ form@@ ed d@@ uring play@@ bac@@ k may be re@@ str@@ ic@@ ted . 
+file n@@ ames for data that is st@@ ore@@ d on serv@@ ers that are not compl@@ i@@ ant with DLNA may have an a@@ st@@ er@@ is@@ k app@@ end@@ ed to the file name . +in some c@@ ases , these files c@@ an@@ not be play@@ ed on the PS@@ 3 ™ system . +also , even if the files can be play@@ ed on the PS@@ 3 ™ system , it might not be pos@@ sible to pl@@ ay the files on other dev@@ ices . +you can ini@@ ti@@ ate a s@@ earch for DLNA Media Ser@@ vers on the same n@@ et@@ work . +use this feat@@ ure if no DLNA Media Ser@@ ver is det@@ ec@@ ted when the PS@@ 3 ™ system is tur@@ ned on . +when the s@@ earch res@@ ul@@ ts are dis@@ play@@ ed and you re@@ tur@@ n to the h@@ om@@ e m@@ en@@ u , a l@@ ist of DLNA Media Ser@@ vers that can be connec@@ ted will be dis@@ play@@ ed . +apartments | Hot@@ els | Host@@ el | H@@ ol@@ iday H@@ ous@@ es | Th@@ ings to do | Last Minute Offers ! +holiday l@@ et@@ t@@ ings available on the C@@ ost@@ a de@@ l S@@ ol of the ... +apartment to rent in F@@ u@@ en@@ gi@@ ro@@ la , 15 m@@ inut@@ es w@@ al@@ k from the ... +at only st@@ ep@@ s from the P@@ as@@ e@@ o M@@ ari@@ ti@@ m@@ o , this beach ... +19@@ 6 / 200@@ 3 re@@ g@@ ar@@ ding the proc@@ ess@@ ing of pers@@ onal data and con@@ s@@ en@@ su@@ s to proc@@ ess such data . data are being ac@@ qui@@ red here in order to regi@@ st@@ er the pers@@ on in@@ v@@ ol@@ ved and to ini@@ ti@@ ate a serv@@ ice through which said pers@@ on will receive information about off@@ ers , pr@@ ices and sim@@ il@@ ar ini@@ ti@@ ati@@ ves per@@ ta@@ in@@ ing to the R@@ om@@ an@@ ti@@ k Hot@@ el P@@ ost@@ a C@@ av@@ all@@ in@@ o B@@ i@@ anc@@ o . +even th@@ ou@@ gh pro@@ vid@@ ing such data , which will be proc@@ ess@@ ed by me@@ ans of el@@ ec@@ tr@@ on@@ ic proc@@ ed@@ ures , is not man@@ dat@@ ory , not ag@@ re@@ e@@ ing to f@@ ur@@ n@@ ish such information will make proc@@ ess@@ ing by the h@@ and@@ l@@ er im@@ pos@@ sible . 
+the pers@@ on in@@ v@@ ol@@ ved can re@@ qu@@ est to updat@@ e , cor@@ rec@@ t and sup@@ pl@@ ement in@@ compl@@ et@@ e or in@@ acc@@ ur@@ ate data , and to c@@ anc@@ el such data when the proc@@ ess@@ ing vi@@ ol@@ at@@ es le@@ gal st@@ and@@ ar@@ ds or re@@ g@@ ul@@ ations , and can ex@@ er@@ c@@ is@@ e the other righ@@ ts est@@ abl@@ ished by ar@@ t . +19@@ 6 / 200@@ 3 by cont@@ ac@@ ting the ow@@ ner of the proc@@ ess@@ ing , the R@@ om@@ an@@ ti@@ k Hot@@ el P@@ ost@@ a C@@ av@@ all@@ in@@ o B@@ i@@ anc@@ o . +which n@@ av@@ ig@@ ation systems are com@@ pa@@ ti@@ ble with T@@ el@@ e A@@ tl@@ as m@@ ap@@ s ? +do I receive con@@ fir@@ ma@@ tion of my order ? +what are T@@ el@@ e A@@ tl@@ as " gener@@ al ter@@ ms and con@@ di@@ tions ? +which n@@ av@@ ig@@ ation systems are com@@ pa@@ ti@@ ble with T@@ el@@ e A@@ tl@@ as m@@ ap@@ s ? +this p@@ ac@@ k@@ age conta@@ ins the do@@ c@@ um@@ ent@@ ation for ot@@ r@@ s@@ 2 in English l@@ ang@@ u@@ age . +this serv@@ ice is spon@@ so@@ red by 1 & 1 In@@ ter@@ n@@ et A@@ G . +holiday A@@ part@@ ments | Hot@@ els | Host@@ els | T@@ o@@ p ten things to do | Last Minute Offers ! +this statistic is based on the 68@@ 19 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 2@@ 9 O@@ c@@ t 200@@ 6 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 2@@ 8 free Open ICEcat users . +this statistic is based on the 68@@ 19 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 2@@ 9 O@@ c@@ t 200@@ 6 . +this statistic is based on the 68@@ 19 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 2@@ 2 De@@ c 200@@ 6 . 
+der en@@ gl@@ is@@ ch@@ sp@@ r@@ ac@@ h@@ ig@@ e Ch@@ an@@ ne@@ l li@@ st@@ et In@@ h@@ al@@ te a@@ us@@ ge@@ w@@ ä@@ h@@ l@@ ter B@@ lo@@ g@@ s au@@ f , di@@ e re@@ g@@ el@@ m@@ ä@@ ß@@ i@@ g B@@ e@@ it@@ r@@ ä@@ ge z@@ u WordPress publ@@ iz@@ i@@ er@@ en . +n@@ or@@ m@@ ally when I set up plug@@ in @-@ l@@ ev@@ el S@@ E@@ O on a WordPress bl@@ o@@ g , I '@@ ll need 5 @-@ 8 plug@@ ins to pro@@ vi@@ de all the des@@ i@@ red S@@ E@@ O func@@ tion@@ al@@ ity . w@@ ou@@ l@@ d@@ n 't it be co@@ ol if there was one plug@@ in that in@@ cor@@ por@@ ated all that func@@ tion@@ al@@ ity and more into one easy @-@ to @-@ use su@@ ite ? +there is fin@@ ally an off@@ ici@@ al ans@@ w@@ er to the qu@@ es@@ tion of wh@@ e@@ ther or not WordPress the@@ mes must " in@@ h@@ er@@ it " the G@@ P@@ L license that WordPress it@@ self us@@ es . +M@@ at@@ t as@@ k@@ ed the S@@ o@@ ftware F@@ re@@ e@@ dom L@@ a@@ w C@@ enter to ex@@ am@@ ine the WordPress s@@ our@@ ce and how the@@ mes f@@ it in . the fin@@ al , off@@ ici@@ al ans@@ w@@ er to wh@@ e@@ ther the@@ mes must be G@@ P@@ L ? +the WordPress P@@ l@@ ug@@ in Com@@ pe@@ ti@@ tion is the b@@ est m@@ om@@ ent of the year , plug@@ in w@@ is@@ e . +this year 's edi@@ tion is sti@@ ll r@@ un@@ ning ti@@ ll the end of the mon@@ th of J@@ u@@ ly and as of writ@@ ing , there are about 1@@ 0 plug@@ ins ( with some re@@ ally inter@@ est@@ ing st@@ u@@ ff ) . sti@@ ll , new plug@@ ins are re@@ le@@ ased every day . +wh@@ y are@@ n 't these au@@ th@@ ors jo@@ in@@ ing the com@@ pe@@ ti@@ tion ? par@@ ti@@ ci@@ pat@@ ing in the com@@ pe@@ ti@@ tion is In@@ struc@@ tive : you * will * lear@@ n som@@ e@@ thing . +if WordPress were a coun@@ tr@@ y , our B@@ il@@ l of R@@ igh@@ ts would be the G@@ P@@ L because it pro@@ t@@ ects our c@@ ore f@@ re@@ ed@@ om@@ s . 
+we '@@ ve al@@ w@@ ays d@@ one our b@@ est to ke@@ ep Word@@ P@@ res@@ s@@ .@@ or@@ g cl@@ e@@ an and only pro@@ m@@ ote things that are compl@@ et@@ ely com@@ pa@@ ti@@ ble and le@@ gal with WordPress " license . +there have been some qu@@ es@@ tions in the comm@@ un@@ ity about wh@@ e@@ ther the G@@ P@@ L ap@@ pl@@ ies to the@@ mes like we '@@ ve al@@ w@@ ays as@@ su@@ m@@ ed . +the Off@@ ici@@ al WordPress Com@@ mercial Th@@ em@@ e D@@ i@@ rec@@ t@@ ory is now open . it is lin@@ k@@ ed o@@ ff the WordPress E@@ x@@ t@@ end and is part of the pa@@ rent the@@ me di@@ rec@@ t@@ ory set of lin@@ ks . +it is not as much a di@@ rec@@ t@@ ory as it is a li@@ sting of sites that off@@ er commercial G@@ P@@ L the@@ mes . +if you are@@ n 't al@@ read@@ y fol@@ lo@@ wing al@@ ong , I h@@ igh@@ ly re@@ comm@@ end ch@@ ec@@ king out the H@@ ow T@@ o C@@ re@@ ate A WordPress Th@@ em@@ e t@@ ut@@ or@@ i@@ al ser@@ ies by I@@ an S@@ te@@ war@@ t ( Th@@ em@@ e@@ S@@ h@@ ap@@ er@@ .@@ co@@ m ) . +this 1@@ 2 @-@ part ser@@ ies ( 8 compl@@ et@@ e at the time of this writ@@ ing ) a@@ im@@ s to take you from no@@ thing to a ful@@ ly func@@ tional , s@@ em@@ an@@ tic@@ ally r@@ ich , fl@@ ex@@ i@@ ble WordPress the@@ me in di@@ g@@ es@@ ti@@ ble ch@@ un@@ ks . +al@@ ong the way , I@@ an des@@ c@@ ri@@ b@@ es the ch@@ ang@@ es being made , and wh@@ y you are ma@@ king them . +a c@@ ri@@ tic@@ al v@@ ul@@ n@@ er@@ ab@@ il@@ ity has been dis@@ co@@ ver@@ ed in the WordPress P@@ l@@ ug@@ in R@@ el@@ ated S@@ it@@ es plug@@ in . +an ex@@ plo@@ it is available in the wil@@ d and available on M@@ il@@ w@@ 0@@ r@@ m , ma@@ king this at@@ t@@ ac@@ k eas@@ i@@ er to ex@@ plo@@ it . +al@@ th@@ ou@@ gh , the v@@ ul@@ n@@ er@@ ab@@ il@@ ity s@@ ays that version 2.@@ 1 is v@@ ul@@ n@@ er@@ able . +you should as@@ su@@ me pre@@ vi@@ ous ver@@ si@@ ons are v@@ ul@@ n@@ er@@ able as well . 
+D@@ M Al@@ bu@@ ms ™ is an in@@ line photo al@@ bu@@ m / g@@ all@@ ery plug@@ in that dis@@ pl@@ ays h@@ ig@@ h qu@@ al@@ ity ima@@ ges and th@@ umb@@ n@@ ail@@ s per@@ f@@ ec@@ tly si@@ z@@ ed to your bl@@ o@@ g . two v@@ ul@@ n@@ er@@ ab@@ il@@ i@@ ties have been made publ@@ ic : 1 . +w@@ p R@@ o@@ ad@@ ma@@ p a@@ im@@ s to cre@@ ate a detail@@ ed vi@@ ew of how WordPress works , by or@@ g@@ an@@ iz@@ ing the l@@ ist of c@@ all@@ s by the order in which they are ex@@ ec@@ u@@ ted T@@ he co@@ de cur@@ ren@@ tly tr@@ ac@@ ks ac@@ tions , fil@@ t@@ ers , includ@@ es , and re@@ qui@@ res . +w@@ p R@@ o@@ ad@@ ma@@ p can tr@@ ac@@ k the order of these c@@ all@@ s from different page vi@@ e@@ ws and different ver@@ si@@ ons of WordPress . +WordPress C@@ on@@ f@@ ig@@ ur@@ ation T@@ r@@ ic@@ ks : if you have ever inst@@ alled WordPress and w@@ an@@ ted to know what el@@ se you could do with your w@@ p @-@ conf@@ ig@@ .@@ ph@@ p file , this is the bl@@ o@@ g pos@@ t to re@@ ad . +as O@@ z@@ h p@@ o@@ in@@ ts out in the com@@ ments , a c@@ ou@@ ple of tr@@ ic@@ ks were le@@ ft out but no@@ thing that could not be re@@ medi@@ ed with a sim@@ ple G@@ o@@ o@@ gl@@ e s@@ earch ( e@@ .@@ g@@ . +w@@ p _ H@@ T@@ T@@ P ) as l@@ ong as you know what you are lo@@ o@@ king for . +ti@@ p@@ s from that page that go@@ t me th@@ in@@ king ( things that make you go h@@ m@@ m@@ m@@ m@@ m ? +bet@@ a 2 is here . get it while it 's ... sti@@ ll not qu@@ ite out of B@@ et@@ a ... or som@@ e@@ thing . +D@@ ev@@ 4@@ Press has d@@ one some inter@@ est@@ ing work on b@@ en@@ ch@@ m@@ ar@@ king vari@@ ous as@@ p@@ ects of WordPress and then t@@ est@@ ing out some p@@ op@@ ul@@ ar qu@@ es@@ tions on page lo@@ ad@@ s etc . +the me@@ th@@ o@@ d@@ ol@@ o@@ gi@@ es are well do@@ c@@ um@@ en@@ ted and the b@@ en@@ ch@@ m@@ ar@@ king set@@ up is st@@ and@@ ar@@ di@@ z@@ ed . 
+they are t@@ est@@ ing th@@ ree version of WordPress includ@@ ing 2.@@ 6@@ .@@ 5 , 2.@@ 7@@ .@@ 1 and 2.@@ 8 ( I w@@ ish they would have wa@@ ited for 2.@@ 8@@ .@@ 1 to be re@@ le@@ ased ) . +are you one of those wor@@ ri@@ ed about the prog@@ ress of your WordPress bl@@ o@@ g ? do you w@@ an@@ n@@ a know how your vis@@ it@@ ors f@@ ound you and how they inter@@ act with your site ? +well , it re@@ qui@@ res you to d@@ r@@ il@@ l down the site an@@ al@@ y@@ tic@@ s to get to the b@@ ot@@ to@@ m . +G@@ o@@ o@@ gl@@ e An@@ al@@ y@@ tic@@ s is a tr@@ ust@@ ed th@@ ir@@ d part@@ y serv@@ ice that pro@@ vid@@ es an acc@@ ur@@ ate vi@@ ew and detail@@ ed an@@ al@@ y@@ s@@ is of per@@ for@@ man@@ ce statisti@@ c@@ s of your we@@ b@@ site without as@@ king for your own res@@ our@@ ce . it may s@@ ound str@@ an@@ ge that some other we@@ b@@ site is tr@@ ac@@ king your per@@ for@@ man@@ ce without you having to pro@@ vi@@ de an@@ y@@ thing . +what do you me@@ an by a v@@ ir@@ t@@ ual world ? a place wh@@ ere you can come and work and ch@@ at and pl@@ ay and le@@ a@@ ve . +what if I t@@ ell you about a wh@@ ol@@ e new world wh@@ ere you can li@@ ve , bre@@ a@@ the , pl@@ ay , s@@ ing , d@@ ance , do every d@@ am@@ n thing you can do in this ( s@@ or@@ r@@ y ? +li@@ ve S@@ tre@@ am@@ ing vi@@ de@@ o from Word@@ C@@ am@@ p D@@ all@@ as th@@ an@@ ks to Cal@@ i Le@@ w@@ is of G@@ e@@ e@@ k@@ B@@ ri@@ e@@ f@@ .@@ t@@ v@@ . Th@@ an@@ ks to W@@ P@@ T@@ a@@ ver@@ n for the lin@@ k . +I was at Word@@ C@@ am@@ p D@@ all@@ as last year but could not make it to this one . I am j@@ eal@@ ous ! +Word@@ C@@ am@@ p D@@ all@@ as 200@@ 9 is go@@ ing on this we@@ e@@ k@@ end in the D@@ F@@ W are@@ a , and T@@ he WordPress P@@ o@@ d@@ c@@ ast is h@@ app@@ y to be one the spons@@ ors . +Cal@@ i Le@@ w@@ is is li@@ ve st@@ re@@ am@@ ing the enti@@ re ev@@ ent , so if you want to cat@@ ch some good WordPress in@@ f@@ o , that can 't be be@@ at ! 
+2.@@ 8@@ .@@ 1 B@@ et@@ a 2 is read@@ y for t@@ est@@ ing . downlo@@ ad it , ch@@ ec@@ k out the ch@@ ang@@ es since bet@@ a 1 , and re@@ vi@@ ew all tic@@ k@@ ets fi@@ x@@ ed in 2.@@ 8@@ .@@ 1 . +we es@@ p@@ eci@@ ally su@@ g@@ g@@ est , re@@ comm@@ end , and be@@ g that plug@@ in dev@@ el@@ op@@ ers t@@ est their plug@@ ins ag@@ ain@@ st bet@@ a 2 and let us know of any is@@ su@@ es . +one of the sites I '@@ m r@@ un@@ ning is a l@@ ong est@@ abl@@ ished comm@@ un@@ ity we@@ b@@ site with z@@ illi@@ ons of p@@ ost@@ s and b@@ a@@ z@@ illi@@ ons of com@@ ments . +the wh@@ ol@@ e st@@ u@@ ff is pow@@ er@@ ed by a h@@ om@@ e made C@@ M@@ S writ@@ ten in P@@ er@@ l , som@@ e@@ thing I di@@ d ne@@ ar@@ ly 1@@ 0 years ag@@ o and that is t@@ ot@@ ally ou@@ t@@ dat@@ ed to@@ day . +it 's not un@@ comm@@ on that a bl@@ o@@ g r@@ un@@ s by different au@@ th@@ ors , so it may be us@@ e@@ ful if you can have a qu@@ ick look at the dra@@ f@@ ts of all au@@ th@@ ors . +in our jo@@ in@@ t bl@@ o@@ g W@@ P En@@ gin@@ e@@ er we cre@@ ated a fe@@ ed , which ke@@ ep@@ s us up to dat@@ e if a new dra@@ ft of all au@@ th@@ ors were cre@@ ated . the work of every au@@ th@@ or is different and the d@@ as@@ h@@ bo@@ ard is the c@@ enter of information . +th@@ erefore we have dec@@ id@@ ed to sup@@ pl@@ ement a w@@ id@@ get in the D@@ as@@ h@@ bo@@ ard , which sh@@ ow@@ s the last fi@@ ve dra@@ f@@ ts of all au@@ th@@ ors . I en@@ h@@ anc@@ ed the ex@@ i@@ sting P@@ l@@ ug@@ in D@@ ra@@ ft F@@ e@@ ed . +so , if I di@@ d@@ n 't for@@ get to uplo@@ ad a z@@ i@@ p ar@@ ch@@ i@@ ve or to p@@ ress a " Com@@ m@@ it " bu@@ t@@ t@@ on , it should be all here for you : the G@@ P@@ L '@@ d U@@ R@@ L sh@@ or@@ ten@@ er I w@@ ro@@ te about ear@@ li@@ er that L@@ est@@ er Ch@@ an and I have been using for a while now . +our U@@ R@@ L sh@@ or@@ ten@@ er serv@@ ice is called Y@@ O@@ U@@ R@@ L@@ S , as in Y@@ our O@@ w@@ n U@@ R@@ L S@@ h@@ or@@ ten@@ er . 
it 's all G@@ P@@ L , free , do @-@ what @-@ you @-@ want @-@ with @-@ it . +based upon the e@@ ma@@ il@@ s we get e@@ ach mon@@ th from read@@ ers who are tr@@ y@@ ing to find our T@@ w@@ it@@ ter fe@@ ed , I fe@@ el this pos@@ t is pro@@ b@@ abl@@ y l@@ ong o@@ ver@@ d@@ ue , but I w@@ an@@ ted to p@@ o@@ in@@ t out to our read@@ er@@ sh@@ i@@ p that we do in f@@ act have a T@@ w@@ it@@ ter page ! +if you '@@ d like to fol@@ lo@@ w W@@ PH@@ ac@@ k@@ s@@ .@@ co@@ m on T@@ w@@ it@@ ter , you can get our updates here ( @ H@@ ac@@ k@@ WordPress ) . +here 's a s@@ ni@@ p@@ p@@ et of his re@@ as@@ on@@ ing ... I go@@ t a ti@@ p that Ch@@ r@@ is An@@ der@@ s@@ on 's up@@ com@@ ing bo@@ ok F@@ ree has the fol@@ lo@@ wing to say about WordPress : 2 . feat@@ ure li@@ m@@ ited ( B@@ as@@ ic version free , more s@@ op@@ h@@ is@@ tic@@ ated version pa@@ id . +to@@ day we have a l@@ it@@ tl@@ e lin@@ k ti@@ p for our read@@ ers : at w@@ p t@@ op@@ ic@@ s you can find all us@@ e@@ ful ne@@ ws , h@@ ac@@ ks , t@@ ut@@ or@@ i@@ als about WordPress in one place right a@@ way ! +in e@@ ach cat@@ e@@ g@@ ory are the most rel@@ ev@@ ant and inter@@ est@@ ing we@@ b@@ sites about WordPress li@@ st@@ ed . +you can also v@@ ote for e@@ ach we@@ b@@ site , how much you like their content . +WordPress 2.@@ 8 just c@@ ro@@ s@@ sed the 1 m@@ illi@@ on downlo@@ ad m@@ ark to@@ day , you can see the li@@ ve coun@@ ter for WordPress downlo@@ ad@@ s on the downlo@@ ad coun@@ ter page . +this is de@@ fin@@ it@@ ely great ne@@ ws since this m@@ il@@ est@@ one was re@@ ac@@ h@@ ed in 1@@ 2 days , have you up@@ gr@@ ad@@ ed to WordPress 2.@@ 8 y@@ et ? th@@ an@@ ks @ ph@@ ot@@ om@@ at@@ t vi@@ a t@@ w@@ it@@ ter . 
+by the time you '@@ re re@@ ading this , it is qu@@ ite pos@@ sible that the B@@ ud@@ d@@ y@@ Press rev@@ ol@@ u@@ tion will have h@@ app@@ en@@ ed , but if you '@@ re re@@ ading this so@@ on after this was p@@ ost@@ ed , I can sa@@ f@@ ely say that the B@@ ud@@ d@@ y@@ Press rev@@ ol@@ u@@ tion h@@ as@@ n 't h@@ app@@ en@@ ed y@@ et . it 's un@@ der way , but is h@@ as@@ n 't h@@ app@@ en@@ ed . +one we@@ e@@ k after A@@ C / D@@ C 's B@@ l@@ ac@@ k I@@ ce T@@ our in P@@ ar@@ is , I '@@ ve had y@@ et an@@ other m@@ us@@ ical ren@@ de@@ z @-@ v@@ ous : H@@ ell@@ f@@ est Open A@@ ir 200@@ 9 ( m@@ om@@ ent@@ ar@@ il@@ y cl@@ os@@ ed as of writ@@ ing ) , a m@@ us@@ ic f@@ es@@ tiv@@ al hel@@ d about 3@@ 0 k@@ ms from wh@@ ere I cur@@ ren@@ tly li@@ ve . +I '@@ ve se@@ en li@@ ve on st@@ age : ( ... ) R@@ e@@ ad the rest of B@@ ands I '@@ ve S@@ e@@ en T@@ his W@@ e@@ e@@ k@@ end : H@@ ell@@ f@@ est 200@@ 9 , M@@ y R@@ ec@@ a@@ p ( 2@@ 1@@ 6 words ) ( c@@ c ) O@@ z@@ h for pl@@ an@@ et@@ O@@ z@@ h , 200@@ 9 . +since so many people are having sim@@ ple pro@@ bl@@ ems with their WordPress 2.@@ 8 inst@@ all@@ ations , the WordPress comm@@ un@@ ity is wor@@ king ti@@ rel@@ ess@@ ly to get a p@@ o@@ in@@ t @-@ one updat@@ e out as so@@ on as pos@@ sible , and the first st@@ ep on that ro@@ ad is to t@@ est a bet@@ a of the re@@ le@@ ase . +WordPress 2.@@ 8@@ .@@ 1 B@@ et@@ a 1 has been re@@ le@@ ased , and they need people to t@@ est it out and see if it fi@@ x@@ es the pro@@ bl@@ ems , so if you are having is@@ su@@ es with WordPress 2.@@ 8 , tr@@ y the bet@@ a , and see if that res@@ ol@@ ves it . +some plug@@ ins are c@@ a@@ using g@@ ri@@ e@@ f for those up@@ gr@@ ading to 2.@@ 8 . H@@ y@@ per@@ D@@ B needs to be updat@@ ed to the l@@ at@@ est version , o@@ th@@ er@@ w@@ is@@ e ta@@ g@@ s w@@ on 't sa@@ ve . +plug@@ ins that lo@@ ad old ver@@ si@@ ons of j@@ Qu@@ ery for all ad@@ m@@ in pa@@ ges will bre@@ a@@ k all k@@ in@@ ds of st@@ u@@ ff . 
plug@@ ins should use the version of j@@ Qu@@ ery that sh@@ i@@ p@@ s with W@@ P . +K@@ en@@ o X@@ per@@ i@@ ment is an ex@@ t@@ end@@ ed version of the K@@ en@@ o L@@ ottery g@@ ame . +the play@@ er is pres@@ en@@ ted with a g@@ ame bo@@ ard con@@ si@@ sting of the numb@@ ers 1 @-@ 8@@ 0 from which he has to choose his l@@ uc@@ k@@ y 2 @-@ 1@@ 0 numb@@ ers . +fol@@ lo@@ wing that , you can ei@@ ther cl@@ ick � P@@ l@@ ay one � to st@@ ar@@ t the rou@@ nd or � P@@ l@@ ay F@@ i@@ ve � to pl@@ ay fi@@ ve con@@ sec@@ u@@ tive r@@ oun@@ ds with the same numb@@ ers you ch@@ ose at the very begin@@ ning . +you can eas@@ il@@ y des@@ el@@ ect a numb@@ er by cl@@ ic@@ king on it . +this K@@ en@@ o version allo@@ ws you to set some numb@@ ers to be p@@ ic@@ k@@ ed at ran@@ dom for the ne@@ x@@ t rou@@ nd or even to allo@@ w ran@@ dom sel@@ ec@@ tion of numb@@ ers using the � P@@ l@@ ay fi@@ ve � op@@ tion . +h@@ ow@@ ever , if you simpl@@ y cl@@ ick the ran@@ dom p@@ ick bu@@ t@@ t@@ on without set@@ ting your numb@@ er am@@ ount , this to@@ o , will be ch@@ os@@ en a@@ ut@@ om@@ ati@@ c@@ ally . +C@@ L@@ IC@@ K@@ 2@@ P@@ A@@ Y is an inst@@ ant , inter@@ na@@ tional & sec@@ ure we@@ b w@@ all@@ et that allo@@ ws you to trans@@ f@@ er fun@@ ds inst@@ an@@ tly from a vari@@ ety of different op@@ tions , in Euro@@ s , U@@ K P@@ oun@@ ds and U@@ S D@@ ol@@ l@@ ar@@ s . +you can fun@@ d your C@@ lic@@ k@@ 2@@ P@@ ay acc@@ ount with : vis@@ a , M@@ a@@ st@@ er@@ c@@ ard , O@@ n@@ line B@@ an@@ king , B@@ an@@ k De@@ pos@@ it & by di@@ rec@@ t de@@ b@@ it . +it en@@ abl@@ es you to acc@@ ess your mon@@ e@@ y rou@@ nd the g@@ lo@@ be , an@@ y@@ time , wh@@ en@@ ever you need it . +all your C@@ L@@ IC@@ K@@ 2@@ P@@ A@@ Y trans@@ ac@@ tions , hi@@ st@@ ory and acc@@ ount b@@ al@@ ance can be vi@@ e@@ w@@ ed and man@@ ag@@ ed in your on@@ line acc@@ ount @-@ are@@ a . 
+it 's sa@@ fe & re@@ li@@ able , using st@@ and@@ ard enc@@ r@@ yp@@ tion t@@ ec@@ h@@ n@@ ol@@ ogy me@@ ans you can rest easy that your information will re@@ ma@@ in sec@@ ure . +with its &apos@@ ; 2@@ 4 / 7 support , C@@ L@@ IC@@ K@@ 2@@ P@@ A@@ Y is a great cho@@ ice and our re@@ comm@@ end@@ ed p@@ ay@@ ment me@@ th@@ od of the mon@@ th ! +de@@ pos@@ its through C@@ L@@ IC@@ K@@ 2@@ P@@ A@@ Y will be a@@ war@@ d@@ ed with a 1@@ 0 % ex@@ tr@@ a b@@ on@@ us by the c@@ as@@ in@@ o ! +from the c@@ as@@ in@@ o lo@@ b@@ by you can simpl@@ y cl@@ ick on the c@@ as@@ in@@ o c@@ as@@ hi@@ er ic@@ on to make your de@@ pos@@ it . +there you will see a vari@@ ety of p@@ ay@@ ment me@@ th@@ ods li@@ st@@ ed at the to@@ p of the page . +sel@@ ect the p@@ ay@@ ment me@@ th@@ od that su@@ its you b@@ est and fol@@ lo@@ w the easy st@@ ep@@ s on ma@@ king a de@@ pos@@ it . +if you have any qu@@ er@@ ies th@@ ou@@ g@@ h@@ out the proc@@ ess , just cl@@ ick on the C@@ H@@ A@@ T ic@@ on to c@@ ust@@ om@@ er serv@@ ice re@@ pres@@ en@@ ta@@ tive . +get into the f@@ es@@ tive sp@@ irit as we take a look at the b@@ ig@@ g@@ est l@@ ottery g@@ ame of all times . +and if that g@@ ets you in the mo@@ od for b@@ i@@ g mon@@ e@@ y then st@@ ay t@@ un@@ ed for the a@@ ma@@ z@@ ing holiday tre@@ at@@ s that Euro@@ p@@ a C@@ as@@ in@@ o has in st@@ ore for you ! +from all of us here at Euro@@ p@@ a C@@ as@@ in@@ o , we w@@ ish you a very M@@ er@@ r@@ y Ch@@ ri@@ st@@ m@@ as and a H@@ app@@ y N@@ ew Y@@ e@@ ar ! +my S@@ pe@@ ed@@ y Al@@ er@@ t is an exc@@ ep@@ tional tool des@@ ig@@ ned for the pur@@ p@@ ose of updat@@ ing you with any of our rec@@ ent speci@@ als & pro@@ mo@@ tions . +to your com@@ puter in re@@ al time ! +don � t mis@@ s out , re@@ ad more about it here . +l@@ ottery re@@ ma@@ ins for c@@ ent@@ u@@ ri@@ es one of the most comm@@ on form@@ s of g@@ am@@ bl@@ ing end@@ or@@ sed by go@@ ver@@ n@@ ments wor@@ l@@ d@@ wi@@ de . 
+the very first si@@ g@@ n@@ s of this p@@ op@@ ul@@ ar tre@@ nd take us way bac@@ k to the H@@ an D@@ y@@ na@@ st@@ y in the F@@ ar E@@ ast wh@@ ere a L@@ ottery was used to a@@ id in the fin@@ ance of ma@@ j@@ or go@@ ver@@ n@@ m@@ ent@@ al pro@@ j@@ ects , t@@ ar@@ g@@ et@@ ed ma@@ in@@ ly to gl@@ or@@ i@@ f@@ y the go@@ ver@@ n@@ ment and its people . +the g@@ ame of L@@ ottery is very str@@ ong@@ ly pr@@ in@@ ted in to@@ day � s c@@ ul@@ t@@ ure and even c@@ el@@ e@@ br@@ ated with s@@ pl@@ end@@ or as we ap@@ pro@@ ach the holiday se@@ as@@ on . +the b@@ ig@@ g@@ est c@@ el@@ e@@ br@@ ation by f@@ ar ta@@ k@@ es us to our ne@@ igh@@ b@@ oring Spain , wh@@ ere loc@@ als par@@ ti@@ ci@@ p@@ ate and en@@ jo@@ y Spain � s na@@ tional Ch@@ ri@@ st@@ m@@ as l@@ ottery g@@ ame know@@ n as � E@@ l G@@ ord@@ o � ( T@@ he F@@ at O@@ n@@ e ) , since 1@@ 8@@ 1@@ 2 . +as well as being one of the ol@@ d@@ est l@@ otter@@ ies around , the t@@ ot@@ al pri@@ z@@ e p@@ o@@ ol in rec@@ ent years , have s@@ ur@@ pas@@ sed the 2 b@@ illi@@ on E@@ ur@@ o m@@ ark , ma@@ king it also the b@@ ig@@ g@@ est l@@ ottery wor@@ l@@ d@@ wi@@ de ! ! +with pri@@ z@@ es being so h@@ ig@@ h , more than 1@@ 00 new m@@ illi@@ on@@ a@@ i@@ res are produc@@ ed every Ch@@ ri@@ st@@ m@@ as . +� � E@@ l G@@ ord@@ o � works differ@@ ent@@ ly to most l@@ ottery g@@ ames play@@ ed , as coun@@ tl@@ ess people can sh@@ are one s@@ ing@@ l@@ e tic@@ ket . +a wh@@ ol@@ e l@@ ottery tic@@ ket ( � b@@ il@@ l@@ et@@ e � ) co@@ st@@ s a few th@@ ous@@ and Euro@@ s but these tic@@ k@@ ets are s@@ pl@@ it to sm@@ all@@ er sec@@ tions , e@@ ach co@@ sting around 20 Euro@@ s . +this me@@ ans that many , even str@@ ang@@ ers , may sh@@ are a wh@@ ol@@ e tic@@ ket . +the i@@ de@@ a is qu@@ ite be@@ f@@ it@@ ting es@@ p@@ eci@@ ally now , in holiday se@@ as@@ on wh@@ ere many people in one comm@@ un@@ ity sh@@ are a gr@@ and pri@@ z@@ e wor@@ th m@@ illi@@ ons . 
+such an ev@@ ent can even a@@ ff@@ ect the loc@@ al ec@@ on@@ om@@ y . +the E@@ l G@@ ord@@ o ta@@ k@@ es place every 2@@ 2@@ nd of D@@ ec@@ ember and ag@@ ain on the 5@@ th of J@@ an@@ u@@ ary . +the wh@@ ol@@ e dra@@ wing proc@@ ess ta@@ k@@ es at le@@ ast th@@ ree h@@ our@@ s to compl@@ et@@ e , with the enti@@ re comm@@ un@@ ity t@@ un@@ ed to the ra@@ di@@ o in s@@ us@@ p@@ ens@@ e . +par@@ ti@@ ci@@ p@@ ation is also available through the inter@@ n@@ et , so now you don � t nec@@ ess@@ ar@@ il@@ y have to be in Spain in order to enter this gr@@ and l@@ ottery ev@@ ent . +T@@ it@@ an P@@ ok@@ er play@@ ers will be j@@ et@@ ting ac@@ ro@@ s@@ s the g@@ lo@@ be in 200@@ 8 as the p@@ op@@ ul@@ ar p@@ ok@@ er ro@@ om pre@@ pa@@ res to s@@ end its to@@ p re@@ pres@@ en@@ ta@@ ti@@ ves to a ser@@ ies of exc@@ it@@ ing land @-@ based t@@ our@@ n@@ am@@ ents . +at the begin@@ ning of 200@@ 8 , T@@ it@@ an P@@ ok@@ er play@@ ers will be com@@ b@@ in@@ ing su@@ n , f@@ u@@ n and p@@ ok@@ er at the A@@ us@@ si@@ e M@@ illi@@ ons , the S@@ ou@@ th A@@ f@@ r@@ ic@@ an P@@ ok@@ er Open , the I@@ r@@ ish Open , and the W@@ P@@ S O@@ c@@ e@@ an W@@ or@@ ld in the D@@ om@@ in@@ ic@@ an R@@ e@@ publ@@ ic ! +so , don 't di@@ ve un@@ der your d@@ u@@ v@@ et this D@@ ec@@ ember ! ! +es@@ cap@@ e the w@@ in@@ ter bl@@ ues by jo@@ in@@ ing T@@ it@@ an P@@ ok@@ er &apos@@ ; s play@@ ers at some of the world 's most exc@@ it@@ ing P@@ ok@@ er E@@ v@@ ents ! +cl@@ ick here to vi@@ ew our most rec@@ ent b@@ ig@@ g@@ est w@@ in@@ ners here at Euro@@ p@@ a C@@ as@@ in@@ o ! +di@@ d you he@@ ar about the $ 3@@ ,@@ 00@@ 0@@ ,@@ 00@@ 0 T@@ en@@ n@@ es@@ see S@@ t@@ ate L@@ ottery ? +the w@@ in@@ ner g@@ ets $ 3 a year for a m@@ illi@@ on years . 
+the F@@ ir@@ st l@@ ottery g@@ ames were in@@ t@@ ro@@ duc@@ ed to the world some th@@ ous@@ ands of years ag@@ o ( around 2@@ 00 B@@ C ) , by Ch@@ in@@ ese gener@@ als using the g@@ ame to fin@@ ance their war@@ s as well as to a@@ id in the bu@@ il@@ ding of one par@@ tic@@ ul@@ ar@@ ly ma@@ g@@ ni@@ f@@ ic@@ ent ar@@ ch@@ it@@ ec@@ tur@@ al as@@ p@@ ir@@ ation . +these K@@ en@@ o @-@ L@@ ottery typ@@ e g@@ ames had a b@@ i@@ g part in fun@@ ding one of the � N@@ ew 7 W@@ on@@ d@@ ers � , that being the G@@ re@@ at W@@ all of Ch@@ in@@ a ! +C@@ e@@ B@@ I@@ T 200@@ 9 was a great ev@@ ent for both R@@ ap@@ id @-@ I and our vis@@ it@@ ors . +the g@@ er@@ man j@@ our@@ nal " i@@ X " for I@@ T pro@@ f@@ essi@@ on@@ als has publ@@ ished a re@@ vi@@ ew of R@@ ap@@ id@@ M@@ in@@ er which can be downlo@@ ad@@ ed b@@ el@@ ow . +hot@@ els | V@@ il@@ l@@ as and Ch@@ al@@ ets | A@@ part@@ ments | Host@@ els | C@@ amp@@ ing | Th@@ ings to do | Last Minute Offers ! +this P@@ u@@ er@@ to de S@@ an@@ t@@ a M@@ ari@@ a be@@ ac@@ h@@ f@@ ron@@ t fl@@ at in Cadiz , Spain ... +self cat@@ er@@ ing apartment in C@@ on@@ il de la F@@ ron@@ ter@@ a for 6 ... +apartments E@@ l P@@ u@@ er@@ to de S@@ an@@ t@@ a M@@ ari@@ a , all new and c@@ entr@@ ally ... +Ch@@ ri@@ st@@ op@@ h L@@ ind@@ em@@ an@@ n hol@@ ds the C@@ ha@@ ir of Com@@ puter N@@ et@@ works and D@@ i@@ st@@ ribu@@ ted S@@ yst@@ ems in the De@@ part@@ ment of Com@@ puter S@@ ci@@ ence at the U@@ ni@@ ver@@ s@@ ity of L@@ ei@@ p@@ z@@ i@@ g . +from M@@ ar@@ ch 19@@ 9@@ 8 ti@@ ll O@@ c@@ to@@ b@@ er 200@@ 5 he was an as@@ so@@ ci@@ ate pro@@ f@@ ess@@ or in the Com@@ puter S@@ ci@@ ence De@@ part@@ ment at the U@@ ni@@ ver@@ s@@ ity of D@@ or@@ t@@ m@@ un@@ d and was le@@ ading the M@@ ob@@ ile Com@@ pu@@ ting S@@ yst@@ ems grou@@ p . +he recei@@ ved the de@@ g@@ ree D@@ i@@ pl@@ om @-@ In@@ forma@@ ti@@ k@@ er ( M@@ .@@ S@@ . 
in Com@@ puter S@@ ci@@ ence ) from the U@@ ni@@ ver@@ s@@ ity of K@@ ar@@ l@@ s@@ ru@@ he , Ger@@ many in 19@@ 8@@ 8 and the de@@ g@@ ree D@@ ok@@ t@@ or @-@ In@@ gen@@ i@@ e@@ ur ( P@@ h@@ .@@ D@@ . in En@@ gin@@ e@@ er@@ ing ) from the T@@ ec@@ h@@ n@@ is@@ ch@@ e U@@ ni@@ ver@@ sit@@ ä@@ t B@@ er@@ l@@ in , Ger@@ many in 19@@ 9@@ 2 . +from 19@@ 9@@ 4 to 19@@ 9@@ 7 he hel@@ d pos@@ i@@ tions as res@@ earch s@@ ci@@ enti@@ st and pro@@ j@@ ect man@@ ag@@ er at the G@@ M@@ D In@@ sti@@ t@@ ut@@ e for Com@@ puter S@@ yst@@ ems and S@@ o@@ ftware T@@ ec@@ h@@ n@@ ol@@ ogy ( G@@ M@@ D F@@ I@@ R@@ S@@ T ) , know@@ n as Fra@@ un@@ ho@@ f@@ er In@@ sti@@ t@@ ut F@@ I@@ R@@ S@@ T to@@ day , in B@@ er@@ l@@ in , Ger@@ many . +in su@@ m@@ m@@ er 19@@ 9@@ 3 and d@@ uring the ac@@ ad@@ em@@ ic year 19@@ 9@@ 4 / 19@@ 9@@ 5 , he was a V@@ is@@ it@@ ing S@@ ci@@ enti@@ st at the I@@ B@@ M Al@@ ma@@ den R@@ es@@ earch C@@ enter , S@@ an J@@ ose C@@ A . +in the f@@ all s@@ em@@ est@@ er 200@@ 3 / 0@@ 4 he sp@@ end his s@@ ab@@ b@@ ati@@ c@@ al at the Com@@ puter S@@ ci@@ ence De@@ part@@ ment of the U@@ ni@@ ver@@ s@@ ity of W@@ is@@ con@@ s@@ in as a vis@@ it@@ ing pro@@ f@@ ess@@ or . +his cur@@ rent res@@ earch inter@@ ests li@@ e in m@@ ob@@ ile com@@ pu@@ ting systems , es@@ p@@ eci@@ ally m@@ ob@@ ile ad ho@@ c n@@ et@@ works and pe@@ er @-@ to @-@ pe@@ er systems as well as mo@@ d@@ ell@@ ing and per@@ for@@ man@@ ce ev@@ al@@ u@@ ation as an umb@@ rel@@ la t@@ op@@ ic . +Ch@@ ri@@ st@@ op@@ h L@@ ind@@ em@@ an@@ n is m@@ ember of the I@@ F@@ I@@ P wor@@ king grou@@ p 7@@ .@@ 3 and a s@@ en@@ i@@ or m@@ ember of the I@@ E@@ E@@ E . +he is on the ed@@ it@@ or@@ i@@ al bo@@ ard of the inter@@ na@@ tional j@@ our@@ nal P@@ er@@ for@@ man@@ ce E@@ val@@ u@@ ation since 200@@ 5 . he is also a m@@ ember of the E@@ x@@ ec@@ u@@ tive Bo@@ ard of A@@ C@@ M S@@ I@@ G@@ M@@ E@@ T@@ R@@ IC@@ S . 
+he has been serv@@ ing as ch@@ a@@ ir of the speci@@ al inter@@ est grou@@ p on me@@ as@@ u@@ re@@ ments , mo@@ d@@ ell@@ ing , and ev@@ al@@ u@@ ation of com@@ puter systems and comm@@ un@@ ication n@@ et@@ works with@@ in the Ger@@ man S@@ o@@ ci@@ ety of In@@ forma@@ tic@@ s ( G@@ I ) from 200@@ 5 to 200@@ 8 . +in 200@@ 5 , he serv@@ ed as G@@ en@@ er@@ al C@@ o @-@ C@@ ha@@ ir for the 1@@ 1@@ th In@@ ter@@ na@@ tional C@@ on@@ f@@ er@@ ence on M@@ ob@@ ile Com@@ pu@@ ting and N@@ et@@ wor@@ king , A@@ C@@ M M@@ ob@@ i@@ C@@ om . +Ch@@ ri@@ st@@ op@@ h L@@ ind@@ em@@ an@@ n has or@@ g@@ an@@ iz@@ ed the A@@ C@@ M M@@ ob@@ i@@ S@@ h@@ are W@@ or@@ k@@ sh@@ o@@ p in 200@@ 6 and is serv@@ ing as gener@@ al ch@@ a@@ ir of the 2@@ 6@@ th In@@ ter@@ na@@ tional S@@ y@@ m@@ pos@@ ium on Com@@ puter P@@ er@@ for@@ man@@ ce , M@@ o@@ d@@ el@@ ing , M@@ eas@@ u@@ re@@ ments , and E@@ val@@ u@@ ation , P@@ er@@ for@@ man@@ ce 200@@ 7 . +Ch@@ ri@@ st@@ op@@ h L@@ ind@@ em@@ an@@ n has also been app@@ o@@ in@@ ted to the P@@ ro@@ g@@ ra@@ m Com@@ m@@ it@@ te@@ es of n@@ u@@ mer@@ ous to@@ p @-@ l@@ ev@@ el inter@@ na@@ tional conf@@ er@@ enc@@ es , e@@ .@@ g@@ . A@@ C@@ M S@@ I@@ G@@ M@@ E@@ T@@ R@@ IC@@ S 200@@ 7 and A@@ C@@ M M@@ ob@@ i@@ H@@ o@@ c 200@@ 8 . +this statistic is based on the 68@@ 19 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 1@@ 7 M@@ ay 200@@ 8 . +this statistic is based on the 68@@ 19 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 0@@ 2 N@@ o@@ v 200@@ 6 . +this statistic is based on the 68@@ 19 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 0@@ 7 M@@ ay 200@@ 8 . 
+ple@@ ase no@@ ti@@ ce the differ@@ ence bet@@ we@@ en D@@ V@@ D + R and D@@ V@@ D @-@ R when bu@@ y@@ ing a D@@ V@@ D b@@ ur@@ ner , o@@ ft@@ en d@@ ri@@ ves can wr@@ ite only to one of those two medi@@ a typ@@ es , som@@ e@@ times to both . when you bu@@ y a d@@ ri@@ ve that only sup@@ por@@ ts one typ@@ e , you will al@@ w@@ ays have to be c@@ are@@ ful to choose the right writ@@ able D@@ V@@ D dis@@ c@@ s at the sh@@ o@@ p . +this statistic is based on the 68@@ 15 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 2@@ 2 O@@ c@@ t 200@@ 7 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 63@@ 2@@ 4 free Open ICEcat users . +the Lord war@@ n@@ s N@@ e@@ ph@@ i to de@@ part into the wil@@ der@@ ness . +his j@@ our@@ ne@@ y@@ ings in the wil@@ der@@ ness , and so for@@ th . +Le@@ h@@ i pro@@ ph@@ es@@ ies of a land of li@@ b@@ er@@ ty - H@@ is se@@ ed shall be s@@ cat@@ ter@@ ed and sm@@ it@@ ten if they re@@ j@@ ect the H@@ ol@@ y O@@ n@@ e of I@@ s@@ ra@@ el - H@@ e ex@@ h@@ or@@ ts his s@@ ons to put on the ar@@ m@@ or of righ@@ te@@ ous@@ ness . +1 And now it c@@ ame to pas@@ s that after I , N@@ e@@ ph@@ i , had made an end of te@@ ac@@ h@@ ing my bre@@ th@@ ren , our a fa@@ ther , Le@@ h@@ i , also sp@@ a@@ ke many things unto them , and re@@ hear@@ sed unto them , how great things the Lord had d@@ one for them in br@@ ing@@ ing them out of the land of J@@ er@@ us@@ al@@ em . +2 And he sp@@ a@@ ke unto them con@@ c@@ er@@ ning their a re@@ b@@ el@@ li@@ ons upon the waters , and the merci@@ es of God in sp@@ ar@@ ing their li@@ ves , that they were not s@@ w@@ allo@@ w@@ ed up in the se@@ a . 
+3 And he also sp@@ a@@ ke unto them con@@ c@@ er@@ ning the land of pro@@ mis@@ e , which they had ob@@ ta@@ ined - how a merci@@ ful the Lord had been in b war@@ ning us that we should f@@ le@@ e out of the land of J@@ er@@ us@@ al@@ em . +4 F@@ or , behold , said he , I have a se@@ en a b vis@@ i@@ on , in which I know that c J@@ er@@ us@@ al@@ em is d de@@ st@@ ro@@ y@@ ed ; and had we re@@ ma@@ ined in J@@ er@@ us@@ al@@ em we should also have e per@@ ished . +5 B@@ ut , said he , n@@ ot@@ with@@ st@@ an@@ ding our a@@ ff@@ lic@@ tions , we have ob@@ ta@@ ined a a land of pro@@ mis@@ e , a land which is b cho@@ ice ab@@ o@@ ve all other l@@ ands ; a land which the Lord God hath c co@@ v@@ en@@ an@@ ted with me should be a land for the in@@ h@@ er@@ it@@ ance of my se@@ ed . +yea , the Lord hath d co@@ v@@ en@@ an@@ ted this land unto me , and to my children forever , and also all those who should be e l@@ ed out of other coun@@ tri@@ es by the h@@ and of the Lord . +6 Wherefore , I , Le@@ h@@ i , pro@@ ph@@ es@@ y according to the wor@@ k@@ ings of the S@@ p@@ irit which is in me , that there shall a n@@ one come into this land sa@@ ve they shall be b@@ rou@@ ght by the h@@ and of the Lord . +7 Wherefore , this a land is con@@ sec@@ r@@ ated unto him wh@@ om he shall br@@ ing . +and if it so be that they shall ser@@ ve him according to the command@@ ments which he hath given , it shall be a land of b li@@ b@@ er@@ ty unto them ; wherefore , they shall n@@ ever be b@@ rou@@ ght down into cap@@ tiv@@ ity ; if so , it shall be because of ini@@ qu@@ ity ; for if ini@@ qu@@ ity shall ab@@ ound c cur@@ sed shall be the land for their sa@@ k@@ es , but unto the righ@@ te@@ ous it shall be bless@@ ed forever . +8 And behold , it is w@@ is@@ dom that this land should be a k@@ ep@@ t as y@@ et from the know@@ le@@ d@@ ge of other b n@@ ations ; for behold , many n@@ ations would o@@ ver@@ ru@@ n the land , that there would be no place for an in@@ h@@ er@@ it@@ ance . 
+9 Wherefore , I , Le@@ h@@ i , have ob@@ ta@@ ined a a pro@@ mis@@ e , that b in@@ as@@ much as those wh@@ om the Lord God shall br@@ ing out of the land of J@@ er@@ us@@ al@@ em shall ke@@ ep his command@@ ments , they shall c pro@@ sp@@ er upon the face of this land ; and they shall be k@@ ep@@ t from all other n@@ ations , that they may pos@@ s@@ ess this land unto the@@ m@@ sel@@ ves . +and if it so be that they shall d ke@@ ep his command@@ ments they shall be bless@@ ed upon the face of this land , and there shall be n@@ one to m@@ ol@@ est them , nor to take a@@ way the land of their e in@@ h@@ er@@ it@@ ance ; and they shall d@@ well sa@@ f@@ ely forever . +11 Y@@ e@@ a , he will br@@ ing a other n@@ ations unto them , and he will gi@@ ve unto them power , and he will take a@@ way from them the l@@ ands of their pos@@ s@@ essi@@ ons , and he will c@@ ause them to be b s@@ cat@@ ter@@ ed and sm@@ it@@ ten . +1@@ 2 Y@@ e@@ a , as one gener@@ ation pas@@ s@@ eth to an@@ other there shall be a bl@@ o@@ o@@ d@@ she@@ ds , and great vis@@ it@@ ations among them ; wherefore , my s@@ ons , I would that ye would rem@@ ember ; yea , I would that ye would hear@@ ken unto my words . +1@@ 3 O that ye would a@@ wa@@ ke ; a@@ wa@@ ke from a de@@ ep a s@@ le@@ ep , yea , even from the s@@ le@@ ep of b h@@ ell , and sh@@ a@@ ke o@@ ff the a@@ w@@ ful c ch@@ a@@ ins by which ye are b@@ ound , which are the ch@@ a@@ ins which b@@ in@@ d the children of men , that they are c@@ ar@@ ri@@ ed a@@ way cap@@ tive down to the et@@ ernal d g@@ ul@@ f of mis@@ ery and w@@ o@@ e . +and ar@@ is@@ e from the d@@ ust , and he@@ ar the words of a t@@ rem@@ bl@@ ing a pa@@ rent , wh@@ ose li@@ m@@ b@@ s ye must so@@ on l@@ ay down in the col@@ d and s@@ il@@ ent b g@@ ra@@ ve , from wh@@ ence no tr@@ av@@ el@@ er can c re@@ tur@@ n ; a few more d days and I go the e way of all the earth . 
+15 B@@ ut behold , the Lord hath a re@@ de@@ em@@ ed my sou@@ l from h@@ ell ; I have be@@ hel@@ d his b gl@@ ory , and I am enc@@ ir@@ cl@@ ed about et@@ er@@ n@@ ally in the c ar@@ ms of his d lo@@ ve . +1@@ 6 And I des@@ ir@@ e that ye should rem@@ ember to ob@@ ser@@ ve the a st@@ at@@ ut@@ es and the j@@ ud@@ g@@ ments of the Lord ; behold , this hath been the an@@ x@@ i@@ ety of my sou@@ l from the begin@@ ning . +1@@ 8 O@@ r , that a a cur@@ s@@ ing should come upon you for the sp@@ ace of b many gener@@ ations ; and ye are vis@@ ited by s@@ wor@@ d , and by f@@ am@@ ine , and are h@@ ated , and are l@@ ed according to the will and cap@@ tiv@@ ity of the c dev@@ il . +19 O my s@@ ons , that these things might not come upon you , but that ye might be a cho@@ ice and a a f@@ av@@ ore@@ d people of the Lord . +but behold , his will be d@@ one ; for his b w@@ ays are righ@@ te@@ ous@@ ness forever . +20 And he hath said that : a In@@ as@@ much as ye shall ke@@ ep my b command@@ ments ye shall c pro@@ sp@@ er in the land ; but in@@ as@@ much as ye will not ke@@ ep my command@@ ments ye shall be c@@ ut o@@ ff from my pres@@ ence . +2@@ 2 Th@@ at ye may not be a cur@@ sed with a s@@ ore cur@@ s@@ ing ; and also , that ye may not in@@ c@@ ur the dis@@ pl@@ eas@@ ure of a b just God upon you , unto the de@@ struc@@ tion , yea , the et@@ ernal de@@ struc@@ tion of both sou@@ l and bo@@ d@@ y . +2@@ 3 A@@ wa@@ ke , my s@@ ons ; put on the ar@@ m@@ or of a righ@@ te@@ ous@@ ness . +sh@@ a@@ ke o@@ ff the b ch@@ a@@ ins with which ye are b@@ ound , and come for@@ th out of ob@@ s@@ cur@@ ity , and ar@@ is@@ e from the d@@ ust . 
+2@@ 5 And I exc@@ e@@ ed@@ ing@@ ly fe@@ ar and t@@ rem@@ ble because of you , l@@ est he shall su@@ ff@@ er ag@@ ain ; for behold , ye have a acc@@ used him that he sou@@ ght power and b au@@ th@@ or@@ ity over you ; but I know that he hath not sou@@ ght for power nor au@@ th@@ or@@ ity over you , but he hath sou@@ ght the gl@@ ory of God , and your own et@@ ernal w@@ el@@ f@@ are . +2@@ 6 And ye have m@@ ur@@ mu@@ red because he hath been pl@@ ain unto you . +ye say that he hath used a sh@@ ar@@ p@@ ness ; ye say that he hath been an@@ gr@@ y with you ; but behold , his b sh@@ ar@@ p@@ ness was the sh@@ ar@@ p@@ ness of the power of the wor@@ d of God , which was in him ; and that which ye c@@ all ang@@ er was the tr@@ u@@ th , according to that which is in God , which he could not re@@ st@@ ra@@ in , man@@ i@@ f@@ est@@ ing b@@ ol@@ d@@ ly con@@ c@@ er@@ ning your ini@@ qui@@ ties . +2@@ 7 And it must needs be that the a power of God must be with him , even unto his comman@@ ding you that ye must o@@ be@@ y . +but behold , it was not he , but it was the b S@@ p@@ irit of the Lord which was in him , which c op@@ en@@ ed his m@@ ou@@ th to ut@@ ter@@ ance that he could not sh@@ ut it . +2@@ 8 And now my s@@ on , L@@ am@@ an , and also L@@ em@@ u@@ el and S@@ am , and also my s@@ ons who are the s@@ ons of I@@ sh@@ ma@@ el , behold , if ye will hear@@ ken unto the vo@@ ice of N@@ e@@ ph@@ i ye shall not per@@ ish . +and if ye will hear@@ ken unto him I le@@ a@@ ve unto you a a bless@@ ing , yea , even my first bless@@ ing . +2@@ 9 B@@ ut if ye will not hear@@ ken unto him I take a@@ way my a first bless@@ ing , yea , even my bless@@ ing , and it shall rest upon him . 
+3@@ 0 And now , Z@@ or@@ am , I spea@@ k unto you : behold , thou ar@@ t the a serv@@ ant of L@@ ab@@ an ; n@@ ever@@ th@@ el@@ ess , thou h@@ ast been b@@ rou@@ ght out of the land of J@@ er@@ us@@ al@@ em , and I know that thou ar@@ t a tr@@ ue b f@@ ri@@ end unto my s@@ on , N@@ e@@ ph@@ i , forever . +3@@ 1 Wherefore , because thou h@@ ast been fa@@ i@@ th@@ ful thy se@@ ed shall be bless@@ ed a with his se@@ ed , that they d@@ well in pro@@ sp@@ er@@ ity l@@ ong upon the face of this land ; and no@@ thing , sa@@ ve it shall be ini@@ qu@@ ity among them , shall h@@ ar@@ m or dist@@ ur@@ b their pro@@ sp@@ er@@ ity upon the face of this land forever . +3@@ 2 Wherefore , if ye shall ke@@ ep the command@@ ments of the Lord , the Lord hath con@@ sec@@ r@@ ated this land for the sec@@ ur@@ ity of thy se@@ ed with the se@@ ed of my s@@ on . +re@@ d@@ em@@ p@@ tion com@@ eth through the H@@ ol@@ y M@@ essi@@ a@@ h - F@@ re@@ e@@ dom of cho@@ ice ( ag@@ enc@@ y ) is ess@@ enti@@ al to ex@@ i@@ st@@ ence and prog@@ res@@ sion - A@@ d@@ am f@@ ell that men might be - M@@ en are free to choose li@@ b@@ er@@ ty and et@@ ernal li@@ fe . +1 And now , J@@ ac@@ o@@ b , I spea@@ k unto you : thou ar@@ t my a first -@@ b@@ or@@ n in the days of my t@@ ribu@@ l@@ ation in the wil@@ der@@ ness . +and behold , in thy child@@ ho@@ od thou h@@ ast su@@ ffer@@ ed a@@ ff@@ lic@@ tions and much s@@ or@@ ro@@ w , because of the r@@ ud@@ en@@ ess of thy bre@@ th@@ ren . +2 N@@ ever@@ th@@ el@@ ess , J@@ ac@@ o@@ b , my first @-@ b@@ or@@ n in the wil@@ der@@ ness , thou know@@ est the gre@@ at@@ ness of God ; and he shall con@@ sec@@ r@@ ate th@@ ine a a@@ ff@@ lic@@ tions for thy g@@ ain . +3 Wherefore , thy sou@@ l shall be bless@@ ed , and thou shalt d@@ well sa@@ f@@ ely with thy b@@ ro@@ ther , N@@ e@@ ph@@ i ; and thy days shall be a sp@@ ent in the serv@@ ice of thy God . 
+Wherefore , I know that thou ar@@ t re@@ de@@ em@@ ed , because of the righ@@ te@@ ous@@ ness of thy R@@ e@@ de@@ em@@ er ; for thou h@@ ast b be@@ hel@@ d that in the c ful@@ ness of time he com@@ eth to br@@ ing s@@ al@@ v@@ ation unto men . +4 And thou h@@ ast a be@@ hel@@ d in thy y@@ ou@@ th his gl@@ ory ; wherefore , thou ar@@ t bless@@ ed even as they unto wh@@ om he shall m@@ ini@@ st@@ er in the fl@@ es@@ h ; for the S@@ p@@ irit is the same , y@@ est@@ er@@ day , to@@ day , and forever . +and the way is pre@@ pa@@ red from the f@@ all of man , and b s@@ al@@ v@@ ation is c free . +5 And men are in@@ struc@@ ted su@@ ff@@ ici@@ ent@@ ly that they a know good from ev@@ il . +and the b law is given unto men . +and by the law no fl@@ es@@ h is c j@@ us@@ ti@@ fied ; or , by the law men are d c@@ ut o@@ ff . +yea , by the t@@ em@@ por@@ al law they were c@@ ut o@@ ff ; and also , by the sp@@ ir@@ it@@ ual law they per@@ ish from that which is good , and b@@ ecom@@ e mis@@ er@@ able forever . +6 Wherefore , a re@@ d@@ em@@ p@@ tion com@@ eth in and through the b H@@ ol@@ y c M@@ essi@@ a@@ h ; for he is full of d gr@@ ace and tr@@ u@@ th . +7 B@@ e@@ hold , he o@@ ffer@@ eth hi@@ m@@ self a a s@@ ac@@ ri@@ f@@ ice for s@@ in , to ans@@ w@@ er the en@@ ds of the law , unto all those who have a b@@ ro@@ ken hear@@ t and a con@@ tr@@ ite sp@@ irit ; and unto b n@@ one el@@ se can the c en@@ ds of the law be ans@@ w@@ er@@ ed . +9 Wherefore , he is the fir@@ st@@ f@@ ru@@ its unto God , in@@ as@@ much as he shall make a inter@@ c@@ es@@ sion for all the children of men ; and they that b@@ eli@@ e@@ ve in him shall be s@@ av@@ ed . +1@@ 0 And because of the inter@@ c@@ es@@ sion for a all , all men come unto God ; wherefore , they st@@ and in the pres@@ ence of him , to be b j@@ ud@@ g@@ ed of him according to the tr@@ u@@ th and c hol@@ in@@ ess which is in him . +11 F@@ or it must needs be , that there is an a op@@ pos@@ i@@ tion in all things . 
+if not so , my first @-@ b@@ or@@ n in the wil@@ der@@ ness , righ@@ te@@ ous@@ ness could not be b@@ rou@@ ght to pas@@ s , ne@@ i@@ ther w@@ ic@@ k@@ ed@@ ness , ne@@ i@@ ther hol@@ in@@ ess nor mis@@ ery , ne@@ i@@ ther good nor b@@ ad . +Wherefore , all things must needs be a com@@ p@@ ound in one ; wherefore , if it should be one bo@@ d@@ y it must needs re@@ ma@@ in as de@@ ad , having no li@@ fe ne@@ i@@ ther de@@ ath , nor cor@@ r@@ up@@ tion nor in@@ cor@@ r@@ up@@ tion , h@@ app@@ in@@ ess nor mis@@ ery , ne@@ i@@ ther s@@ ens@@ e nor in@@ s@@ en@@ si@@ b@@ il@@ ity . +1@@ 2 Wherefore , it must needs have been cre@@ ated for a thing of n@@ au@@ ght ; wherefore there would have been no a pur@@ p@@ ose in the end of its cre@@ ation . +Wherefore , this thing must needs de@@ st@@ ro@@ y the w@@ is@@ dom of God and his et@@ ernal pur@@ pos@@ es , and also the power , and the mer@@ c@@ y , and the b j@@ us@@ ti@@ ce of God . +1@@ 3 And if ye shall say there is a no law , ye shall also say there is no s@@ in . +if ye shall say there is no s@@ in , ye shall also say there is no righ@@ te@@ ous@@ ness . +and if there be no righ@@ te@@ ous@@ ness there be no h@@ app@@ in@@ ess . +and if there be no righ@@ te@@ ous@@ ness nor h@@ app@@ in@@ ess there be no p@@ un@@ ish@@ ment nor mis@@ ery . +and if these things are not b there is no God . +and if there is no God we are not , ne@@ i@@ ther the earth ; for there could have been no cre@@ ation of things , ne@@ i@@ ther to act nor to be ac@@ ted upon ; wherefore , all things must have v@@ an@@ ished a@@ way . +1@@ 4 And now , my s@@ ons , I spea@@ k unto you these things for your pro@@ f@@ it and a lear@@ ning ; for there is a God , and he hath b cre@@ ated all things , both the he@@ av@@ en@@ s and the earth , and all things that in them are , both things to act and things to be c ac@@ ted upon . +1@@ 6 Wherefore , the Lord God g@@ a@@ ve unto man that he should a act for hi@@ m@@ self . 
+Wherefore , man could not b act for hi@@ m@@ self sa@@ ve it should be that he was c enti@@ c@@ ed by the one or the other . +1@@ 7 And I , Le@@ h@@ i , according to the things which I have re@@ ad , must needs sup@@ p@@ ose that an a ang@@ el of God , according to that which is writ@@ ten , had b f@@ all@@ en from he@@ av@@ en ; wherefore , he bec@@ ame a c dev@@ il , having sou@@ ght that which was ev@@ il before God . +1@@ 8 And because he had f@@ all@@ en from he@@ av@@ en , and had b@@ ecom@@ e mis@@ er@@ able forever , he a sou@@ ght also the mis@@ ery of all man@@ k@@ in@@ d . +Wherefore , he said unto E@@ ve , yea , even that old ser@@ p@@ ent , who is the dev@@ il , who is the fa@@ ther of all b li@@ es , wherefore he said : par@@ take of the for@@ b@@ id@@ den f@@ ru@@ it , and ye shall not di@@ e , but ye shall be as God , c know@@ ing good and ev@@ il . +19 And after A@@ d@@ am and E@@ ve had a par@@ ta@@ ken of the for@@ b@@ id@@ den f@@ ru@@ it they were d@@ ri@@ ven out of the g@@ ar@@ den of b E@@ den , to ti@@ ll the earth . +20 And they have b@@ rou@@ ght for@@ th children ; yea , even the a family of all the earth . +2@@ 1 And the days of the children of a men were pro@@ l@@ ong@@ ed , according to the b will of God , that they might c rep@@ ent while in the fl@@ es@@ h ; wherefore , their st@@ ate bec@@ ame a st@@ ate of d pro@@ b@@ ation , and their time was l@@ en@@ g@@ th@@ en@@ ed , according to the command@@ ments which the Lord God g@@ a@@ ve unto the children of men . +for he g@@ a@@ ve command@@ ment that all men must rep@@ ent ; for he sh@@ ow@@ ed unto all men that they were e l@@ ost , because of the trans@@ g@@ res@@ sion of their pa@@ ren@@ ts . +2@@ 2 And now , behold , if A@@ d@@ am had not trans@@ g@@ res@@ sed he would not have f@@ all@@ en , but he would have re@@ ma@@ ined in the g@@ ar@@ den of E@@ den . 
+and all things which were cre@@ ated must have re@@ ma@@ ined in the same st@@ ate in which they were after they were cre@@ ated ; and they must have re@@ ma@@ ined forever , and had no end . +2@@ 3 And they would have had no a children ; wherefore they would have re@@ ma@@ ined in a st@@ ate of in@@ no@@ c@@ ence , having no b jo@@ y , for they k@@ new no mis@@ ery ; do@@ ing no good , for they k@@ new no c s@@ in . +2@@ 4 B@@ ut behold , all things have been d@@ one in the w@@ is@@ dom of him who a know@@ eth all things . +2@@ 5 a A@@ d@@ am b f@@ ell that men might be ; and men c are , that they might have d jo@@ y . +2@@ 6 And the a M@@ essi@@ a@@ h com@@ eth in the ful@@ ness of time , that he may b re@@ de@@ em the children of men from the f@@ all . +and because that they are c re@@ de@@ em@@ ed from the f@@ all they have b@@ ecom@@ e d free forever , know@@ ing good from ev@@ il ; to act for the@@ m@@ sel@@ ves and not to be ac@@ ted upon , sa@@ ve it be by the p@@ un@@ ish@@ ment of the e law at the great and last day , according to the command@@ ments which God hath given . +2@@ 7 Wherefore , men are a free according to the b fl@@ es@@ h ; and c all things are d given them which are ex@@ pe@@ di@@ ent unto man . +and they are free to e choose f li@@ b@@ er@@ ty and et@@ ernal g li@@ fe , through the great Medi@@ ator of all men , or to choose cap@@ tiv@@ ity and de@@ ath , according to the cap@@ tiv@@ ity and power of the dev@@ il ; for he se@@ e@@ k@@ eth that all men might be h mis@@ er@@ able like unto hi@@ m@@ self . +2@@ 9 And not choose et@@ ernal de@@ ath , according to the will of the fl@@ es@@ h and the a ev@@ il which is th@@ ere@@ in , which gi@@ v@@ eth the sp@@ irit of the dev@@ il power to b cap@@ tiv@@ ate , to br@@ ing you down to c h@@ ell , that he may re@@ ign over you in his own k@@ ing@@ dom . 
+3@@ 0 I have sp@@ ok@@ en these few words unto you all , my s@@ ons , in the last days of my pro@@ b@@ ation ; and I have ch@@ os@@ en the good part , according to the words of the pro@@ ph@@ et . +and I have n@@ one other ob@@ j@@ ect sa@@ ve it be the ever@@ la@@ sting a w@@ el@@ f@@ are of your sou@@ l@@ s . +the Gods fin@@ ish their pl@@ an@@ ning of the cre@@ ation of all things - T@@ he@@ y br@@ ing to pas@@ s the cre@@ ation according to their pl@@ ans - A@@ d@@ am n@@ ames every li@@ ving cre@@ at@@ ure . +1 And th@@ us we will fin@@ ish the he@@ av@@ en@@ s and the earth , and all the a ho@@ st@@ s of them . +2 And the Gods said among the@@ m@@ sel@@ ves : on the s@@ ev@@ en@@ th time we will end our work , which we have coun@@ sel@@ ed ; and we will a rest on the b s@@ ev@@ en@@ th time from all our work which we have coun@@ sel@@ ed . +3 And the Gods con@@ cl@@ ud@@ ed upon the s@@ ev@@ en@@ th time , because that on the s@@ ev@@ en@@ th time they would a rest from all their b works which they ( the Gods ) coun@@ sel@@ ed among the@@ m@@ sel@@ ves to for@@ m ; and c s@@ anc@@ ti@@ fied it . +and th@@ us were their dec@@ is@@ i@@ ons at the time that they coun@@ sel@@ ed among the@@ m@@ sel@@ ves to for@@ m the he@@ av@@ en@@ s and the earth . +5 A@@ c@@ c@@ ording to all that which they had said con@@ c@@ er@@ ning every pl@@ ant of the fi@@ el@@ d before it was in the a earth , and every h@@ er@@ b of the fi@@ el@@ d before it gre@@ w ; for the Gods had not c@@ a@@ used it to ra@@ in upon the earth when they coun@@ sel@@ ed to do them , and had not form@@ ed a man to ti@@ ll the grou@@ nd . +6 B@@ ut there w@@ ent up a mis@@ t from the earth , and wat@@ er@@ ed the wh@@ ol@@ e face of the grou@@ nd . 
+7 And the a Gods form@@ ed man from the b d@@ ust of the grou@@ nd , and to@@ ok his c sp@@ irit ( that is , the man 's sp@@ irit ) , and put it into him ; and bre@@ a@@ th@@ ed into his no@@ str@@ il@@ s the bre@@ ath of li@@ fe , and man bec@@ ame a li@@ ving d sou@@ l . +8 And the Gods pl@@ an@@ ted a g@@ ar@@ den , ea@@ st@@ w@@ ard in a E@@ den , and there they put the man , wh@@ ose sp@@ irit they had put into the bo@@ d@@ y which they had form@@ ed . +9 And out of the grou@@ nd made the Gods to g@@ ro@@ w every t@@ ree that is pl@@ eas@@ ant to the si@@ ght and good for f@@ o@@ od ; the a t@@ ree of li@@ fe , also , in the m@@ id@@ st of the g@@ ar@@ den , and the t@@ ree of know@@ le@@ d@@ ge of good and ev@@ il . +1@@ 0 T@@ here was a ri@@ ver r@@ un@@ ning out of E@@ den , to wat@@ er the g@@ ar@@ den , and from th@@ ence it was par@@ ted and bec@@ ame into f@@ our he@@ ad@@ s . +11 And the Gods to@@ ok the man and put him in the G@@ ar@@ den of E@@ den , to d@@ ress it and to ke@@ ep it . +1@@ 3 B@@ ut of the t@@ ree of know@@ le@@ d@@ ge of good and ev@@ il , thou shalt not e@@ at of it ; for in the time that thou e@@ at@@ est th@@ ere@@ of , thou shalt su@@ re@@ ly di@@ e . +now I , A@@ b@@ ra@@ ha@@ m , sa@@ w that it was after the Lord 's a time , which was after the time of b K@@ ol@@ o@@ b ; for as y@@ et the Gods had not app@@ o@@ in@@ ted unto A@@ d@@ am his rec@@ k@@ on@@ ing . +1@@ 4 And the Gods said : let us make an help me@@ et for the man , for it is not good that the man should be al@@ one , th@@ erefore we will for@@ m an help me@@ et for him . +1@@ 6 And of the ri@@ b which the Gods had ta@@ ken from man , form@@ ed they a a w@@ om@@ an , and b@@ rou@@ ght h@@ er unto the man . +1@@ 8 T@@ herefore shall a man le@@ a@@ ve his fa@@ ther and his m@@ other , and shall a cl@@ ea@@ ve unto his wi@@ fe , and they shall be b one fl@@ es@@ h . +19 And they were both na@@ k@@ ed , the man and his wi@@ fe , and were not a as@@ h@@ am@@ ed . 
+20 And out of the grou@@ nd the Gods form@@ ed every be@@ ast of the fi@@ el@@ d , and every f@@ ow@@ l of the a@@ ir , and b@@ rou@@ ght them unto A@@ d@@ am to see what he would c@@ all them ; and wh@@ at@@ so@@ ever a A@@ d@@ am called every li@@ ving cre@@ at@@ ure , that should be the name th@@ ere@@ of . +2@@ 1 And A@@ d@@ am g@@ a@@ ve a n@@ ames to all b cat@@ tl@@ e , to the f@@ ow@@ l of the a@@ ir , to every be@@ ast of the fi@@ el@@ d ; and for A@@ d@@ am , there was f@@ ound an c help me@@ et for him . +men are called as h@@ ig@@ h pri@@ ests because of their exc@@ e@@ ed@@ ing fa@@ ith and good works - T@@ he@@ y are to te@@ ach the command@@ ments - Th@@ rou@@ gh righ@@ te@@ ous@@ ness they are s@@ anc@@ ti@@ fied and enter into the rest of the Lord - M@@ el@@ ch@@ iz@@ e@@ de@@ k was one of these - An@@ g@@ els are dec@@ l@@ ar@@ ing gl@@ ad ti@@ d@@ ings throu@@ g@@ h@@ out the land - T@@ he@@ y will rev@@ e@@ al the ac@@ t@@ ual com@@ ing of Ch@@ r@@ ist . +1 And ag@@ ain , my bre@@ th@@ ren , I would c@@ ite your m@@ in@@ ds for@@ w@@ ard to the time when the Lord God g@@ a@@ ve these command@@ ments unto his children ; and I would that ye should rem@@ ember that the Lord God a ord@@ a@@ ined pri@@ ests , after his hol@@ y order , which was after the order of his S@@ on , to te@@ ach these things unto the people . +2 And those pri@@ ests were ord@@ a@@ ined after the a order of his S@@ on , in a b man@@ ner that th@@ ere@@ by the people might know in what man@@ ner to look for@@ w@@ ard to his S@@ on for re@@ d@@ em@@ p@@ tion . +4 And th@@ us they have been a called to this hol@@ y c@@ all@@ ing on acc@@ ount of their fa@@ ith , while o@@ th@@ ers would re@@ j@@ ect the S@@ p@@ irit of God on acc@@ ount of the h@@ ar@@ d@@ ness of their hear@@ ts and b bl@@ ind@@ ness of their m@@ in@@ ds , while , if it had not been for this they might have had as great c pri@@ v@@ il@@ e@@ ge as their bre@@ th@@ ren . 
+9 Th@@ us they b@@ ecom@@ e a h@@ ig@@ h pri@@ ests forever , after the order of the S@@ on , the O@@ n@@ ly B@@ e@@ g@@ ot@@ ten of the F@@ a@@ ther , who is without begin@@ ning of days or end of years , who is full of b gr@@ ace , e@@ qu@@ ity , and tr@@ u@@ th . +11 T@@ herefore they were called after this hol@@ y order , and were a s@@ anc@@ ti@@ fied , and their b g@@ ar@@ ments were w@@ as@@ h@@ ed wh@@ ite through the bl@@ o@@ od of the L@@ am@@ b . +1@@ 2 N@@ ow they , after being a s@@ anc@@ ti@@ fied by the b H@@ ol@@ y G@@ h@@ ost , having their g@@ ar@@ ments made wh@@ ite , being c pur@@ e and sp@@ ot@@ l@@ ess before God , could not look upon d s@@ in sa@@ ve it were with e ab@@ h@@ or@@ ren@@ ce ; and there were many , exc@@ e@@ ed@@ ing@@ ly great many , who were made pur@@ e and en@@ ter@@ ed into the rest of the Lord their God . +1@@ 3 And now , my bre@@ th@@ ren , I would that ye should h@@ um@@ ble y@@ our@@ sel@@ ves before God , and br@@ ing for@@ th a f@@ ru@@ it me@@ et for rep@@ ent@@ ance , that ye may also enter into that rest . +1@@ 4 Y@@ e@@ a , h@@ um@@ ble y@@ our@@ sel@@ ves even as the people in the days of a M@@ el@@ ch@@ iz@@ e@@ de@@ k , who was also a h@@ ig@@ h pri@@ est after this same order which I have sp@@ ok@@ en , who also to@@ ok upon him the h@@ ig@@ h pri@@ es@@ th@@ o@@ od forever . +15 And it was this same M@@ el@@ ch@@ iz@@ e@@ de@@ k to wh@@ om A@@ b@@ ra@@ ha@@ m pa@@ id a ti@@ th@@ es ; yea , even our fa@@ ther A@@ b@@ ra@@ ha@@ m pa@@ id ti@@ th@@ es of one @-@ ten@@ th part of all he pos@@ s@@ ess@@ ed . +1@@ 6 N@@ ow these a ord@@ in@@ anc@@ es were given after this b man@@ ner , that th@@ ere@@ by the people might look for@@ w@@ ard on the S@@ on of God , it being a c typ@@ e of his order , or it being his order , and this that they might look for@@ w@@ ard to him for a rem@@ is@@ sion of their s@@ ins , that they might enter into the rest of the Lord . 
+1@@ 8 B@@ ut M@@ el@@ ch@@ iz@@ e@@ de@@ k having ex@@ er@@ c@@ is@@ ed mig@@ h@@ ty fa@@ ith , and recei@@ ved the off@@ ice of the h@@ ig@@ h pri@@ es@@ th@@ o@@ od according to the a hol@@ y order of God , di@@ d pre@@ ach rep@@ ent@@ ance unto his people . +and behold , they di@@ d rep@@ ent ; and M@@ el@@ ch@@ iz@@ e@@ de@@ k di@@ d est@@ abl@@ ish pe@@ ace in the land in his days ; th@@ erefore he was called the pr@@ ince of pe@@ ace , for he was the king of S@@ al@@ em ; and he di@@ d re@@ ign un@@ der his fa@@ ther . +19 N@@ ow , there were a many before him , and also there were many af@@ ter@@ war@@ ds , but b n@@ one were gre@@ at@@ er ; th@@ erefore , of him they have more par@@ tic@@ ul@@ ar@@ ly made m@@ en@@ tion . +20 N@@ ow I need not re@@ hear@@ se the mat@@ ter ; what I have said may su@@ ff@@ ice . +behold , the a s@@ c@@ ri@@ p@@ t@@ ures are before you ; if ye will b w@@ rest them it shall be to your own de@@ struc@@ tion . +2@@ 2 Y@@ e@@ a , and the vo@@ ice of the Lord , by the a m@@ ou@@ th of ang@@ els , d@@ oth dec@@ l@@ are it unto all n@@ ations ; yea , d@@ oth dec@@ l@@ are it , that they may have gl@@ ad ti@@ d@@ ings of great jo@@ y ; yea , and he d@@ oth s@@ ound these gl@@ ad ti@@ d@@ ings among all his people , yea , even to them that are s@@ cat@@ ter@@ ed ab@@ ro@@ ad upon the face of the earth ; wherefore they have come unto us . +2@@ 3 And they are made know@@ n unto us in a pl@@ ain ter@@ ms , that we may un@@ der@@ st@@ and , that we c@@ an@@ not er@@ r ; and this because of our being b w@@ an@@ der@@ ers in a str@@ an@@ ge land ; th@@ erefore , we are th@@ us h@@ igh@@ ly f@@ av@@ ore@@ d , for we have these gl@@ ad ti@@ d@@ ings dec@@ la@@ red unto us in all par@@ ts of our v@@ in@@ e@@ y@@ ard . 
+2@@ 4 F@@ or behold , a ang@@ els are dec@@ l@@ ar@@ ing it unto many at this time in our land ; and this is for the pur@@ p@@ ose of pre@@ par@@ ing the hear@@ ts of the children of men to receive his wor@@ d at the time of his com@@ ing in his gl@@ ory . +2@@ 5 And now we only wa@@ it to he@@ ar the jo@@ y@@ ful ne@@ ws dec@@ la@@ red unto us by the m@@ ou@@ th of ang@@ els , of his com@@ ing ; for the time com@@ eth , we a know not how so@@ on . +would to God that it might be in my day ; but let it be so@@ on@@ er or l@@ at@@ er , in it I will re@@ jo@@ ice . +2@@ 6 And it shall be made know@@ n unto a just and hol@@ y men , by the m@@ ou@@ th of ang@@ els , at the time of his com@@ ing , that the words of our fa@@ th@@ ers may be ful@@ fil@@ l@@ ed , according to that which they have sp@@ ok@@ en con@@ c@@ er@@ ning him , which was according to the sp@@ irit of pro@@ ph@@ ec@@ y which was in them . +2@@ 9 a H@@ av@@ ing fa@@ ith on the Lord ; having a hop@@ e that ye shall receive et@@ ernal li@@ fe ; having the b lo@@ ve of God al@@ w@@ ays in your hear@@ ts , that ye may be li@@ f@@ ted up at the last day and enter into his c rest . +3@@ 0 And may the Lord gr@@ ant unto you rep@@ ent@@ ance , that ye may not br@@ ing down his wr@@ ath upon you , that ye may not be a b@@ ound down by the ch@@ a@@ ins of b h@@ ell , that ye may not su@@ ff@@ er the sec@@ on@@ d c de@@ ath . +3@@ 1 And Al@@ m@@ a sp@@ a@@ ke many more words unto the people , which are not writ@@ ten in a this bo@@ ok . +rev@@ el@@ ation given through J@@ os@@ ep@@ h S@@ m@@ ith , at M@@ an@@ ch@@ est@@ er , N@@ ew Y@@ or@@ k , M@@ ar@@ ch 1@@ 8@@ 3@@ 0 . H@@ C 1 : 7@@ 2 @-@ 7@@ 4 . 
+1 @-@ 3 , Ch@@ r@@ ist has all power ; 4 @-@ 5 , A@@ ll men must rep@@ ent or su@@ ff@@ er ; 6 @-@ 1@@ 2 , E@@ ter@@ nal p@@ un@@ ish@@ ment is God 's p@@ un@@ ish@@ ment ; 1@@ 3 @-@ 20 , Ch@@ r@@ ist su@@ ffer@@ ed for all , that they might not su@@ ff@@ er if they would rep@@ ent ; 2@@ 1 @-@ 2@@ 8 , P@@ re@@ ach the g@@ os@@ pe@@ l of rep@@ ent@@ ance ; 2@@ 9 @-@ 4@@ 1 , D@@ ec@@ l@@ are gl@@ ad ti@@ d@@ ings . +1 I am a Al@@ ph@@ a and O@@ me@@ g@@ a , b Ch@@ r@@ ist the Lord ; yea , even I am he , the begin@@ ning and the end , the R@@ e@@ de@@ em@@ er of the c world . +3 R@@ et@@ ain@@ ing all a power , even to the b de@@ st@@ ro@@ y@@ ing of S@@ at@@ an and his works at the c end of the world , and the last great day of j@@ ud@@ g@@ ment , which I shall pas@@ s upon the in@@ ha@@ b@@ it@@ an@@ ts th@@ ere@@ of , d j@@ ud@@ g@@ ing every man according to his e works and the de@@ eds which he hath d@@ one . +4 And su@@ re@@ ly every man must a rep@@ ent or b su@@ ff@@ er , for I , God , am c end@@ l@@ ess . +5 Wherefore , I a re@@ vo@@ ke not the j@@ ud@@ g@@ ments which I shall pas@@ s , but w@@ o@@ es shall go for@@ th , we@@ ep@@ ing , b w@@ ail@@ ing and g@@ n@@ as@@ h@@ ing of te@@ eth , yea , to those who are f@@ ound on my c le@@ ft h@@ and . +6 N@@ ever@@ th@@ el@@ ess , it is a not writ@@ ten that there shall be no end to this t@@ or@@ ment , but it is writ@@ ten b end@@ l@@ ess c t@@ or@@ ment . +7 A@@ g@@ ain , it is writ@@ ten a et@@ ernal d@@ am@@ n@@ ation ; wherefore it is more ex@@ p@@ ress than other s@@ c@@ ri@@ p@@ t@@ ures , that it might work upon the hear@@ ts of the children of men , al@@ to@@ ge@@ ther for my name 's gl@@ ory . +8 Wherefore , I will ex@@ pl@@ ain unto you this a m@@ yst@@ ery , for it is me@@ et unto you to know even as mine ap@@ ost@@ l@@ es . +9 I spea@@ k unto you that are ch@@ os@@ en in this thing , even as one , that you may enter into my a rest . 
+1@@ 0 F@@ or , behold , the a m@@ yst@@ ery of go@@ d@@ lin@@ ess , how great is it ! +for , behold , I am b end@@ l@@ ess , and the p@@ un@@ ish@@ ment which is given from my h@@ and is end@@ l@@ ess c p@@ un@@ ish@@ ment , for d En@@ d@@ l@@ ess is my name . +11 a E@@ ter@@ nal p@@ un@@ ish@@ ment is God 's p@@ un@@ ish@@ ment . +15 T@@ herefore I comm@@ and you to rep@@ ent - rep@@ ent , l@@ est I a sm@@ ite you by the ro@@ d of my m@@ ou@@ th , and by my wr@@ ath , and by my ang@@ er , and your b su@@ ffer@@ ings be s@@ ore - how s@@ ore you know not , how ex@@ qu@@ is@@ ite you know not , yea , how h@@ ard to be@@ ar you know not . +19 N@@ ever@@ th@@ el@@ ess , gl@@ ory be to the F@@ a@@ ther , and I par@@ to@@ ok and a fin@@ ished my pre@@ par@@ ations unto the children of men . +20 Wherefore , I comm@@ and you ag@@ ain to rep@@ ent , l@@ est I a h@@ um@@ ble you with my al@@ mig@@ h@@ ty power ; and that you b conf@@ ess your s@@ ins , l@@ est you su@@ ff@@ er these c p@@ un@@ ish@@ ments of which I have sp@@ ok@@ en , of which in the sm@@ all@@ est , yea , even in the le@@ ast de@@ g@@ ree you have d ta@@ st@@ ed at the time I with@@ d@@ re@@ w my S@@ p@@ irit . +2@@ 1 And I comm@@ and you that you a pre@@ ach n@@ au@@ ght but rep@@ ent@@ ance , and sh@@ ow b not these things unto the world un@@ ti@@ l it is w@@ is@@ dom in me . +2@@ 2 F@@ or they c@@ an@@ not a be@@ ar me@@ at now , but b m@@ il@@ k they must receive ; wherefore , they must not know these things , l@@ est they per@@ ish . +2@@ 3 a L@@ ear@@ n of me , and li@@ st@@ en to my words ; b w@@ al@@ k in the c me@@ e@@ k@@ ness of my S@@ p@@ irit , and you shall have d pe@@ ace in me . +2@@ 4 I am J@@ es@@ us Ch@@ r@@ ist ; I a c@@ ame by the b will of the F@@ a@@ ther , and I do his will . +2@@ 5 And ag@@ ain , I comm@@ and the@@ e that thou shalt not a co@@ v@@ et thy b ne@@ igh@@ b@@ or " s c wi@@ fe ; nor se@@ e@@ k thy ne@@ igh@@ b@@ or 's li@@ fe . 
+2@@ 7 W@@ h@@ ich is my wor@@ d to the a G@@ enti@@ l@@ e , that so@@ on it may go to the b J@@ ew , of wh@@ om the L@@ am@@ an@@ it@@ es are a c rem@@ n@@ ant , that they may b@@ eli@@ e@@ ve the g@@ os@@ pe@@ l , and look not for a d M@@ essi@@ a@@ h to come who has al@@ read@@ y come . +2@@ 8 And ag@@ ain , I comm@@ and the@@ e that thou shalt a pr@@ ay b v@@ oc@@ ally as well as in thy hear@@ t ; yea , before the world as well as in sec@@ re@@ t , in publ@@ ic as well as in pri@@ v@@ ate . +2@@ 9 And thou shalt a dec@@ l@@ are gl@@ ad ti@@ d@@ ings , yea , b publ@@ ish it upon the m@@ oun@@ ta@@ ins , and upon every h@@ ig@@ h place , and among every people that thou shalt be per@@ m@@ it@@ ted to see . +3@@ 0 And thou shalt do it with all h@@ um@@ il@@ ity , a tr@@ ust@@ ing in me , b rev@@ il@@ ing not ag@@ ain@@ st rev@@ il@@ ers . +3@@ 1 And of a ten@@ ets thou shalt not t@@ al@@ k , but thou shalt dec@@ l@@ are rep@@ ent@@ ance and b fa@@ ith on the S@@ av@@ i@@ or , and c rem@@ is@@ sion of s@@ ins by d b@@ ap@@ ti@@ s@@ m , and by e fir@@ e , yea , even the f H@@ ol@@ y G@@ h@@ ost . +3@@ 2 B@@ e@@ hold , this is a great and the last a command@@ ment which I shall gi@@ ve unto you con@@ c@@ er@@ ning this mat@@ ter ; for this shall su@@ ff@@ ice for thy d@@ ail@@ y w@@ al@@ k , even unto the end of thy li@@ fe . +3@@ 3 And mis@@ ery thou shalt receive if thou wil@@ t s@@ l@@ ig@@ ht these a coun@@ sel@@ s , yea , even the de@@ struc@@ tion of th@@ y@@ self and pro@@ per@@ ty . +3@@ 4 a I@@ m@@ part a por@@ tion of thy pro@@ per@@ ty , yea , even part of thy l@@ ands , and all sa@@ ve the support of thy b family . +3@@ 5 P@@ ay the a de@@ b@@ t thou h@@ ast b con@@ tr@@ ac@@ ted with the pr@@ in@@ ter . +re@@ le@@ ase th@@ y@@ self from c b@@ on@@ d@@ age . 
+3@@ 7 And a spea@@ k f@@ re@@ ely to all ; yea , pre@@ ach , ex@@ h@@ ort , dec@@ l@@ are the b tr@@ u@@ th , even with a l@@ ou@@ d vo@@ ice , with a s@@ ound of re@@ jo@@ ic@@ ing , c@@ r@@ y@@ ing - H@@ os@@ an@@ n@@ a , h@@ os@@ an@@ n@@ a , bless@@ ed be the name of the Lord God ! +3@@ 8 a Pr@@ ay al@@ w@@ ays , and I will b p@@ our out my S@@ p@@ irit upon you , and great shall be your bless@@ ing - yea , even more than if you should ob@@ ta@@ in c tre@@ as@@ ures of earth and cor@@ r@@ up@@ ti@@ bl@@ en@@ ess to the ex@@ t@@ ent th@@ ere@@ of . +3@@ 9 B@@ e@@ hold , c@@ an@@ st thou re@@ ad this without a re@@ jo@@ ic@@ ing and li@@ ft@@ ing up thy hear@@ t for b gl@@ ad@@ ness ? +4@@ 0 O@@ r c@@ an@@ st thou ru@@ n about l@@ ong@@ er as a a bl@@ in@@ d g@@ u@@ i@@ de ? +4@@ 1 O@@ r c@@ an@@ st thou be a h@@ um@@ ble and me@@ e@@ k , and con@@ duc@@ t th@@ y@@ self w@@ is@@ ely before me ? +rev@@ el@@ ation given to J@@ os@@ ep@@ h S@@ m@@ ith the P@@ ro@@ ph@@ et and S@@ id@@ ney R@@ ig@@ don , at or ne@@ ar F@@ ay@@ et@@ te , N@@ ew Y@@ or@@ k , D@@ ec@@ ember 1@@ 8@@ 3@@ 0 . H@@ C 1 : 12@@ 8 @-@ 1@@ 3@@ 1 . +at this time the P@@ ro@@ ph@@ et was en@@ g@@ ag@@ ed al@@ most d@@ ail@@ y in ma@@ king a trans@@ l@@ ation of the B@@ i@@ ble . +the trans@@ l@@ ation was be@@ g@@ u@@ n as ear@@ ly as J@@ un@@ e 1@@ 8@@ 3@@ 0 , and both O@@ li@@ ver C@@ ow@@ der@@ y and J@@ o@@ h@@ n W@@ h@@ it@@ m@@ er had serv@@ ed as s@@ c@@ ri@@ b@@ es . +since they had now been called to other d@@ u@@ ties , S@@ id@@ ney R@@ ig@@ don was called by di@@ v@@ ine app@@ o@@ in@@ t@@ ment to ser@@ ve as the P@@ ro@@ ph@@ et 's s@@ c@@ ri@@ be in this work ( ver@@ se 20 ) . +as a pre@@ face to his rec@@ ord of this rev@@ el@@ ation the P@@ ro@@ ph@@ et w@@ ro@@ te : " in D@@ ec@@ ember S@@ id@@ ney R@@ ig@@ don c@@ ame &#@@ 9@@ 1@@ ; from O@@ hi@@ o &#@@ 9@@ 3@@ ; to in@@ qu@@ ir@@ e of the Lord , and with him c@@ ame E@@ d@@ w@@ ard P@@ art@@ ri@@ d@@ ge . . 
+1 L@@ i@@ st@@ en to the vo@@ ice of the a Lord your God , even b Al@@ ph@@ a and O@@ me@@ g@@ a , the begin@@ ning and the end , wh@@ ose c c@@ our@@ se is one d et@@ ernal rou@@ nd , the e same to@@ day as y@@ est@@ er@@ day , and forever . +2 I am J@@ es@@ us Ch@@ r@@ ist , the S@@ on of God , who was a c@@ ru@@ ci@@ fied for the s@@ ins of the world , even as many as will b b@@ eli@@ e@@ ve on my name , that they may b@@ ecom@@ e the c s@@ ons of God , even d one in e me as I am f one in the F@@ a@@ ther , as the F@@ a@@ ther is one in me , that we may be one . +3 B@@ e@@ hold , ver@@ il@@ y , ver@@ il@@ y , I say unto my serv@@ ant S@@ id@@ ney , I have lo@@ ok@@ ed upon the@@ e and thy works . +I have a hear@@ d thy pr@@ ay@@ ers , and pre@@ pa@@ red the@@ e for a gre@@ at@@ er work . +4 Th@@ ou ar@@ t bless@@ ed , for thou shalt do great things . +behold thou w@@ ast s@@ ent for@@ th , even as a J@@ o@@ h@@ n , to pre@@ p@@ are the way before me , and before b E@@ li@@ j@@ a@@ h which should come , and thou k@@ ne@@ w@@ est it not . +6 B@@ ut now I gi@@ ve unto the@@ e a command@@ ment , that thou shalt a b@@ ap@@ ti@@ z@@ e by wat@@ er , and they shall receive the b H@@ ol@@ y G@@ h@@ ost by the lay@@ ing on of the c h@@ ands , even as the ap@@ ost@@ l@@ es of old . +7 And it shall come to pas@@ s that there shall be a great work in the land , even among the a G@@ enti@@ l@@ es , for their b fol@@ ly and their ab@@ om@@ in@@ ations shall be made man@@ i@@ f@@ est in the e@@ y@@ es of all people . +8 F@@ or I am God , and mine ar@@ m is not a sh@@ or@@ ten@@ ed ; and I will sh@@ ow b m@@ ir@@ ac@@ l@@ es , c si@@ g@@ n@@ s , and w@@ on@@ d@@ ers , unto all those who d b@@ eli@@ e@@ ve on my name . 
+9 And wh@@ os@@ o shall as@@ k it in my name in a fa@@ ith , they shall b c@@ ast out c dev@@ il@@ s ; they shall he@@ al the d s@@ ick ; they shall c@@ ause the bl@@ in@@ d to receive their e si@@ ght , and the de@@ a@@ f to he@@ ar , and the d@@ um@@ b to spea@@ k , and the l@@ ame to w@@ al@@ k . +11 B@@ ut a without fa@@ ith shall not an@@ y@@ thing be sh@@ own for@@ th exc@@ ep@@ t b des@@ ol@@ ations upon c B@@ ab@@ y@@ l@@ on , the same which has made d all n@@ ations d@@ r@@ in@@ k of the w@@ ine of the wr@@ ath of h@@ er e for@@ n@@ ication . +1@@ 2 And there are a n@@ one that do@@ eth good exc@@ ep@@ t those who are read@@ y to b receive the ful@@ ness of my g@@ os@@ pe@@ l , which I have s@@ ent for@@ th unto this gener@@ ation . +1@@ 4 And their ar@@ m shall be my ar@@ m , and I will be their a sh@@ i@@ el@@ d and their bu@@ c@@ k@@ l@@ er ; and I will g@@ ir@@ d up their lo@@ ins , and they shall f@@ ig@@ ht man@@ ful@@ ly for me ; and their b en@@ em@@ ies shall be un@@ der their fe@@ et ; and I will let c f@@ all the d s@@ wor@@ d in their be@@ h@@ al@@ f , and by the e fir@@ e of mine in@@ di@@ g@@ n@@ ation will I pres@@ er@@ ve them . +1@@ 6 And they shall lear@@ n the par@@ able of the a fi@@ g -@@ t@@ ree , for even now al@@ read@@ y su@@ m@@ m@@ er is n@@ ig@@ h . +1@@ 8 And I have given unto him the a ke@@ y@@ s of the m@@ yst@@ ery of those things which have been b se@@ al@@ ed , even things which were from the c f@@ oun@@ d@@ ation of the world , and the things which shall come from this time un@@ ti@@ l the time of my com@@ ing , if he d ab@@ i@@ de in me , and if not , e an@@ other will I pl@@ ant in his st@@ e@@ ad . +19 Wherefore , wat@@ ch over him that his fa@@ ith fa@@ il not , and it shall be given by the a Com@@ for@@ ter , the b H@@ ol@@ y G@@ h@@ ost , that know@@ eth all things . 
+2@@ 1 F@@ or they will he@@ ar my a vo@@ ice , and shall b see me , and shall not be c as@@ le@@ ep , and shall d ab@@ i@@ de the day of my e com@@ ing ; for they shall be f pu@@ ri@@ fied , even as I am pur@@ e . +2@@ 2 And now I say unto you , a t@@ ar@@ r@@ y with him , and he shall j@@ our@@ ney with you ; for@@ sa@@ ke him not , and su@@ re@@ ly these things shall be ful@@ fil@@ l@@ ed . +2@@ 3 And a in@@ as@@ much as ye do not wr@@ ite , behold , it shall be b given unto him to pro@@ ph@@ es@@ y ; and thou shalt pre@@ ach my g@@ os@@ pe@@ l and c@@ all on c the hol@@ y pro@@ ph@@ ets to pro@@ ve his words , as they shall be given him . +2@@ 5 And a I@@ s@@ ra@@ el shall be b s@@ av@@ ed in mine own d@@ ue time ; and by the c ke@@ y@@ s which I have given shall they be l@@ ed , and no more be conf@@ oun@@ d@@ ed at all . +2@@ 6 a L@@ i@@ ft up your hear@@ ts and be gl@@ ad , your b re@@ d@@ em@@ p@@ tion dra@@ w@@ eth n@@ ig@@ h . +2@@ 7 F@@ e@@ ar not , l@@ it@@ tl@@ e a f@@ loc@@ k , the b k@@ ing@@ dom is y@@ our@@ s un@@ ti@@ l I come . +rev@@ el@@ ation given through J@@ os@@ ep@@ h S@@ m@@ ith the P@@ ro@@ ph@@ et , on the b@@ an@@ k of the M@@ is@@ s@@ our@@ i R@@ i@@ ver , M@@ c@@ I@@ l@@ wa@@ ine 's B@@ end , A@@ ug@@ ust 1@@ 2 , 1@@ 8@@ 3@@ 1 . H@@ C 1 : 2@@ 0@@ 2 @-@ 2@@ 0@@ 5 . +on their re@@ tur@@ n tri@@ p to K@@ ir@@ t@@ land the P@@ ro@@ ph@@ et and ten el@@ d@@ ers had tr@@ av@@ el@@ ed down the M@@ is@@ s@@ our@@ i R@@ i@@ ver in c@@ an@@ o@@ es . +on the th@@ ir@@ d day of the j@@ our@@ ney many d@@ ang@@ ers were ex@@ per@@ i@@ enc@@ ed . +el@@ der W@@ illi@@ am W@@ . P@@ hel@@ p@@ s , in d@@ ay@@ l@@ ig@@ ht vis@@ i@@ on , sa@@ w the de@@ st@@ ro@@ y@@ er ri@@ ding in power upon the face of the waters . 
+1 @-@ 1@@ 2 , T@@ he Lord has dec@@ re@@ ed many de@@ struc@@ tions upon the waters ; 1@@ 3 @-@ 2@@ 2 , T@@ he waters were cur@@ sed by J@@ o@@ h@@ n , and the de@@ st@@ ro@@ y@@ er ri@@ det@@ h upon their face ; 2@@ 3 @-@ 2@@ 9 , S@@ om@@ e have power to comm@@ and the waters ; 3@@ 0 @-@ 3@@ 5 , E@@ l@@ d@@ ers are to j@@ our@@ ney two by two and pre@@ ach the g@@ os@@ pe@@ l ; 3@@ 6 @-@ 3@@ 9 , T@@ he@@ y are to pre@@ p@@ are for the com@@ ing of the S@@ on of M@@ an . +1 B@@ e@@ hold , and hear@@ ken unto the vo@@ ice of him who has all a power , who is from ever@@ la@@ sting to ever@@ la@@ sting , even b Al@@ ph@@ a and O@@ me@@ g@@ a , the begin@@ ning and the end . +3 B@@ ut ver@@ il@@ y I say unto you , that it is not ne@@ ed@@ ful for this wh@@ ol@@ e com@@ p@@ any of mine el@@ d@@ ers to be mo@@ ving s@@ wi@@ ft@@ ly upon the waters , wh@@ il@@ st the in@@ ha@@ b@@ it@@ an@@ ts on ei@@ ther si@@ de are per@@ ish@@ ing in un@@ b@@ eli@@ e@@ f . +5 F@@ or I , the Lord , have dec@@ re@@ ed in mine ang@@ er many de@@ struc@@ tions upon the waters ; yea , and es@@ p@@ eci@@ ally upon these waters . +6 N@@ ever@@ th@@ el@@ ess , all fl@@ es@@ h is in mine h@@ and , and he that is fa@@ i@@ th@@ ful among you shall not a per@@ ish by the waters . +7 Wherefore , it is ex@@ pe@@ di@@ ent that my serv@@ ant S@@ id@@ ney G@@ il@@ b@@ er@@ t and my serv@@ ant a W@@ illi@@ am W@@ . P@@ hel@@ p@@ s be in ha@@ st@@ e upon their er@@ r@@ and and mis@@ sion . +9 B@@ ut now , ver@@ il@@ y I say , it be@@ ho@@ o@@ v@@ eth me that ye should part . +1@@ 0 And in@@ as@@ much as they are a fa@@ i@@ th@@ ful they shall be pres@@ er@@ ved , and I , the Lord , will be b with them . +11 And let the re@@ sid@@ ue take that which is ne@@ ed@@ ful for c@@ lo@@ thing . +1@@ 2 L@@ et my serv@@ ant S@@ id@@ ney G@@ il@@ b@@ er@@ t take that which is not ne@@ ed@@ ful with him , as you shall ag@@ ree . 
+1@@ 3 And now , behold , for your a good I g@@ a@@ ve unto you a b command@@ ment con@@ c@@ er@@ ning these things ; and I , the Lord , will re@@ as@@ on with you as with men in days of old . +1@@ 4 B@@ e@@ hold , I , the Lord , in the begin@@ ning bless@@ ed the a waters ; but in the last days , by the m@@ ou@@ th of my serv@@ ant J@@ o@@ h@@ n , I b cur@@ sed the waters . +15 Wherefore , the days will come that no fl@@ es@@ h shall be sa@@ fe upon the waters . +1@@ 6 And it shall be said in days to come that n@@ one is able to go up to the land of Z@@ i@@ on upon the waters , but he that is up@@ right in hear@@ t . +1@@ 7 And , as I , the Lord , in the begin@@ ning a cur@@ sed the land , even so in the last days have I b bless@@ ed it , in its time , for the use of my sa@@ in@@ ts , that they may par@@ take the f@@ at@@ ness th@@ ere@@ of . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . diff --git a/subword-nmt/subword_nmt/tests/data/corpus.en b/subword-nmt/subword_nmt/tests/data/corpus.en new file mode 100644 index 0000000000000000000000000000000000000000..2c1f08d50088d5610e216d2ffcaa7497339cffe0 --- /dev/null +++ b/subword-nmt/subword_nmt/tests/data/corpus.en @@ -0,0 +1,1015 @@ +iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould edges ( corners ) of the steel ingot mould . 
+iron cement protects the ingot against the hot , abrasive steel casting process . +a fire restant repair cement for fire places , ovens , open fireplaces etc . +construction and repair of highways and ... +an announcement must be commercial character . +goods and services advancement through the P.O.Box system is NOT ALLOWED . +deliveries ( spam ) and other improper information deleted . +translator Internet is a Toolbar for MS Internet Explorer . +it allows you to translate in real time any web pasge from one language to another . +you only have to select languages and TI does all the work for you ! automatic dictionary updates .... +this software is written in order to increase your English keyboard typing speed , through teaching the basics of how to put your hand on to the keyboard and give some training examples . +each lesson teaches some extra keys , and there is also a practice , if it is chosen , one can practice the previous keys learned through the previous lessons . the words chosen in the practice are mostly meaningful and relates to the tough keys ... +are you one of millions out there who are trying to learn foreign language , but never have enough time ? +get VTeacher , a screensaver that displays words and phrases you are trying to learn and their translation . +launch it during your office break and add new words to your vocabulary while sipping hot coffee and chewing a sandwich . +this is a one time charge and you will never be rebilled ! +you will receive direct access to a registration code automatically after you place your order . the entire process of registration and cleaning your system should take less than 5 minutes . +you will also receive a confirmation email with your order information ( registration code , order # , etc ) ... +the English @-@ German Pro Dictionary contains over 50,813 words and 23,343 articles presented in rich @-@ text format articles . 
+the dictionary is a wonderful bilingual reference tool and can be used both by beginners and advanced learners ... +the MSDict English @-@ Spanish Pro Dictionary contains over 38,000 entries in 19,800 word articles , presented in rich @-@ text format . +the dictionary is a wonderful bilingual reference tool and can be used both by beginners and advanced learners of English and Spanish ... +pocket Oxford English Dictionary First Published in 1924 This is a reissue of the ninth edition of the world 's longest @-@ established pocket English dictionary by Oxford University Press . +it is one of the new generation Oxford dictionaries derived from the database of the highly acclaimed New Oxford Dictionary of English and is particularly user friendly with its elegant open design , with different elements starting on new lines ... +WordBanker is a unique and fun method of helping you to learn a foreign language . +rather than bog you down with complicated grammar it deals only with building a vocabulary . +trouble memorising new words or phrases ? +WordBanker 's " Visual Clue " method of testing means you learn without even realising it . +can be used by French people learning English or English people learning English ... +the English Pro Dictionary for Series 60 Smartphones is an extensive dictionary and thesaurus with over 90,000 words , geared to the needs of a wide range of users- from the student at intermediate level and above to the enthusiastic tourist , or native English speaking business professional ... +WordBanker is a unique and fun method of helping you to learn a foreign language . rather than bog you down with complicated grammar it deals only with building a vocabulary . +trouble memorising new words or phrases ? WordBanker 's " Visual Clue " method of testing means you learn without even realising it . +can be used by Italian people learning English or English people learning Italian ... 
+this line of LingvoSoft English Albanian Dictionaries for Windows brings you accurate and prompt two @-@ way word translations , wrapped in a user @-@ friendly interface with convenient search options . +they are built on updated linguistic databases and come in four versions designed for different types of users . LingvoSoft Dictionary Basic English Albanian is a compact , fast and easy to use dictionary ... +this screen saver displays beautiful paintings of quaint English cottages . +it makes a wonderful gift for a family member or friend . +it features a very stable screen saver engine and several different user @-@ definable features . the user interface is attractive , intuitive , and easy to use . +the Windows 95 / 98 / NT Desktop Properties dialog displays a live preview of the screen saver ... +the English Pro Dictionary for Pocket MSDict Viewer is an extensive dictionary and treasures with over 90,000 words , geared to the needs of a wide range of users- from the student at intermediate level and above to the enthusiastic tourist , or native English speaking business professional ... +pocket Oxford English Dictionary First Published in 1924 This is a reissue of the ninth edition of the world 's longest @-@ established pocket English dictionary by Oxford University Press . +it is one of the new generation Oxford dictionaries derived from the database of the highly acclaimed New Oxford Dictionary of English and is particularly user friendly with its elegant open design , with different elements starting on new lines ... +pocket Oxford English Dictionary First Published in 1924 This is a Pocket PC reissue of the ninth edition of the world 's longest @-@ established pocket English dictionary by Oxford University Press ... +the English Phrases Dictionary for Pocket MSDict Viewer includes variety of phrases , collocations and common idioms . +the database provides 11,107 definitions and over 9,800 phrases . 
+the dictionary works as an add @-@ on file for Pocket MSDict Viewer and is fully compatible with all the useful functionalities of the viewer . the phrases are chosen among most commonly used American and British English collocations and phrases ... +featuring 65,000 entries , this is a fast and efficient application , which will provide you with study material and immediate practical help when faced with a communication challenge requiring an instant response ... +the English @-@ German Pro Dictionary contains over 50,813 words and 23,343 articles presented in rich @-@ text format articles . +the dictionary is a wonderful bilingual reference tool and can be used both by beginners and advanced learners ... +" the potential productivity gains should be considerable . +today , QuarkXPress ® 8 has tighter integration with Photoshop ® and Illustrator ® than ever before , and through standards like HTML and CSS , QuarkXPress users can publish across media both independently and alongside Adobe ® Creative Suite ® applications like Adobe Flash ® ( SWF ) and Adobe Dreamweaver ® . +here , you 'll find out how Creative Suite users can get the best possible interaction with QuarkXPress . +you 'll be surprised how easy Quark has made it to unlock the full potential of all your design software . +QuarkXPress 8 is considered by many to have the best integration with Photoshop 's PSD file format of any layout tool available today . +in this section we 'll explain when you should use the PSD format for your images and how to get the most out of them . +for example , you may have multiple layers in your PSD with different product shots , which will vary from publication to publication . +if you use PSD , you can switch those layers on or off in QuarkXPress without having to save a separate TIFF for each publication . +another question that might tip you in favor of PSD is , " Do I need to use a spot color with this image ? 
+" using spot colors in most image formats is often complicated . +however , because of the way QuarkXPress supports PSD channels , it 's simpler and more flexible . +bringing the PSD files into QuarkXPress is the same as any other image . create a Box and then use File > Import ... or simply drag and drop the image from your desktop , Finder or an application like Adobe Bridge ® with or without creating a box first . +to access the special features of PSD , open the PSD Import palette . ( window > PSD Import ) You 'll instantly see thumbnails of the layers along with their names . +for example , you 've created a layer in Photoshop to give your image an antiqued look , but when you put it in your layout it seems swamped by the surrounding colors . one option might be to reduce the opacity of that layer by clicking on it and entering a new opacity level . +if you want to add an extra ink or plate to your images , you can set up a channel to do that in Photoshop . for example , maybe you intend to varnish part of an image , or you want to use a spot color within your image . +QuarkXPress can re ­ map any channel right in the PSD Import palette - non ­ destructively . +so click on the channels divider of the PSD Import Palette ; double click on the channel in question , and you can pick any color from your project 's color palette , ensuring consistency . +as powerful as the PSD support of QuarkXPress 8 is , it can 't manipulate certain kinds of layers , such as layer effects [ e.g. +if you use one of these layers , the image will import and print just fine , but you won 't get access to the layer controls of the PSD Import palette . +if you need that functionality , you can eliminate those kinds of layers from your PSD by converting the layer effects to stand @-@ alone layers or ' smart objects ' [ right click on the Layer in the Photoshop layers palette ] . 
+QuarkXPress supports text layers , most adjustment layers , and even 3D layers including opacity and blending mode controls . +Illustrator is a great tool for creating logos and vector illustrations . +traditionally , the route into QuarkXPress has been to export an EPS from Illustrator . +now , things are made much easier with the arrival of direct Illustrator .ai file import into QuarkXPress 8 . +simply drag and drop or import your Illustrator native file into your layout just as you would any graphics file format . +QuarkXPress has a powerful transparency engine , but it doesn 't support partially transparent objects in PDF or .ai files yet . +so if you are using something like a drop shadow in Illustrator and plan to place that over a non @-@ white background or object , you might see undesirable results . +the good news is that the drop shadow and transparency features of QuarkXPress works on imported Illustrator files , so you can apply a drop shadow or change the opacity of your .ai file right in your layout instead . +QuarkXPress is well equipped for drawing tasks and in our newest release we have standardised many of our graphics tools to work more like Illustrator , Freehand and similar tools . +watch a video on how to create compelling illustrations in QuarkXPress 8 . +page @-@ layout professionals can create rich Flash projects - without compromising design - using the built @-@ in Flash authoring capabilities included in every edition of QuarkXPress 8 . +working in the same familiar print environment of QuarkXPress 8 , you can take existing print jobs to Flash , or create new Flash projects , in minutes - no additional purchase or coding required ! +watch a video on how to create sophisticated Flash designs in QuarkXPress 8 . +* InDesign CS 4 : interactive elements such as hyperlinks , page transitions , and button actions are not included in the XFL file . 
+disclaimer : this document is based on publicly available information and not based on hands @-@ on software evaluation . +its content may be revised at any time . +Quark Inc. accepts no responsibility legal or otherwise for the accuracy of this content . +QuarkXPress includes Web layouts that can create menus and hyperlinks , convert print graphics or fancy text treatments to Web graphics , and then write a standards @-@ based HTML file with CSS that can be opened directly in Adobe Dreamweaver . +QuarkXPress 8 can import PDF files up to version 1.7 ( the default PDF version from the Creative Suite applications when using the Press Quality PDF setting is PDF 1.4 . +" once you add a variety of productivity gains , it becomes clear that ... they can represent thousands of dollars of return on investment over the course of a year . +© 2009 Quark Inc. and Quark Media House Sàrl , Switzerland . +arbitration is a form of alternative dispute resolution - specifically , a legal alternative to litigation whereby the parties to a dispute agree to submit their respective positions ( through agreement or hearing ) to a neutral third party ( the arbitrator ( s ) or arbiter ( s ) ) for resolution . +mediation is a process of alternative dispute resolution in which a neutral third party , the mediator , assists two or more parties in order to help them negotiate an agreement on a matter of common interest . +company Law regulates company formations , directors ' duties and shareholder agreements and the interpretation of relevant statutory or other law . +as a member of the European Union Hungary continues to demonstrate economic growth . +many embassies and transnational companies located in the capital bring many expatriate foreigners and their families to town , creating demand for private and international schools . +established in 1990 , the office of Haidegger & Partner in Budapest has been providing a full range of legal services offering individual tailored advice . 
+apart from being Hungary 's principal political , commercial , industrial and transportation centre , the city of Budapest boasts sites , monuments and spas of worldwide renown . +this statistic is based on the 6819 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 19 Oct 2007 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6328 free Open ICEcat users . +Acer projector accessories can optimize your Acer X1160 / X1260 projectors and expand the usage and mobility of your product . +this statistic is based on the 6819 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 07 Sep 2005 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6328 free Open ICEcat users . +this statistic is based on the 6821 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 08 Jun 2006 . +this statistic is based on the 6815 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 14 Sep 2007 . +AKVIS Sketch converts digital photos to amazing pencil sketches and watercolor drawings . +convert your photo into an oil painting with AKVIS ArtWork ! +AKVIS Chameleon is a fun to use tool for photo collage creation . +AKVIS Enhancer is an image enhancement program for revealing details on a picture . +Enhancer allows detecting details from underexposed , overexposed and mid tone areas of a photo . +SmartMask is an efficient masking tool that saves you time on complex selections and is fun to use . +AKVIS Coloriage allows colorizing B & W photos and replacing colors in color photos . 
+AKVIS Noise Buster is software for noise suppression on digital and scanned images . +ArtSuite is an impressive collection of versatile effects for decoration of photos . +there are two effect groups : photo frames which let you generate a frame for your photo , and artistic effects which allow experimenting with photos , for example , converting a photo into a B & W image ; replacing colors on an image , adding a texture , etc . +AKVIS Magnifier allows resizing images without loss in quality . +AKVIS Retoucher Plug @-@ in is an efficient program for photo restoration and photo retouching . +Retoucher removes dust , scratches , stains and other defects that appear on damaged photos . it reconstructs the lacking parts of the photo using the information of the sourrounding areas . +AKVIS ArtSuite is a collection of effects for photo decoration . +version 5.0 offers a new interface and improves compatibility on Macintosh . +the standalone version presents new beautiful Hand Painted Frames designed by artists . +the person in the image has tattoos all around the chest area . I 've tried so many times to cover the tattoos and couldn 't . +I downloaded a trial of Retoucher to see how it would work . one pass over the area with Retoucher and the tattoos were gone . +I was absolutely shocked at how fast and easy Akvis Retoucher made it . you would never know the young lady in the image has a tattoo . +there 's often too much design around , but never enough good design . +this insight prompted the launching in 1983 of the Design Plus competition by Messe Frankfurt in cooperation with the " Rat für Formgebung " ( Design Council ) and the Deutschen Industrie- und Handelskammertag / Association of German Chambers of Industry and Commerce ( DIHK ) . 
+the Design Plus award , presented by a distinguished jury in conjunction with the Ambiente international consumer goods trade fair in Frankfurt , stands for product design which is not merely an end in itself but instead very much exemplary and trend @-@ setting . +holiday apartments | Hotels | Hostels | Camping , Dormis & Bungalows | Last Minute Offers ! +barbecue and Mate Tea are Argentinian inventions with a social function . +many Argentinian families have in their own grill in the backyard of their homes ( called parrillas ) , where they meet on the weekends for a meal together with friends and relatives . +the beef here has different cuts to that of Europe . +the " baby bife " is approximately 1 / 2kg of steak , which could nourish a whole family . +the " bife de chorizo " is a gigantic rump steak , the " bife de lomo " a filet . +" asado de tira " is the part of the ribs , the " bife de costilla " is a T @-@ bone Steak , the " matambre " is a very thin cut piece of meat with lots of fat . +like in Europe , meat is prepared rare , mid rare , done , well done etc . +the sauce which comes with the meat dish is called " chimichurri . " however , because of the excellent taste of the meat , it can well be eaten without sauce . +a full plate of mixed grill contains practically all animal innards from various animals . +the Sausage ( chorizo ) and the black pudding ( morcilla ) are very popular . +served with the meats are vegetables ( sweet potatoes , pumpkin puree ) and mixed salads . +in a restaurant it is of course possible to order other parts of meats . +depending on the season and region , it is possible to eat in Argentina very good food and meat such as deer , lamb , sheep , goat , fish , and other seafood . +special delicacies are : King prawns ( centolla ) from Tierra de Fuego and wild boar ( jabalí ) or trout in Bariloche . +special dishes , going back as far as colonial times are : " pastel de choclo " or the " humita . 
" +more typical are chicken " pollo " in its most varied prepartions , " milanesa " ( Viennese escalope ) , " empanadas " ( fried pastry shells filled with chopped beef , chicken , maize or cheese and ham ) or " choripan " similar to a Hot Dog . +many Italian restaurants are found here which serve pizza and pasta . +next to fresh fruit salads and good ice cream , a wide range of desserts are offered . +two Argentine specialities are the " dulce de leche , " milk @-@ caramel marmalade which is also used to as a bread spread and the " alfajores " ( small cake @-@ formed biscuits with different fillings ) . +apartment for holiday lets in Cadiz . good condition , 2 ... +if you are searching for places to stay on the Costa del ... +apartment in Sierra Nevada , close to the chairlifts . ... +studio apartments | Two Bedroom Apartments | Three or More Bedroom Apartments | Holiday Houses | Hotels | Hostels | Last Minute Offers ! +Barcelona , Spain is a city located at the northeast side of the Iberian Peninsula , in the heart of Catalonia and bordered by the Mediterranean Sea to the east . +the city of Tarragona lies south of it , Lleida to the west , and Girona to the north . in this section you will find out how to get there by many transportation means . +to get to Barcelona by car from Madrid , you should take the Nacional II motorway . +from the coast , take the A @-@ 7 or the Autovía del Mediterráneo . Barcelona is about 600 kilometers from Madrid , at 350 from Valencia , 300 from Zaragoza , and at 1000 from Malaga . +from other European cities , Paris is 1,200 kilometers away from Barcelona , 1,098 from Berna , and 1,600 from London . +there are bus routes from almost every city in Spain to Barcelona and as well from most major European cities . +this is a number to call to get up to date information on the status of traffic in Barcelona Spain . +phone number for Prat Airport in Barcelona , Spain . +this great lodging in Cala Ratjada , Mallorca is located in ... 
+there are many Cadiz apartments for rent around the city . ... +bookmark Manager ( BKM for short ) is an open source tool for managing your bookmarks . +you 'll find everything about this tool on this Web site . current news , new versions and other information . +BKM consists of two projects : vbBKM and pyBKM . +vbBKM is a program developed in Visual Basic which works on the Microsoft Windows platform . +pyBKM is a tool developed in Python running on multiple platforms . +vbBKM is a stable product and has as many features as any other bookmark management software available . +but pyBKM is at very early stages and is not ready for production use , yet . +see the two project pages ( vbBKM and pyBKM ) for details . +holiday houses | Hotels | Guesthouses ( B & B ) | Hostels | Campsites | Top ten things to do and see | Last Minute Offers ! +apartment to rent in Nerja , Andalusia . spacious and ... +cosy apartment in the ski station of SIerra Nevada , ... +there are many Cadiz apartments for rent around the city . ... +Benalmadena studio apartment in a residential development ... +this charter firm based in Cadiz offers boats for hire from ... +apartments | Holiday houses | Hostels | Charter / Rent boats | Hotels | Last Minute Offers ! +Cadiz , Spain is fascinating ; it 's the oldest city in Europe . +in the winter you can enjoy the old -su @-@ interior @-@ pueblos @-@ blancos , town , and in the summer it 's ... +Apartamentos , chalets , hoteles , casas rurales , and lodging of all kinds . +Costa de la Luz is the ideal coast to charter . +Cadiz , Spain is the most important city in the province . +do you want to work for costasur . +to eat in Cadiz , Spain in the region of Andalusia is easy . +Cadiz , Spain is fascinating ; it 's the oldest city in Europe . +almost entirely surrounded by water , the city appears isolated . it stands on a peninsula jutting out into the bay , dramatically defining the surrounding landscape . 
+this ancient city on the Costa de la Luz in Andalusia is approximately 3,000 years old . the Phoenicians , Carthaginians , and Romans have all settlled here at one point over the years . +next were the Carthaginians and then the Romans who turned it into a thriving port . during the 17th century , trading between Cádiz and the rest of the world increased dramatically ; the economy boomed . +Cadiz on the other hand , is very relaxed and easy @-@ going . even at night , you 'd feel safe walking around the city . +never more than a few blocks away from the coastline . +the narrow and cobbled streets open out onto pretty little squares . people sit outside in cafes all day long enjoying the heat , and gazing up at the Moorish architechture . +Cadiz ( or little Havana ) as it 's often called , has strong relations with Cuba . +there has been a continuous flow of traffic between the two cites over the years . +the two cities even look very simililar . Cuba n scenes from the latest James Bond film ( Die another day ) were shot here in Cadiz . +you can walk around the old -su @-@ interior @-@ pueblos @-@ blancos , town in about an hour . +there are also some lovely parks you can visit with spectacular views out to the Bay . +if you are looking to rent apartments in Cadiz , on the Costa de la Luz of Spain , this accommodation is located on the beach promenade of the city of Cadiz , right near the beach . +there are many Cadiz apartments for rent around the city . +this particular one is fully furnished and equipped for up to 4 people . +if you 're looking for Cadiz flats in the southern region of Andalusia , Spain than we have the perfect one for you . +this home is fully furnished and equipped for your comfort . +Cadiz holiday rentals for let in Andalusia , Spain . +this lovely apartment is located in the new area of Cadiz , near the hospital . 
+this 2 @-@ star hotel is located in a tranquil area , half @-@ way between the entrance of the city of Cadiz and it 's old city center . +the hotel is well @-@ situated on one of the city ? s main boulevards , just 100 metres from the well @-@ known blue flag Victoria beach and close to the conference centre . +this is a brand new hotel in Cadiz ? historical city centre , right by the best shopping centres and historical areas . +located in the very heart of the thousand @-@ year @-@ old city of Cadiz and a few minutes away from the Santa Maria del Mar beach and Conference Centre . +the Hotel Hospederia Las Cortes de Cádiz is found in the historic centre of Cadiz , in a landmark building typical of local architecture of the 19th century . +the Hostel Mirador in Vejer de la Frontera is located in a tranquil area of this town in the province of Cadis . +it 's guests have 15 double rooms and 4 triple rooms available to them . +this hostel is located in one of the old town squares in Rota . +a perfect place to enjoy the center of town and the Costilla beach which is only 5 minutes walking distance . +Cadiz Mountains vacation homes for rent in Spain . +this holiday rental is situated in the pueblo blanco called Alcala de los Gazules and is situated in the Cadiz Mountains . +this Tarifa holiday letting in Spain has one bedroom , and is a nice choice for your vacation in Tarifa . +if you 're looking for accomodation in Cadiz , Spain this home for rent in Algeciras is a great choice . +with a capacity for 2 @-@ 3 people , this Spanish villa for let Canos de Meca is in the Cadiz Province of Spain . +if you are searching for places to stay on the Costa del ... +beautiful apartment in Chiclana , Cadiz for rent . +apartment for holiday lets in Cadiz . good condition , 2 ... +apartments | Rural Homes | Hotels | Hostels | Houses | Aparthotels | Yacht Charter | Last Minute Offers ! +the north east corner of the island has four championship standard golf courses . 
+they 're located at Canyamel , Capdepera , Pula @-@ Golf and Son Servera . +there 's an organistation on the island called the Mallorca Golf Connection , who can organise all you golfing needs - tee @-@ off times , transport to and from the courses , discounted green fees etc . they are the official TUI Thomson golf booking agency for Mallorca . +this holiday lettings in Majorca Spain , is located in Sa ... +apartments for rent in Cala Ratjada , Majorca . this ... +self catering holiday in nice apartment of 90sqm in Cala ... +Cala Ratjada is located on a small rocky peninsula on the extreme north east corner of Mallorca , ... +here you will find all kinds of Cala Ratjada accommodation and lodging , places to stay from top ... +there are plenty of things to do and see in Ratjada . +Cala ratajada 's main town beach is called " Son Moll . " +Cala Ratjada is a fantastic place to practise outdoor sports . +it goes without saying that eating and dirinking are important holiday pasttimes while you are ... +in Cala Ratjada you are sure to find a nightclub you like . +Charter a yacht or boat in Cala Ratjada for your holidays , it will be a decision you will not ... +there are many questions you may need answers to when planning your holidays in the beautiful ... +an easy way to get an overall look at Cala Ratjada is with panoramic photos . +going shopping in Cala Ratjada is a great option . +Cala Ratjada offers some very lively nightlife with plenty of bars , clubs and discos staying open well into the early hours . +one of the most popular discos is called " physical . " it 's popular with both locals and holiday makers . +they play a wide variety of music and regularly hold foam parties and lazer shows . +I hereby acknowledge having full knowledge of the booking terms . +complete guide of 3252 campsites in France . for each camp @-@ site containing the classification , the services , the hirings , the lodging and the situation . 
+our multicriterion research assistant allows you to refine your research while answering simple questions . +at the heart of the matter is the issue of trust : trust in institutions , in counterparties , in the market , not least , in information . +the investment strategy of Credit Suisse Fund ( Lux ) Global Responsible Equities focuses on appropriate capital growth in combination with environmental and social considerations . +copyright © 1997 - 2009 CREDIT SUISSE GROUP and / or its affiliates . +reads an entire file into an array . +note : context support was added with PHP 5.0.0 . for a description of contexts , refer to Stream Functions . +note : if PHP is not properly recognizing the line endings when reading files either on or created by a Macintosh computer , enabling the auto _ detect _ line _ endings run @-@ time configuration option may help resolve the problem . +/ / Get a file into an array . +/ / Another example , let 's get a web page into a string . see also file _ get _ contents ( ) . +when using SSL , Microsoft IIS will violate the protocol by closing the connection without sending a close _ notify indicator . PHP will report this as " SSL : fatal Protocol Error " when you reach the end of the data . +to work around this , the value of error _ reporting should be lowered to a level that does not include warnings . +PHP 4.3.7 and higher can detect buggy IIS server software when you open the stream using the https : / / wrapper and will suppress the warning . when using fsockopen ( ) to create an ssl : / / socket , the developer is responsible for detecting and suppressing this warning . +UpdateStar lets you stay up to date and secure with the personal software installations on your PC . +the Free Edition provides basic protection against outdated major versions of your software with limited features . +UpdateStar Premium delivers 20 times more updates . +it delivers all minor and major updates for your software setup . 
the Premium saves you so much time searching for all the newly available updates every day . +go Premium risk @-@ free with our unconditional 30 day money back guarantee and let the Premium Edition thoroughly deliver all of your PC 's updates . +UpdateStar Premium is available stand @-@ alone and bundled with other world @-@ class software products from our promotions webpage . corporate users use our volume licensing options . +the UpdateStar client offers access to our comprehensive database with more than 259,000 software recognized products . as our database is user @-@ driven , safe and mantained by the users it is constantly growing and currently the most complete software database around . +UpdateStar Premium delivers 20 times more updates and upgrades , whereas the Free only delivers major updates . +premium adds important features such as complete software maintenance , security advisory , frequent minor upgrade versions , exports and imports , scheduling and more . +go Premium for only $ 29.95 and stop missing all of your PC 's software updates . +we offer our customers a 100 % customer satisfaction or money @-@ back @-@ guarantee . +UpdateStar Premium can be licensed stand @-@ alone , but also comes bundled with world @-@ class software products . +for more information , please visit our promotions webpage . +UpdateStar transactions are processed via our ecommerce partner cleverbridge . +our store supports a variety of payment options including credit cards , cheques , and PayPal . +all transaction communication is encrypted and stored securely . +please make sure to check our current UpdateStar Suite promotional offers to get the best deal for you . +soon after you have requested your Premium trial license or purchased your Premium license you will receive an email containing installation instructions and your personal License Key to register UpdateStar . +if you are having problems with your order you can contact cleverbridge customer service . 
+it has been tested on Windows 2000 , Windows XP , and Windows Vista . +simply double @-@ click the downloaded file to install it . +UpdateStar Free and UpdateStar Premium are included come with the same installer . +UpdateStar includes support for many languages such as English , German , French , Italian , Hungarian , Russian and many more . +you can choose your language settings from within the program . +you can purchase your UpdateStar Premium upgrade license ( 1 year license ) for only $ 19.95 directly form our online shop or choose your favorite UpdateStar promotional offer to get your favorite deal for your license renewal . +if you are upgrading from a previous version of UpdateStar , simply install the downloaded version - your licensing information will be retained and Premium features will be activated . +if you first uninstalled your current UpdateStar , you 'll need to re @-@ enter your license key information to access the Premium features . +apartments | Houses | Aparthotels | Hotels | Last Minute Offers ! +self catering apartment in Conil de la Frontera for 6 ... +this Puerto de Santa Maria beachfront flat in Cadiz , Spain ... +located in the Golf course " Urbanizacion Cigüeña VI , " ... +holiday chalet in Conil for 6 people in the area of El ... +Attribution - You must attribute the work in the manner specified by the author or licensor ( but not in any way that suggests that they endorse you or your use of the work ) . +the page you came from contained embedded licensing metadata , including how the creator wishes to be attributed for re @-@ use . +you can use the HTML here to cite the work . +doing so will also include metadata on your page so that others can find the original work as well . +Noncommercial - You may not use this work for commercial purposes . +waiver - Any of the above conditions can be waived if you get permission from the copyright holder . 
+in addition to the right of licensors to request removal of their name from the work when used in a derivative or collective they don 't like , copyright laws in most jurisdictions around the world ( with the notable exception of the US except in very limited circumstances ) grant creators " moral rights " which may provide some redress if a derivative work represents a " derogatory treatment " of the licensor 's work . +publicity rights allow individuals to control how their voice , image or likeness is used for commercial purposes in public . if a CC @-@ licensed work includes the voice or image of anyone other than the licensor , a user of the work may need to get permission from those individuals before using the work for commercial purposes . +it is simply a handy reference for understanding the Legal Code ( the full license ) - it is a human @-@ readable expression of some of its key terms . think of it as the user @-@ friendly interface to the Legal Code beneath . +Attribution - You must attribute the work in the manner specified by the author or licensor ( but not in any way that suggests that they endorse you or your use of the work ) . +the page you came from contained embedded licensing metadata , including how the creator wishes to be attributed for re @-@ use . +you can use the HTML here to cite the work . +doing so will also include metadata on your page so that others can find the original work as well . +share Alike - If you alter , transform , or build upon this work , you may distribute the resulting work only under the same , similar or a compatible license . +waiver - Any of the above conditions can be waived if you get permission from the copyright holder . +rights other persons may have either in the work itself or in how the work is used , such as publicity or privacy rights . +notice - For any reuse or distribution , you must make clear to others the license terms of this work . +the best way to do this is with a link to this web page . 
+CC licenses anticipate that a licensor may want to waive compliance with a specific condition , such as attribution . +all jurisdictions allow some limited uses of copyrighted material without permission . +CC licenses do not affect the rights of users under those copyright limitations and exceptions , such as fair use and fair dealing where applicable . +in addition to the right of licensors to request removal of their name from the work when used in a derivative or collective they don 't like , copyright laws in most jurisdictions around the world ( with the notable exception of the US except in very limited circumstances ) grant creators " moral rights " which may provide some redress if a derivative work represents a " derogatory treatment " of the licensor 's work . +publicity rights allow individuals to control how their voice , image or likeness is used for commercial purposes in public . +if a CC @-@ licensed work includes the voice or image of anyone other than the licensor , a user of the work may need to get permission from those individuals before using the work for commercial purposes . +it is simply a handy reference for understanding the Legal Code ( the full license ) - it is a human @-@ readable expression of some of its key terms . think of it as the user @-@ friendly interface to the Legal Code beneath . +this Deed itself has no legal value , and its contents do not appear in the actual license . +creative Commons is not a law firm and does not provide legal services . +distributing of , displaying of , or linking to this Commons Deed does not create an attorney @-@ client relationship . +this is a human @-@ readable summary of the Legal Code ( the full license ) . +use this license for your own work . +a new version of this license is available . +you should use it for new works , and you may want to relicense existing works under it . +no works are automatically put under the new license , however . +running at the Speed of Light ... 
+all other elements © 2009 DC Comics . +SOE and the SOE logo are registered trademarks of Sony Online Entertainment LLC . +“ PlayStation ” and “ PS ” Family logo are registered trademarks and “ PS3 ” is a trademark of Sony Computer Entertainment Inc . all other trademarks and trade names are the property of their respective owners . +the ratings icon is a registered trademark of the Entertainment Software Association . +DC UNIVERSE and all related characters and elements are trademarks of and © DC Comics . +and as we know , emotions are good for business . +in this section you will find information on our products and the licence conditions . +please select a product from the graphic on the left . +this section holds the most general questions about PHP : what it is and what it does . +can I run several versions of PHP at the same time ? +what are the differences between PHP 3 and PHP 4 ? +what are the differences between PHP 4 and PHP 5 ? +I think I found a bug ! who should I tell ? +much of its syntax is borrowed from C , Java and Perl with a couple of unique PHP @-@ specific features thrown in . +the goal of the language is to allow web developers to write dynamically generated pages quickly . +this confuses many people because the first word of the acronym is the acronym . +this type of acronym is called a recursive acronym . +for more information , the curious can visit " Free On @-@ Line Dictionary of Computing or the " Wikipedia entry on recursive acronyms . +PHP / FI 2.0 is an early and no longer supported version of PHP . +PHP 3 is the successor to PHP / FI 2.0 and is a lot nicer . +PHP 5 is the current generation of PHP , which uses the " Zend engine 2 which , among other things , offers many additional OOP features . +please see the " What 's new in PHP 4 overview for a detailed explanation of these features and more . +while PHP 5 was purposely designed to be as compatible as possible with previous versions , there are some significant changes . 
+for more detailed information , please view the section on Migrating from PHP 4 to PHP 5 and the section on Backwards Incompatible Changes . +you should go to the PHP Bug Database and make sure the bug isn 't a known bug . +if you don 't see it in the database , use the reporting form to report the bug . +it is important to use the bug database instead of just sending an email to one of the mailing lists because the bug will have a tracking number assigned and it will then be possible for you to go back later and check on the status of the bug . +tag and the correspondant HTTP content type . +note : note that JPC and JP2 are capable of having components with different bit depths . +in this case , the value for " bits " is the highest bit depth encountered . +also , JP2 files may contain multiple JPEG 2000 codestreams . +in this case , getimagesize ( ) returns the values for the first codestream it encounters in the root of the file . +note : the information about icons are retrieved from the icon with the highest bitrate . +it can reference a local file or ( configuration permitting ) a remote file using one of the supported streams . +this optional parameter allows you to extract some extended information from the image file . +currently , this will return the different JPG APP markers as an associative array . +some programs use these APP markers to embed text information in images . +a very common one is to embed " IPTC information in the APP13 marker . +you can use the iptcparse ( ) function to parse the binary APP13 marker into something readable . +returns an array with 7 elements . +index 0 and 1 contains respectively the width and the height of the image . +note : some formats may contain no image or may contain multiple images . +in these cases , getimagesize ( ) might not be able to properly determine the image size. getimagesize ( ) will return zero for width and height in these cases . 
+index 2 is one of the IMAGETYPE _ XXX constants indicating the type of the image . +index 3 is a text string with the correct height = " yyy " width = " xxx " string that can be used directly in an IMG tag . +channels will be 3 for RGB pictures and 4 for CMYK pictures . +support for JPC , JP2 , JPX , JB2 , XBM , and WBMP became available . +note : this function does not require the GD image library . +video for Windows AVI , Quicktime MOV , MPEG MPG , Windows Media Video WMV or ASF , etc . ) , you will find the getid3 library to be indispensible . +now every Depositfiles user can upload his file even more easier , safer and at any time ! +there is available a new version of Depositfiles Uploader 1.3.15 with multiupload function ! +it allows to upload up to 10 files simultaneously ! +- uploading process is both easy and pleasent ! +* In order not to loose your files we recommend to upload these into your account . +go to " Options " in the menu in the " Account " tag and type in your login and password ( if you don 't have an account with Depositfiles , you can register one here - this is totally free ) . +* The program automatically splits big files ( more then 100 Mb ) onto smaller parts with help of archiver so that you could add the files bigger then 100 Mb to the queue for upload . +to switch this option on , please indicate the route to the Winrar archiver on your computer . +to do this please check the box with " compress files bigger then ... " and indicate the route to the folder with archiver . +the archiver can be downloaded here . +export the links to the uploaded files. you can set up yourself the format of the links which is more convinient ! +- You can choose the way to export the links to uploaded files : either to copy to clipboard or to save it as text file . to do this , please go to " Options " in the menu and in the tag " Main " choose " clipboard " or " notepad . " +- You can choose the format of the links to uploaded files . 
+any link format @-@ choose any that 's convinient for you ! +if you want to upload a file bigger then 100Mb - just add this to the upload queue and the program will offer splitting the file into smaller files and then will add these to the upload queue ! +pay attention that for this function to function properly , you need to go to " Options , " choose submenu " Main " and indicate the route to the archiver program , mentioning the minimum size of the file and mark the option if you wish to activate it . +also please pay attention to such options as : minimize to tray , transparency of the program 's panel , option of starting and stopping uploading according to the settings you make , etc . +forums about San Juan de los Terreros - Almería . +forums about Conil de la Frontera - Cádiz . +forums about Horta - Isla Faial - Azores . +forums about El Mocanal - El Hierro . +forums about La Restinga - El Hierro . +forums about Ten - bel , urbanizacion - Tenerife . +forums about Playa son bou - Ibiza . +is a string object that can have zero or more values , each of which must be chosen from a list of allowed values specified when the table is created . +Member values in the table definition when a table is created . +column are displayed using the lettercase that was used in the column definition . +columns can be assigned a character set and collation . +for binary or case @-@ sensitive collations , lettercase is taken into account when assigning values to the column . +values numerically , with the low @-@ order bit of the stored value corresponding to the first set member . +value in a numeric context , the value retrieved has bits set corresponding to the set members that make up the column value . +column , the bits that are set in the binary representation of the number determine the set members in the column value . +element , it does not matter what order the elements are listed in when you insert the value . 
+it also does not matter how many times a given element is listed in the value . +when the value is retrieved later , each element in the value appears once , with elements listed according to the order in which they were specified at table creation time . +anywhere , even as a substring of another set member . +the first of these statements looks for values containing the first set member . +the second looks for an exact match . +be careful with comparisons of the second type . +you should specify the values in the same order they are listed in the column definition . +my startling comic Die zwei lustigen Raver with the two jolly ravers Acid & E has been published in the awesome book Shake Your Tree # 3 . +I was drawing this story with Microsoft Paintbrush . +examine the original drawings in full size ! +there is also some Hupel Pupel in the book ! +one bedroom Apartments and Studios | 2 Bedroom Apartments | 3 or more Bedrooms | Spanish Villas | Hotels | Hostels | Last Minute Offers ! +this is the main location for kitesurf in this part of Cadiz province . +beach with golden sands , located in the residential area next to the Castillo ( castle ) of San ... +this beach is situated next to the ruins of the Castillo de Santa Catalina , of savage aspect , ... +beach on the port of fine golden sands and semi @-@ urban charakter with excellent services and ... +the El Levante beach , also known as Los Toruños is a beach isolated from the urban centre of the ... +semi @-@ urban , very busy , large beach of golden sands . +this place has many visitors . here you can enjoy nautic sports , fishing , wind @-@ surf , kayak etc . +located in Chiclana de la Frontera , this holiday home for ... +located in Conil de la Frontera , Andalusia , this holiday ... +the Top 70 Finalists shortlist has been selected from the entries submitted to the eLearning Awards in 2008 . +if you are among the shortlisted project , you can include a logo ' TOP 70 ' on your website . 
+in the spirit of newness , let 's take a look at the only deck in the Top 8 of Turin that made use of the new Arena Grand Melee cards . +basically , to have true control over the tempo of the game is like being a policeman controlling traffic . +if you are able to successfully put up a stop sign for your opponent 's plans and / or consistently make a green light for yourself , you have achieved tempo advantage . +we just got a huge influx of new playables ( the new set is simply awesome ! +) , and everyone is scrambling to determine the best way to use them . +apartments | Hotels | Hostels | Campings | Thing to do | Last Minute Offers ! +holiday lettings available on the Costa del Sol of the ... +this flat for rent in Benalmadena , Malaga is located in the ... +the EuroLinux Alliance for a Free Information Infrastructure is an open coalition of commercial companies and non @-@ profit associations united to promote and protect a vigourous European Software Culture based on copyright , open standards , open competition and open source software such as Linux . +corporate members or sponsors of EuroLinux develop or sell software under free , semi @-@ free and non @-@ free licenses for operating systems such as GNU / Linux , MacOS or MS Windows . +for the last few years the European Patent Office ( EPO ) has , contrary to the letter and spirit of the existing law , granted more than 30000 patents on rules of organisation and calculation claimed in terms of general @-@ purpose computing equipment , called " programs for computers " in the law of 1973 and " computer @-@ implemented inventions " in EPO Newspeak since 2000 . +Europe 's patent movement is pressing to legitimate this practise by writing a new law . +although the patent movement has lost major battles in November 2000 and September 2003 , Europe 's programmers and citizens are still facing considerable risks . +here you find the basic documentation , starting from the latest news and a short overview . 
+the patent movement has during several decades won the support of large corporations and governments for its expansionist cause . +yet FFII , Eurolinux and others have devoted themselves to this work with considerable success . +still , we continue to have more tasks than free hands . +here we tell you how you can help us move forward more quickly . +a database of the monopolies on programming problems , which the European Patent Office has granted against the letter and spirit of the existing laws , and about which it is unsufficiently informing the public , delivering only chunks of graphical data hidden behind input masks . +the FFII software patent workgroup is trying to single out the software patents , make them better accessible and show their effects on software development . +during the last few years , the European Patent Office ( EPO ) has granted several 10000 patents on computer @-@ implemented rules of organisation and calculation , i.e. programs for computers [ as such ] . +we are systematically collecting these patents and republishing them in a more accessible form . +having been appointed as the exclusive distributor for the Sufix brand in the UK , Shimano is now able to offer one of the most comprehensive mono , braid & hooklink ranges in the market . +aged 29 and from West Bay , Dorset , Dave started fishing at the tender age of 5 and joined the West Bay Sea Angling Club when he was 14 , going on to win pretty much every trophy there was . +" Shimano are pleased to announce the signing of Alex Bones , to bolster our already impressive match line @-@ up . +Shimano is pleased to a announce the appointment of Darran Goulder to their consultant team . +Shimano is proud to announce availability of its new catalogues for 2009 . +Seon katõgoorian ommaq järgmädseq 19 lehekülge ( kokko 19 ) . +SEO leht om viimäte muudõt 07 : 59 , 25. lehekuu 2009 . +houses | Apartments | Hotels | Things to do | Last Minute Offers ! 
+Formigal also known as Fornigal in Aragon is a small town in the province of Huesca of Northern ... +if you 're looking for things to do in Formigal , Pyrenees in Spain the main attraction here ... +the climate in Formigal is that of the Pyrenees in Aragon , Spain , cold in the winter and mild ... +we are amongst one of the main ski resorts in the Iberian Penninsula and the Pyrnees . +below these lines we offer you some information of interest in Formigal , Pyrenees , Spain . +to speak of nature in Formigal is to speak of the Tena Valley , the Pyrenees and incredible areas ... +Formigal is a ski resort located in a small municipality of about 200 people . +there are various sports you can practice in Formigal , Spain . +if you want to get a great meal in Formigal , it will not be difficult as there are various ... +Formigal , Spain in the Pyrnees belongs to the province of Aragon and is located in the ... +Sallent de Gallego is a small town in the province of Huesca in the Pyrenees in Spain that has ... +Formigal is located in Sallent de Gallego , in the Pyrenees of Spain . +Formigal also known as Fornigal in Aragon is a small town in the province of Huesca of Northern Spain in the heart of the Pyrenees . +it is part of the locality of Sallent de Gallego and has a population of about 200 inhabitants , although during the ski season this amount is multiplied by 5 . +Formigal is located at only a few kilometers from the French border , located at 90 kilometers from Huesca . +it is located in the beautiful Tena Valley and is one of the most important and biggest ski resorts in all of Spain . +the capacity of this ski resort is for approximately 30,000 people . +Aramon Formigal is the Ski resort in Formigal , and this resort is located between 1500 @-@ 2250 meters above sea level , and has more than 100 kilometers of skiable slopes . +this resort has all types of equipment such as ski lifts , and conveyer belts to get around . 
+it has the capacity to allow 25,000 people ski each hour , and has services for material rental , restaurants , accommodation , accommodation such as hotels and apartments , health clinics , and a variety of slopes special for snowboarding , slalom and tubers . +located in the historic city center of la Villa de Sallent , hotel Balaitus is a traditional mountain lodging . the outside is classic in style with an arch of stone . +situated in Huesca the hotel is located in an area of natural beauty , the hotel offers an ambience of peace and tranquillity in the heart of the Pyrenees . +it is located in an exception location . very cozy home and well equipped with all of the necessities for your holidays . +this holiday apartment in Formigal , Pyrenees in northern Spain is located right by the slopes and very centric . +near the supermarkets , bars , restaurants and much much more ! +this flat for rent in Benalmadena , Malaga is located in the ... +self catering holiday in nice apartment of 90sqm in Cala ... +hotels | Holiday apartments | Hostels | Rent a Car | Last Minute Offers ! +real estate , constructions , transfers etc. in Fuengirola . +if you are searching for places to stay on the Costa del ... +these Fuengirola apartment rentals in Malaga , Spain are ... +this studio for rent in Torremolinos , Malaga is only a very ... +once reservations have been confirmed we kindly ask you for a down payment of € 300 , -- per room to be remitted to our account with Raiffeisenbank Ehrwald . +in case of premature departure or late arrival we will charge room rates as booked . +we accept cash , travellers cheques , EC- , Visa- or Mastercard as well as advance bank transfer . +we would like to advise you that it is only possible to book individual room categories . reservations of specific rooms or floors are not accepted . +rooms are available for guests from 3 p.m. +guests are asked to vacate rooms by 11 a.m. 
+with your reservation you can take out hogast holiday insurance to cover you for cancellation and other unforeseen eventualities . +this statistic is based on the 6815 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 01 Mar 2008 . +the exact speed varies depending on the system configuration , software program , and document complexity . +time needed before printing after power switched on ; expressed in seconds . +size of pallets ( Width x Depth x Height ) in millimeter . +all personal data is encrypted and will be processed in a secure way . +HROS takes the privacy of your personal data very serious . +your personal information will only be used to process your booking . +for more information , read our privacy statement . +HROS will not charge you any reservation fees for making this booking , nor charge your credit card . +you will simply pay for your stay at the hotel . +cancellation is free of charge ; provided you adhere to the notification period stated in the hotel cancellation policy ( see " Hotel Policies " below ) . +more information can be found in our terms and conditions . +see your offers here too ? register online ( free ) ! +this statistic is based on the 6862 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 17 Sep 2005 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6368 free Open ICEcat users . +this statistic is based on the 6865 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 14 Jul 2005 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6371 free Open ICEcat users . 
+this statistic is based on the 6827 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 23 Jul 2006 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6336 free Open ICEcat users . +this statistic is based on the 6870 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 24 Sep 2005 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6376 free Open ICEcat users . +this statistic is based on the 6828 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 24 Sep 2005 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6336 free Open ICEcat users . +this statistic is based on the 6863 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 24 Oct 2006 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6369 free Open ICEcat users . +this statistic is based on the 6819 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 02 Nov 2006 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6328 free Open ICEcat users . +this statistic is based on the 6859 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 15 Nov 2006 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6365 free Open ICEcat users . 
+this statistic is based on the 6861 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 20 Mar 2008 . +this statistic is based on the 6862 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 11 Mar 2007 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6368 free Open ICEcat users . +this statistic is based on the 6864 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 15 Jul 2005 . +ICEcat : creating the world 's largest open catalogue with 2300000 products , 604865 data @-@ sheets , 4001 brands . +work towards the harmonisation of terminology in the field of audit . +from its very beginning EUROSAI has been active in organizing a fruitful and mutually beneficial cooperation in the field of public audit among all the countries of the previously divided Europe , prioritizing support for the establishment of independent public audit bodies in Central and Eastern European transition countries . +drivers and Updates includes hardware drivers ( e.g. , NI @-@ DAQ , NI @-@ 488.2 , NI @-@ IMAQ , etc . ) , firmware updates , and application software updates ( e.g. , NI LabVIEW and NI Measurement Studio ) . +to obtain instrument drivers for NI products ( e.g. , NI @-@ SCOPE , NI @-@ DMM , etc . +) or for third @-@ party products ( e.g. , HP 34401A Digital Multimeter ) , visit the Instrument Driver Network . +my Profile | RSS | Privacy | Legal | Contact NI © 2009 National Instruments Corporation . +it 's the last month of the year ... time to look back on this year and time to embrace the new one . +for the Kindo team it was a very exciting one and we are busy working in the arms of the MyHeritage family . 
+you can read the announcement here ( pdf ) , or get the complete FAQ here , but I wanted to give you some more background on why Kindo and MyHeritage have teamed up . +the first time we met Gilad , the founder and CEO of Myheritage , was in early 2007 - a few weeks before we released the first public version of Kindo . +Gareth and me were invited to have lunch with him in Soho in central London , and went there with the objective of learning everything we could about the " enemy . " +this proved to be pretty naive , since Gilad is much too nice to be called an " enemy . " +but there we were : a Swede , a South African and an Israeli , all with very different professional background and life stories , talking about the future of families online from very different perspectives . +I didn 't expect this , but we found that we had much in common . +we shared the same ideas and vision for what we wanted to achieve with our businesses , even though our approach was far from similar . +Kindo had set out to build a site that would help you interact with the family that is around you here and now . +we were trying to come up with tools to help you share information and communicate with the people that matter most to you right now . +MyHeritage on the other hand had developed amazing technologies to help you find out everything about your family 's history , and had spent years perfecting these technologies . +ultimately though , what interested us both was the opportunity to help families discover more about who they are and their past , and use the web to bring them closer together . +as Gareth and I travelled back on the tube , we talked about how nice it would be to be able to offer our own users the same tools as MyHeritage already had . +what really got us excited was their SmartMatching Technology , which matches people in your family tree with 250 Million other names , and suggests who you might be related too . 
+during this summer , we 've been thinking long and hard about the future of Kindo , and what the best option would be for taking Kindo to the next level . +the more time we spent with Gilad as well as the rest of the team in Israel ( not to mention the very loud rooster that runs around in their campus ) , the more convinced we all became - we 'll be better off together . +so we join the MyHeritage family because we share the same vision and values ( as families should ) , and because we think that we can build an amazing product together - bringing real benefits to families around the world . +this is what we 're planning to do over the next years . +as you see from the smiles in the picture below , this is a happy day for us here in the Putney Offices . +this years Swedish Genealogy Days the yearly conference of the Swedish Federation of Genealogical Societies was held in the city of Malmö in the province of Skåne in southern Sweden . +over 100 exibitors crowded in the convention center at Europaporten in Malmö . everything from genealogical societies , archives to companies offering original sources online . +in the meanwhile , you can still work on your profiles and upload the photos from the weekend 's family reunion . +update : now everything is fixed ! it was the database acting a bit weird but Stephen has done a great job to fix it . +everyone hits a brick wall in genealogy , where the birth and death records just aren 't easy to find . in that case , try some different records . +deeds often contain information on who sold what to whom ; who inherited what from whom ; or how some land was divided among a family . historic tax records on land also sometimes have interesting information , such as co @-@ ownership . +just as the squirrels and different kinds of birds collect nuts and seeds , someone in the Kindo office seems to be doing some really solid forward planning . +judging from this picture - it 's going to be a looong winter . 
+as I point out in my book , Genealogy Online 8th Edition , you can do a lot of good research using search engines and Boolean operators ( AND , NOT , OR , and parentheses . ) Recently , Yahoo ! +advanced Google still accepts the most popular Boolean terms , and Exalead even supports the NEAR operator , which really helps with common surnames , but Live Search is now the only major search engine with full Boolean support . +for more details on how to do the Boolean Boogie for genealogy , read Genealogy Online 8th Edition . +most beginners to genealogy are fuzzy about cousinship . for example , while many understand first cousins are people with common grandparents , many folks confuse first cousin once removed and second cousin . +then , there are families like mine : my mother 's siblings were born from 1911 to 1932 . that means her youngest brother " Mike " went to school with the oldest sister 's oldest child , his niece " Karen . " +as it turned out , Uncle " Mike " and niece " Karen " married two people who were siblings , " Michelle " and " Kevin , " making their children both first cousins and first cousins once removed . +showing degrees of relationship by blood . +-first it does not replace paper documentation since it does not name common ancestors , it is more a tool to prove lineage . +several companies , like Family Tree DNA , can assist you to confirm your family tree or allow you to determine whether two people are related if you give them DNA of someone . +- It 's less reffective for females to use DNA than for males because thanks to Y @-@ DNA , we can determine paternal and maternal lines , whereas for females it is just maternal . +here are a few suggestions to get you started : introduction to Genealogy from National Genealogical Society ( U.S. ) Most genealogists take this course first . it is very American @-@ centric , but the techniques can be applied to any country 's vital statistics . 
+members of the National Genealogical Society ( NGS ) receive a tuition discount . family History Personal Enrichment Classes Learn from Brigham Young University about research in the United States , France and Germany . +Lufthansa owes its origins to " Deutsche Luft Hansa Aktiengesellschaft " ( renamed " Lufthansa " in 1933 ) , which is formed from a merger between " Deutsche Aero Lloyd " ( DAL ) and " Junkers Luftverkehr " on January 6 . +the new airline inherits its crane logo , designed by " Deutsche Luft @-@ Reederei " in 1919 , from DAL , the blue @-@ and @-@ yellow house colours from Junkers . +it commences scheduled flights on April 6 with a fleet of 162 aircraft , of 18 different types . +a flying expedition to China is the event of the year . +following its acquisition of shares in 1926 in the German @-@ Russian " Dereluft " airline , which was founded in 1921 , Lufthansa is influential in the founding of the Spanish Iberia , the Brazilian " Syndicato Condor " and the Chinese " Eurasia " airlines . +Lufthansa opens the first trans @-@ oceanic , scheduled airmail service across the South Atlantic . +between 1936 and 1938 , it also experiments with scheduled air services across the North Atlantic . +after substantial expansion of the route network in 1939 — including flights to Bangkok and Santiago de Chile — wartime air services , except for a few European countries , are suspended . +all flights are discontinued in 1945 and Lufthansa goes into receivership and is finally wound up and struck from the Berlin commercial register in 1965 . +the Federal Transport Minister sets up a working committee in 1951 to prepare for the resumption of air traffic in postwar Germany and entrusts the job of implementation to " Büro Bongers , " the office headed by Hans M. Bongers , the traffic chief of the old Lufthansa in Cologne . 
a new company to run air services and named " Aktiengesellschaft für Luftverkehrsbedarf " ( Luftag ) is founded in Cologne on January 6 , 1953 . +Lufthansa enters the jet age , initially on long @-@ haul routes , with the arrival in the fleet of the Boeing 707 . +the last of the propeller @-@ driven aircraft , a Vickers Viscount , is retired in 1971 . +conversion to jet aircraft continues with the start of flights on medium @-@ haul routes with the Boeing 727 and , on short @-@ haul , with the Boeing 737 , the city jet largely fathered by Lufthansa . +the wide @-@ body era begins at Lufthansa with the delivery of its first Boeing 747 jumbo jet in 1970 , later to be joined by the McDonnell @-@ Douglas DC10 and the A300 , the first of the jets from the newly founded European aircraftmaker . +Lufthansa resumes flights to Berlin 45 years after the end of World War Two following Germany 's reunification . +Lufthansa masters its worst @-@ ever economic crisis with a sweeping rehabilitation programme . +the airline , largely owned by the state , is privatised step by step . +its MRO , cargo and IT businesses are spun off as independent companies . +Lufthansa , Air Canada , SAS , Thai Airways und United Airlines create the " Star Alliance , " the world 's first multilateral airline grouping , later to be joined by other carriers . +the Lufthansa Aviation Group equips itself for the new millenium , training its focus on innovation and quality . +placement of orders for 15 Airbus A380 megaliners charts the airline 's path into the future . +even during times of crisis in the aviation industry , Lufthansa remains on the ascent : with the " Future European Operations " programme , the airline reorganises its regional markets , while gaining new partner airlines to expand the Star Alliance global route network . +passengers enjoy greater comfort in a completely revamped Business Class with fast broadband Internet connectivity in the aircraft cabin . 
+Lufthansa creates new perspectives for Germany 's future as a business location : Lufthansa orders 20 Boeing 747 @-@ 8s and is the launch customer for the aircraft . +preparations for the A380 include route proving with Airbus , a new A380 maintenance hangar and a new terminal area in Frankfurt . +the Lufthansa Aviation Center becomes an architectural flagship . +Lufthansa Cargo founds the cargo airline AeroLogic with DHL Express . +further partners strengthen the Star Alliance , which now encompasses 21 members . +adjust settings for the connection to devices that support DLNA . +enable connection to devices that support DLNA . +disable connection to devices that support DLNA . +DLNA ( Digital Living Network Alliance ) is a standard that enables digital devices such as personal computers , digital video recorders , and TVs to be connected on a network and to share data that is on other connected , DLNA @-@ compatible devices . +" servers " distribute media such as image , music , or video files , and " clients " receive and play the media . +using a PS3 ™ system as a client , you can display images , or play music or video files that are stored on a device with DLNA Media Server functionality over a network . +connect the PS3 ™ system and DLNA Media Server using a wired or wireless connection . +set up the DLNA Media Server so that it can be used by the PS3 ™ system . +the following devices can be used as DLNA Media Servers . +enable the DLNA Media Server function of the connected device to make its content available for shared access . +the setup method varies depending on the connected device . +for details , refer to the instructions supplied with the device . +a Microsoft ® Windows ® personal computer can be used as a DLNA Media Server by using Windows Media ® Player 11 functions . +from the list of devices under the [ Share media ] checkbox , select the devices that you want to share data with , and then select [ Allow ] . 
+setup for the DLNA Media Server is completed . +Windows Media ® Player 11 is not installed by default on a Microsoft ® Windows ® personal computer . +download the installer from the Microsoft ® Web site to install Windows Media ® Player 11 . +for details about how to use Windows Media ® Player 11 , refer to the Windows Media ® Player 11 Help feature . +in some cases , original DLNA Media Server software may be installed on the personal computer . +for details , refer to the instructions supplied with the computer . +all available folders and files that can be played by the PS3 ™ system will be displayed . +select the file that you want to play . +the PS3 ™ system must be connected to a network . +the folder names that are displayed vary depending on the DLNA Media Server . +depending on the DLNA Media Server , some files may not be playable or operations that can be performed during playback may be restricted . +file names for data that is stored on servers that are not compliant with DLNA may have an asterisk appended to the file name . +in some cases , these files cannot be played on the PS3 ™ system . +also , even if the files can be played on the PS3 ™ system , it might not be possible to play the files on other devices . +you can initiate a search for DLNA Media Servers on the same network . +use this feature if no DLNA Media Server is detected when the PS3 ™ system is turned on . +when the search results are displayed and you return to the home menu , a list of DLNA Media Servers that can be connected will be displayed . +apartments | Hotels | Hostel | Holiday Houses | Things to do | Last Minute Offers ! +holiday lettings available on the Costa del Sol of the ... +apartment to rent in Fuengirola , 15 minutes walk from the ... +at only steps from the Paseo Maritimo , this beach ... +196 / 2003 regarding the processing of personal data and consensus to process such data . 
data are being acquired here in order to register the person involved and to initiate a service through which said person will receive information about offers , prices and similar initiatives pertaining to the Romantik Hotel Posta Cavallino Bianco . +even though providing such data , which will be processed by means of electronic procedures , is not mandatory , not agreeing to furnish such information will make processing by the handler impossible . +the person involved can request to update , correct and supplement incomplete or inaccurate data , and to cancel such data when the processing violates legal standards or regulations , and can exercise the other rights established by art . +196 / 2003 by contacting the owner of the processing , the Romantik Hotel Posta Cavallino Bianco . +which navigation systems are compatible with Tele Atlas maps ? +do I receive confirmation of my order ? +what are Tele Atlas " general terms and conditions ? +which navigation systems are compatible with Tele Atlas maps ? +this package contains the documentation for otrs2 in English language . +this service is sponsored by 1 & 1 Internet AG . +holiday Apartments | Hotels | Hostels | Top ten things to do | Last Minute Offers ! +this statistic is based on the 6819 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 29 Oct 2006 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6328 free Open ICEcat users . +this statistic is based on the 6819 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 29 Oct 2006 . +this statistic is based on the 6819 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 22 Dec 2006 . 
+der englischsprachige Channel listet Inhalte ausgewählter Blogs auf , die regelmäßig Beiträge zu WordPress publizieren . +normally when I set up plugin @-@ level SEO on a WordPress blog , I 'll need 5 @-@ 8 plugins to provide all the desired SEO functionality . wouldn 't it be cool if there was one plugin that incorporated all that functionality and more into one easy @-@ to @-@ use suite ? +there is finally an official answer to the question of whether or not WordPress themes must " inherit " the GPL license that WordPress itself uses . +Matt asked the Software Freedom Law Center to examine the WordPress source and how themes fit in . the final , official answer to whether themes must be GPL ? +the WordPress Plugin Competition is the best moment of the year , plugin wise . +this year 's edition is still running till the end of the month of July and as of writing , there are about 10 plugins ( with some really interesting stuff ) . still , new plugins are released every day . +why aren 't these authors joining the competition ? participating in the competition is Instructive : you * will * learn something . +if WordPress were a country , our Bill of Rights would be the GPL because it protects our core freedoms . +we 've always done our best to keep WordPress.org clean and only promote things that are completely compatible and legal with WordPress " license . +there have been some questions in the community about whether the GPL applies to themes like we 've always assumed . +the Official WordPress Commercial Theme Directory is now open . it is linked off the WordPress Extend and is part of the parent theme directory set of links . +it is not as much a directory as it is a listing of sites that offer commercial GPL themes . +if you aren 't already following along , I highly recommend checking out the How To Create A WordPress Theme tutorial series by Ian Stewart ( ThemeShaper.com ) . 
+this 12 @-@ part series ( 8 complete at the time of this writing ) aims to take you from nothing to a fully functional , semantically rich , flexible WordPress theme in digestible chunks . +along the way , Ian describes the changes being made , and why you are making them . +a critical vulnerability has been discovered in the WordPress Plugin Related Sites plugin . +an exploit is available in the wild and available on Milw0rm , making this attack easier to exploit . +although , the vulnerability says that version 2.1 is vulnerable . +you should assume previous versions are vulnerable as well . +DM Albums ™ is an inline photo album / gallery plugin that displays high quality images and thumbnails perfectly sized to your blog . two vulnerabilities have been made public : 1 . +wp Roadmap aims to create a detailed view of how WordPress works , by organizing the list of calls by the order in which they are executed The code currently tracks actions , filters , includes , and requires . +wp Roadmap can track the order of these calls from different page views and different versions of WordPress . +WordPress Configuration Tricks : if you have ever installed WordPress and wanted to know what else you could do with your wp @-@ config.php file , this is the blog post to read . +as Ozh points out in the comments , a couple of tricks were left out but nothing that could not be remedied with a simple Google search ( e.g. +wp _ HTTP ) as long as you know what you are looking for . +tips from that page that got me thinking ( things that make you go hmmmmm ? +beta 2 is here . get it while it 's ... still not quite out of Beta ... or something . +Dev4Press has done some interesting work on benchmarking various aspects of WordPress and then testing out some popular questions on page loads etc . +the methodologies are well documented and the benchmarking setup is standardized . 
+they are testing three version of WordPress including 2.6.5 , 2.7.1 and 2.8 ( I wish they would have waited for 2.8.1 to be released ) . +are you one of those worried about the progress of your WordPress blog ? do you wanna know how your visitors found you and how they interact with your site ? +well , it requires you to drill down the site analytics to get to the bottom . +Google Analytics is a trusted third party service that provides an accurate view and detailed analysis of performance statistics of your website without asking for your own resource . it may sound strange that some other website is tracking your performance without you having to provide anything . +what do you mean by a virtual world ? a place where you can come and work and chat and play and leave . +what if I tell you about a whole new world where you can live , breathe , play , sing , dance , do every damn thing you can do in this ( sorry ? +live Streaming video from WordCamp Dallas thanks to Cali Lewis of GeekBrief.tv. Thanks to WPTavern for the link . +I was at WordCamp Dallas last year but could not make it to this one . I am jealous ! +WordCamp Dallas 2009 is going on this weekend in the DFW area , and The WordPress Podcast is happy to be one the sponsors . +Cali Lewis is live streaming the entire event , so if you want to catch some good WordPress info , that can 't be beat ! +2.8.1 Beta 2 is ready for testing . download it , check out the changes since beta 1 , and review all tickets fixed in 2.8.1 . +we especially suggest , recommend , and beg that plugin developers test their plugins against beta 2 and let us know of any issues . +one of the sites I 'm running is a long established community website with zillions of posts and bazillions of comments . +the whole stuff is powered by a home made CMS written in Perl , something I did nearly 10 years ago and that is totally outdated today . 
+it 's not uncommon that a blog runs by different authors , so it may be useful if you can have a quick look at the drafts of all authors . +in our joint blog WP Engineer we created a feed , which keeps us up to date if a new draft of all authors were created . the work of every author is different and the dashboard is the center of information . +therefore we have decided to supplement a widget in the Dashboard , which shows the last five drafts of all authors . I enhanced the existing Plugin Draft Feed . +so , if I didn 't forget to upload a zip archive or to press a " Commit " button , it should be all here for you : the GPL 'd URL shortener I wrote about earlier that Lester Chan and I have been using for a while now . +our URL shortener service is called YOURLS , as in Your Own URL Shortener . it 's all GPL , free , do @-@ what @-@ you @-@ want @-@ with @-@ it . +based upon the emails we get each month from readers who are trying to find our Twitter feed , I feel this post is probably long overdue , but I wanted to point out to our readership that we do in fact have a Twitter page ! +if you 'd like to follow WPHacks.com on Twitter , you can get our updates here ( @ HackWordPress ) . +here 's a snippet of his reasoning ... I got a tip that Chris Anderson 's upcoming book Free has the following to say about WordPress : 2 . feature limited ( Basic version free , more sophisticated version paid . +today we have a little link tip for our readers : at wp topics you can find all useful news , hacks , tutorials about WordPress in one place right away ! +in each category are the most relevant and interesting websites about WordPress listed . +you can also vote for each website , how much you like their content . +WordPress 2.8 just crossed the 1 million download mark today , you can see the live counter for WordPress downloads on the download counter page . 
+this is definitely great news since this milestone was reached in 12 days , have you upgraded to WordPress 2.8 yet ? thanks @ photomatt via twitter . +by the time you 're reading this , it is quite possible that the BuddyPress revolution will have happened , but if you 're reading this soon after this was posted , I can safely say that the BuddyPress revolution hasn 't happened yet . it 's under way , but is hasn 't happened . +one week after AC / DC 's Black Ice Tour in Paris , I 've had yet another musical rendez @-@ vous : Hellfest Open Air 2009 ( momentarily closed as of writing ) , a music festival held about 30 kms from where I currently live . +I 've seen live on stage : ( ... ) Read the rest of Bands I 've Seen This Weekend : Hellfest 2009 , My Recap ( 216 words ) ( cc ) Ozh for planetOzh , 2009 . +since so many people are having simple problems with their WordPress 2.8 installations , the WordPress community is working tirelessly to get a point @-@ one update out as soon as possible , and the first step on that road is to test a beta of the release . +WordPress 2.8.1 Beta 1 has been released , and they need people to test it out and see if it fixes the problems , so if you are having issues with WordPress 2.8 , try the beta , and see if that resolves it . +some plugins are causing grief for those upgrading to 2.8 . HyperDB needs to be updated to the latest version , otherwise tags won 't save . +plugins that load old versions of jQuery for all admin pages will break all kinds of stuff . plugins should use the version of jQuery that ships with WP . +Keno Xperiment is an extended version of the Keno Lottery game . +the player is presented with a game board consisting of the numbers 1 @-@ 80 from which he has to choose his lucky 2 @-@ 10 numbers . +following that , you can either click � Play one � to start the round or � Play Five � to play five consecutive rounds with the same numbers you chose at the very beginning . 
+you can easily deselect a number by clicking on it . +this Keno version allows you to set some numbers to be picked at random for the next round or even to allow random selection of numbers using the � Play five � option . +however , if you simply click the random pick button without setting your number amount , this too , will be chosen automatically . +CLICK2PAY is an instant , international & secure web wallet that allows you to transfer funds instantly from a variety of different options , in Euros , UK Pounds and US Dollars . +you can fund your Click2Pay account with : visa , Mastercard , Online Banking , Bank Deposit & by direct debit . +it enables you to access your money round the globe , anytime , whenever you need it . +all your CLICK2PAY transactions , history and account balance can be viewed and managed in your online account @-@ area . +it 's safe & reliable , using standard encryption technology means you can rest easy that your information will remain secure . +with its ' 24 / 7 support , CLICK2PAY is a great choice and our recommended payment method of the month ! +deposits through CLICK2PAY will be awarded with a 10 % extra bonus by the casino ! +from the casino lobby you can simply click on the casino cashier icon to make your deposit . +there you will see a variety of payment methods listed at the top of the page . +select the payment method that suits you best and follow the easy steps on making a deposit . +if you have any queries thoughout the process , just click on the CHAT icon to customer service representative . +get into the festive spirit as we take a look at the biggest lottery game of all times . +and if that gets you in the mood for big money then stay tuned for the amazing holiday treats that Europa Casino has in store for you ! +from all of us here at Europa Casino , we wish you a very Merry Christmas and a Happy New Year ! 
+my Speedy Alert is an exceptional tool designed for the purpose of updating you with any of our recent specials & promotions . +to your computer in real time ! +don � t miss out , read more about it here . +lottery remains for centuries one of the most common forms of gambling endorsed by governments worldwide . +the very first signs of this popular trend take us way back to the Han Dynasty in the Far East where a Lottery was used to aid in the finance of major governmental projects , targeted mainly to glorify the government and its people . +the game of Lottery is very strongly printed in today � s culture and even celebrated with splendor as we approach the holiday season . +the biggest celebration by far takes us to our neighboring Spain , where locals participate and enjoy Spain � s national Christmas lottery game known as � El Gordo � ( The Fat One ) , since 1812 . +as well as being one of the oldest lotteries around , the total prize pool in recent years , have surpassed the 2 billion Euro mark , making it also the biggest lottery worldwide ! ! +with prizes being so high , more than 100 new millionaires are produced every Christmas . +� � El Gordo � works differently to most lottery games played , as countless people can share one single ticket . +a whole lottery ticket ( � billete � ) costs a few thousand Euros but these tickets are split to smaller sections , each costing around 20 Euros . +this means that many , even strangers , may share a whole ticket . +the idea is quite befitting especially now , in holiday season where many people in one community share a grand prize worth millions . +such an event can even affect the local economy . +the El Gordo takes place every 22nd of December and again on the 5th of January . +the whole drawing process takes at least three hours to complete , with the entire community tuned to the radio in suspense . 
+participation is also available through the internet , so now you don � t necessarily have to be in Spain in order to enter this grand lottery event . +Titan Poker players will be jetting across the globe in 2008 as the popular poker room prepares to send its top representatives to a series of exciting land @-@ based tournaments . +at the beginning of 2008 , Titan Poker players will be combining sun , fun and poker at the Aussie Millions , the South African Poker Open , the Irish Open , and the WPS Ocean World in the Dominican Republic ! +so , don 't dive under your duvet this December ! ! +escape the winter blues by joining Titan Poker ' s players at some of the world 's most exciting Poker Events ! +click here to view our most recent biggest winners here at Europa Casino ! +did you hear about the $ 3,000,000 Tennessee State Lottery ? +the winner gets $ 3 a year for a million years . +the First lottery games were introduced to the world some thousands of years ago ( around 200 BC ) , by Chinese generals using the game to finance their wars as well as to aid in the building of one particularly magnificent architectural aspiration . +these Keno @-@ Lottery type games had a big part in funding one of the � New 7 Wonders � , that being the Great Wall of China ! +CeBIT 2009 was a great event for both Rapid @-@ I and our visitors . +the german journal " iX " for IT professionals has published a review of RapidMiner which can be downloaded below . +hotels | Villas and Chalets | Apartments | Hostels | Camping | Things to do | Last Minute Offers ! +this Puerto de Santa Maria beachfront flat in Cadiz , Spain ... +self catering apartment in Conil de la Frontera for 6 ... +apartments El Puerto de Santa Maria , all new and centrally ... +Christoph Lindemann holds the Chair of Computer Networks and Distributed Systems in the Department of Computer Science at the University of Leipzig . 
+from March 1998 till October 2005 he was an associate professor in the Computer Science Department at the University of Dortmund and was leading the Mobile Computing Systems group . +he received the degree Diplom @-@ Informatiker ( M.S. in Computer Science ) from the University of Karlsruhe , Germany in 1988 and the degree Doktor @-@ Ingenieur ( Ph.D. in Engineering ) from the Technische Universität Berlin , Germany in 1992 . +from 1994 to 1997 he held positions as research scientist and project manager at the GMD Institute for Computer Systems and Software Technology ( GMD FIRST ) , known as Fraunhofer Institut FIRST today , in Berlin , Germany . +in summer 1993 and during the academic year 1994 / 1995 , he was a Visiting Scientist at the IBM Almaden Research Center , San Jose CA . +in the fall semester 2003 / 04 he spend his sabbatical at the Computer Science Department of the University of Wisconsin as a visiting professor . +his current research interests lie in mobile computing systems , especially mobile ad hoc networks and peer @-@ to @-@ peer systems as well as modelling and performance evaluation as an umbrella topic . +Christoph Lindemann is member of the IFIP working group 7.3 and a senior member of the IEEE . +he is on the editorial board of the international journal Performance Evaluation since 2005 . he is also a member of the Executive Board of ACM SIGMETRICS . +he has been serving as chair of the special interest group on measurements , modelling , and evaluation of computer systems and communication networks within the German Society of Informatics ( GI ) from 2005 to 2008 . +in 2005 , he served as General Co @-@ Chair for the 11th International Conference on Mobile Computing and Networking , ACM MobiCom . +Christoph Lindemann has organized the ACM MobiShare Workshop in 2006 and is serving as general chair of the 26th International Symposium on Computer Performance , Modeling , Measurements , and Evaluation , Performance 2007 . 
+Christoph Lindemann has also been appointed to the Program Committees of numerous top @-@ level international conferences , e.g. ACM SIGMETRICS 2007 and ACM MobiHoc 2008 . +this statistic is based on the 6819 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 17 May 2008 . +this statistic is based on the 6819 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 02 Nov 2006 . +this statistic is based on the 6819 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 07 May 2008 . +please notice the difference between DVD + R and DVD @-@ R when buying a DVD burner , often drives can write only to one of those two media types , sometimes to both . when you buy a drive that only supports one type , you will always have to be careful to choose the right writable DVD discs at the shop . +this statistic is based on the 6815 using ecommerce sites ( eshops , distributors , comparison sites , ecommerce ASPs , purchase systems , etc ) downloading this ICEcat data @-@ sheet since 22 Oct 2007 . +only sponsoring brands are included in the free Open ICEcat content distribution as used by 6324 free Open ICEcat users . +the Lord warns Nephi to depart into the wilderness . +his journeyings in the wilderness , and so forth . +Lehi prophesies of a land of liberty - His seed shall be scattered and smitten if they reject the Holy One of Israel - He exhorts his sons to put on the armor of righteousness . +1 And now it came to pass that after I , Nephi , had made an end of teaching my brethren , our a father , Lehi , also spake many things unto them , and rehearsed unto them , how great things the Lord had done for them in bringing them out of the land of Jerusalem . 
+2 And he spake unto them concerning their a rebellions upon the waters , and the mercies of God in sparing their lives , that they were not swallowed up in the sea . +3 And he also spake unto them concerning the land of promise , which they had obtained - how a merciful the Lord had been in b warning us that we should flee out of the land of Jerusalem . +4 For , behold , said he , I have a seen a b vision , in which I know that c Jerusalem is d destroyed ; and had we remained in Jerusalem we should also have e perished . +5 But , said he , notwithstanding our afflictions , we have obtained a a land of promise , a land which is b choice above all other lands ; a land which the Lord God hath c covenanted with me should be a land for the inheritance of my seed . +yea , the Lord hath d covenanted this land unto me , and to my children forever , and also all those who should be e led out of other countries by the hand of the Lord . +6 Wherefore , I , Lehi , prophesy according to the workings of the Spirit which is in me , that there shall a none come into this land save they shall be brought by the hand of the Lord . +7 Wherefore , this a land is consecrated unto him whom he shall bring . +and if it so be that they shall serve him according to the commandments which he hath given , it shall be a land of b liberty unto them ; wherefore , they shall never be brought down into captivity ; if so , it shall be because of iniquity ; for if iniquity shall abound c cursed shall be the land for their sakes , but unto the righteous it shall be blessed forever . +8 And behold , it is wisdom that this land should be a kept as yet from the knowledge of other b nations ; for behold , many nations would overrun the land , that there would be no place for an inheritance . 
+9 Wherefore , I , Lehi , have obtained a a promise , that b inasmuch as those whom the Lord God shall bring out of the land of Jerusalem shall keep his commandments , they shall c prosper upon the face of this land ; and they shall be kept from all other nations , that they may possess this land unto themselves . +and if it so be that they shall d keep his commandments they shall be blessed upon the face of this land , and there shall be none to molest them , nor to take away the land of their e inheritance ; and they shall dwell safely forever . +11 Yea , he will bring a other nations unto them , and he will give unto them power , and he will take away from them the lands of their possessions , and he will cause them to be b scattered and smitten . +12 Yea , as one generation passeth to another there shall be a bloodsheds , and great visitations among them ; wherefore , my sons , I would that ye would remember ; yea , I would that ye would hearken unto my words . +13 O that ye would awake ; awake from a deep a sleep , yea , even from the sleep of b hell , and shake off the awful c chains by which ye are bound , which are the chains which bind the children of men , that they are carried away captive down to the eternal d gulf of misery and woe . +and arise from the dust , and hear the words of a trembling a parent , whose limbs ye must soon lay down in the cold and silent b grave , from whence no traveler can c return ; a few more d days and I go the e way of all the earth . +15 But behold , the Lord hath a redeemed my soul from hell ; I have beheld his b glory , and I am encircled about eternally in the c arms of his d love . +16 And I desire that ye should remember to observe the a statutes and the judgments of the Lord ; behold , this hath been the anxiety of my soul from the beginning . 
+18 Or , that a a cursing should come upon you for the space of b many generations ; and ye are visited by sword , and by famine , and are hated , and are led according to the will and captivity of the c devil . +19 O my sons , that these things might not come upon you , but that ye might be a choice and a a favored people of the Lord . +but behold , his will be done ; for his b ways are righteousness forever . +20 And he hath said that : a Inasmuch as ye shall keep my b commandments ye shall c prosper in the land ; but inasmuch as ye will not keep my commandments ye shall be cut off from my presence . +22 That ye may not be a cursed with a sore cursing ; and also , that ye may not incur the displeasure of a b just God upon you , unto the destruction , yea , the eternal destruction of both soul and body . +23 Awake , my sons ; put on the armor of a righteousness . +shake off the b chains with which ye are bound , and come forth out of obscurity , and arise from the dust . +25 And I exceedingly fear and tremble because of you , lest he shall suffer again ; for behold , ye have a accused him that he sought power and b authority over you ; but I know that he hath not sought for power nor authority over you , but he hath sought the glory of God , and your own eternal welfare . +26 And ye have murmured because he hath been plain unto you . +ye say that he hath used a sharpness ; ye say that he hath been angry with you ; but behold , his b sharpness was the sharpness of the power of the word of God , which was in him ; and that which ye call anger was the truth , according to that which is in God , which he could not restrain , manifesting boldly concerning your iniquities . +27 And it must needs be that the a power of God must be with him , even unto his commanding you that ye must obey . +but behold , it was not he , but it was the b Spirit of the Lord which was in him , which c opened his mouth to utterance that he could not shut it . 
+28 And now my son , Laman , and also Lemuel and Sam , and also my sons who are the sons of Ishmael , behold , if ye will hearken unto the voice of Nephi ye shall not perish . +and if ye will hearken unto him I leave unto you a a blessing , yea , even my first blessing . +29 But if ye will not hearken unto him I take away my a first blessing , yea , even my blessing , and it shall rest upon him . +30 And now , Zoram , I speak unto you : behold , thou art the a servant of Laban ; nevertheless , thou hast been brought out of the land of Jerusalem , and I know that thou art a true b friend unto my son , Nephi , forever . +31 Wherefore , because thou hast been faithful thy seed shall be blessed a with his seed , that they dwell in prosperity long upon the face of this land ; and nothing , save it shall be iniquity among them , shall harm or disturb their prosperity upon the face of this land forever . +32 Wherefore , if ye shall keep the commandments of the Lord , the Lord hath consecrated this land for the security of thy seed with the seed of my son . +redemption cometh through the Holy Messiah - Freedom of choice ( agency ) is essential to existence and progression - Adam fell that men might be - Men are free to choose liberty and eternal life . +1 And now , Jacob , I speak unto you : thou art my a first -born in the days of my tribulation in the wilderness . +and behold , in thy childhood thou hast suffered afflictions and much sorrow , because of the rudeness of thy brethren . +2 Nevertheless , Jacob , my first @-@ born in the wilderness , thou knowest the greatness of God ; and he shall consecrate thine a afflictions for thy gain . +3 Wherefore , thy soul shall be blessed , and thou shalt dwell safely with thy brother , Nephi ; and thy days shall be a spent in the service of thy God . 
+Wherefore , I know that thou art redeemed , because of the righteousness of thy Redeemer ; for thou hast b beheld that in the c fulness of time he cometh to bring salvation unto men . +4 And thou hast a beheld in thy youth his glory ; wherefore , thou art blessed even as they unto whom he shall minister in the flesh ; for the Spirit is the same , yesterday , today , and forever . +and the way is prepared from the fall of man , and b salvation is c free . +5 And men are instructed sufficiently that they a know good from evil . +and the b law is given unto men . +and by the law no flesh is c justified ; or , by the law men are d cut off . +yea , by the temporal law they were cut off ; and also , by the spiritual law they perish from that which is good , and become miserable forever . +6 Wherefore , a redemption cometh in and through the b Holy c Messiah ; for he is full of d grace and truth . +7 Behold , he offereth himself a a sacrifice for sin , to answer the ends of the law , unto all those who have a broken heart and a contrite spirit ; and unto b none else can the c ends of the law be answered . +9 Wherefore , he is the firstfruits unto God , inasmuch as he shall make a intercession for all the children of men ; and they that believe in him shall be saved . +10 And because of the intercession for a all , all men come unto God ; wherefore , they stand in the presence of him , to be b judged of him according to the truth and c holiness which is in him . +11 For it must needs be , that there is an a opposition in all things . +if not so , my first @-@ born in the wilderness , righteousness could not be brought to pass , neither wickedness , neither holiness nor misery , neither good nor bad . +Wherefore , all things must needs be a compound in one ; wherefore , if it should be one body it must needs remain as dead , having no life neither death , nor corruption nor incorruption , happiness nor misery , neither sense nor insensibility . 
+12 Wherefore , it must needs have been created for a thing of naught ; wherefore there would have been no a purpose in the end of its creation . +Wherefore , this thing must needs destroy the wisdom of God and his eternal purposes , and also the power , and the mercy , and the b justice of God . +13 And if ye shall say there is a no law , ye shall also say there is no sin . +if ye shall say there is no sin , ye shall also say there is no righteousness . +and if there be no righteousness there be no happiness . +and if there be no righteousness nor happiness there be no punishment nor misery . +and if these things are not b there is no God . +and if there is no God we are not , neither the earth ; for there could have been no creation of things , neither to act nor to be acted upon ; wherefore , all things must have vanished away . +14 And now , my sons , I speak unto you these things for your profit and a learning ; for there is a God , and he hath b created all things , both the heavens and the earth , and all things that in them are , both things to act and things to be c acted upon . +16 Wherefore , the Lord God gave unto man that he should a act for himself . +Wherefore , man could not b act for himself save it should be that he was c enticed by the one or the other . +17 And I , Lehi , according to the things which I have read , must needs suppose that an a angel of God , according to that which is written , had b fallen from heaven ; wherefore , he became a c devil , having sought that which was evil before God . +18 And because he had fallen from heaven , and had become miserable forever , he a sought also the misery of all mankind . +Wherefore , he said unto Eve , yea , even that old serpent , who is the devil , who is the father of all b lies , wherefore he said : partake of the forbidden fruit , and ye shall not die , but ye shall be as God , c knowing good and evil . 
+19 And after Adam and Eve had a partaken of the forbidden fruit they were driven out of the garden of b Eden , to till the earth . +20 And they have brought forth children ; yea , even the a family of all the earth . +21 And the days of the children of a men were prolonged , according to the b will of God , that they might c repent while in the flesh ; wherefore , their state became a state of d probation , and their time was lengthened , according to the commandments which the Lord God gave unto the children of men . +for he gave commandment that all men must repent ; for he showed unto all men that they were e lost , because of the transgression of their parents . +22 And now , behold , if Adam had not transgressed he would not have fallen , but he would have remained in the garden of Eden . +and all things which were created must have remained in the same state in which they were after they were created ; and they must have remained forever , and had no end . +23 And they would have had no a children ; wherefore they would have remained in a state of innocence , having no b joy , for they knew no misery ; doing no good , for they knew no c sin . +24 But behold , all things have been done in the wisdom of him who a knoweth all things . +25 a Adam b fell that men might be ; and men c are , that they might have d joy . +26 And the a Messiah cometh in the fulness of time , that he may b redeem the children of men from the fall . +and because that they are c redeemed from the fall they have become d free forever , knowing good from evil ; to act for themselves and not to be acted upon , save it be by the punishment of the e law at the great and last day , according to the commandments which God hath given . +27 Wherefore , men are a free according to the b flesh ; and c all things are d given them which are expedient unto man . 
+and they are free to e choose f liberty and eternal g life , through the great Mediator of all men , or to choose captivity and death , according to the captivity and power of the devil ; for he seeketh that all men might be h miserable like unto himself . +29 And not choose eternal death , according to the will of the flesh and the a evil which is therein , which giveth the spirit of the devil power to b captivate , to bring you down to c hell , that he may reign over you in his own kingdom . +30 I have spoken these few words unto you all , my sons , in the last days of my probation ; and I have chosen the good part , according to the words of the prophet . +and I have none other object save it be the everlasting a welfare of your souls . +the Gods finish their planning of the creation of all things - They bring to pass the creation according to their plans - Adam names every living creature . +1 And thus we will finish the heavens and the earth , and all the a hosts of them . +2 And the Gods said among themselves : on the seventh time we will end our work , which we have counseled ; and we will a rest on the b seventh time from all our work which we have counseled . +3 And the Gods concluded upon the seventh time , because that on the seventh time they would a rest from all their b works which they ( the Gods ) counseled among themselves to form ; and c sanctified it . +and thus were their decisions at the time that they counseled among themselves to form the heavens and the earth . +5 According to all that which they had said concerning every plant of the field before it was in the a earth , and every herb of the field before it grew ; for the Gods had not caused it to rain upon the earth when they counseled to do them , and had not formed a man to till the ground . +6 But there went up a mist from the earth , and watered the whole face of the ground . 
+7 And the a Gods formed man from the b dust of the ground , and took his c spirit ( that is , the man 's spirit ) , and put it into him ; and breathed into his nostrils the breath of life , and man became a living d soul . +8 And the Gods planted a garden , eastward in a Eden , and there they put the man , whose spirit they had put into the body which they had formed . +9 And out of the ground made the Gods to grow every tree that is pleasant to the sight and good for food ; the a tree of life , also , in the midst of the garden , and the tree of knowledge of good and evil . +10 There was a river running out of Eden , to water the garden , and from thence it was parted and became into four heads . +11 And the Gods took the man and put him in the Garden of Eden , to dress it and to keep it . +13 But of the tree of knowledge of good and evil , thou shalt not eat of it ; for in the time that thou eatest thereof , thou shalt surely die . +now I , Abraham , saw that it was after the Lord 's a time , which was after the time of b Kolob ; for as yet the Gods had not appointed unto Adam his reckoning . +14 And the Gods said : let us make an help meet for the man , for it is not good that the man should be alone , therefore we will form an help meet for him . +16 And of the rib which the Gods had taken from man , formed they a a woman , and brought her unto the man . +18 Therefore shall a man leave his father and his mother , and shall a cleave unto his wife , and they shall be b one flesh . +19 And they were both naked , the man and his wife , and were not a ashamed . +20 And out of the ground the Gods formed every beast of the field , and every fowl of the air , and brought them unto Adam to see what he would call them ; and whatsoever a Adam called every living creature , that should be the name thereof . +21 And Adam gave a names to all b cattle , to the fowl of the air , to every beast of the field ; and for Adam , there was found an c help meet for him . 
+men are called as high priests because of their exceeding faith and good works - They are to teach the commandments - Through righteousness they are sanctified and enter into the rest of the Lord - Melchizedek was one of these - Angels are declaring glad tidings throughout the land - They will reveal the actual coming of Christ . +1 And again , my brethren , I would cite your minds forward to the time when the Lord God gave these commandments unto his children ; and I would that ye should remember that the Lord God a ordained priests , after his holy order , which was after the order of his Son , to teach these things unto the people . +2 And those priests were ordained after the a order of his Son , in a b manner that thereby the people might know in what manner to look forward to his Son for redemption . +4 And thus they have been a called to this holy calling on account of their faith , while others would reject the Spirit of God on account of the hardness of their hearts and b blindness of their minds , while , if it had not been for this they might have had as great c privilege as their brethren . +9 Thus they become a high priests forever , after the order of the Son , the Only Begotten of the Father , who is without beginning of days or end of years , who is full of b grace , equity , and truth . +11 Therefore they were called after this holy order , and were a sanctified , and their b garments were washed white through the blood of the Lamb . +12 Now they , after being a sanctified by the b Holy Ghost , having their garments made white , being c pure and spotless before God , could not look upon d sin save it were with e abhorrence ; and there were many , exceedingly great many , who were made pure and entered into the rest of the Lord their God . +13 And now , my brethren , I would that ye should humble yourselves before God , and bring forth a fruit meet for repentance , that ye may also enter into that rest . 
+14 Yea , humble yourselves even as the people in the days of a Melchizedek , who was also a high priest after this same order which I have spoken , who also took upon him the high priesthood forever . +15 And it was this same Melchizedek to whom Abraham paid a tithes ; yea , even our father Abraham paid tithes of one @-@ tenth part of all he possessed . +16 Now these a ordinances were given after this b manner , that thereby the people might look forward on the Son of God , it being a c type of his order , or it being his order , and this that they might look forward to him for a remission of their sins , that they might enter into the rest of the Lord . +18 But Melchizedek having exercised mighty faith , and received the office of the high priesthood according to the a holy order of God , did preach repentance unto his people . +and behold , they did repent ; and Melchizedek did establish peace in the land in his days ; therefore he was called the prince of peace , for he was the king of Salem ; and he did reign under his father . +19 Now , there were a many before him , and also there were many afterwards , but b none were greater ; therefore , of him they have more particularly made mention . +20 Now I need not rehearse the matter ; what I have said may suffice . +behold , the a scriptures are before you ; if ye will b wrest them it shall be to your own destruction . +22 Yea , and the voice of the Lord , by the a mouth of angels , doth declare it unto all nations ; yea , doth declare it , that they may have glad tidings of great joy ; yea , and he doth sound these glad tidings among all his people , yea , even to them that are scattered abroad upon the face of the earth ; wherefore they have come unto us . 
+23 And they are made known unto us in a plain terms , that we may understand , that we cannot err ; and this because of our being b wanderers in a strange land ; therefore , we are thus highly favored , for we have these glad tidings declared unto us in all parts of our vineyard . +24 For behold , a angels are declaring it unto many at this time in our land ; and this is for the purpose of preparing the hearts of the children of men to receive his word at the time of his coming in his glory . +25 And now we only wait to hear the joyful news declared unto us by the mouth of angels , of his coming ; for the time cometh , we a know not how soon . +would to God that it might be in my day ; but let it be sooner or later , in it I will rejoice . +26 And it shall be made known unto a just and holy men , by the mouth of angels , at the time of his coming , that the words of our fathers may be fulfilled , according to that which they have spoken concerning him , which was according to the spirit of prophecy which was in them . +29 a Having faith on the Lord ; having a hope that ye shall receive eternal life ; having the b love of God always in your hearts , that ye may be lifted up at the last day and enter into his c rest . +30 And may the Lord grant unto you repentance , that ye may not bring down his wrath upon you , that ye may not be a bound down by the chains of b hell , that ye may not suffer the second c death . +31 And Alma spake many more words unto the people , which are not written in a this book . +revelation given through Joseph Smith , at Manchester , New York , March 1830 . HC 1 : 72 @-@ 74 . +1 @-@ 3 , Christ has all power ; 4 @-@ 5 , All men must repent or suffer ; 6 @-@ 12 , Eternal punishment is God 's punishment ; 13 @-@ 20 , Christ suffered for all , that they might not suffer if they would repent ; 21 @-@ 28 , Preach the gospel of repentance ; 29 @-@ 41 , Declare glad tidings . 
+1 I am a Alpha and Omega , b Christ the Lord ; yea , even I am he , the beginning and the end , the Redeemer of the c world . +3 Retaining all a power , even to the b destroying of Satan and his works at the c end of the world , and the last great day of judgment , which I shall pass upon the inhabitants thereof , d judging every man according to his e works and the deeds which he hath done . +4 And surely every man must a repent or b suffer , for I , God , am c endless . +5 Wherefore , I a revoke not the judgments which I shall pass , but woes shall go forth , weeping , b wailing and gnashing of teeth , yea , to those who are found on my c left hand . +6 Nevertheless , it is a not written that there shall be no end to this torment , but it is written b endless c torment . +7 Again , it is written a eternal damnation ; wherefore it is more express than other scriptures , that it might work upon the hearts of the children of men , altogether for my name 's glory . +8 Wherefore , I will explain unto you this a mystery , for it is meet unto you to know even as mine apostles . +9 I speak unto you that are chosen in this thing , even as one , that you may enter into my a rest . +10 For , behold , the a mystery of godliness , how great is it ! +for , behold , I am b endless , and the punishment which is given from my hand is endless c punishment , for d Endless is my name . +11 a Eternal punishment is God 's punishment . +15 Therefore I command you to repent - repent , lest I a smite you by the rod of my mouth , and by my wrath , and by my anger , and your b sufferings be sore - how sore you know not , how exquisite you know not , yea , how hard to bear you know not . +19 Nevertheless , glory be to the Father , and I partook and a finished my preparations unto the children of men . 
+20 Wherefore , I command you again to repent , lest I a humble you with my almighty power ; and that you b confess your sins , lest you suffer these c punishments of which I have spoken , of which in the smallest , yea , even in the least degree you have d tasted at the time I withdrew my Spirit . +21 And I command you that you a preach naught but repentance , and show b not these things unto the world until it is wisdom in me . +22 For they cannot a bear meat now , but b milk they must receive ; wherefore , they must not know these things , lest they perish . +23 a Learn of me , and listen to my words ; b walk in the c meekness of my Spirit , and you shall have d peace in me . +24 I am Jesus Christ ; I a came by the b will of the Father , and I do his will . +25 And again , I command thee that thou shalt not a covet thy b neighbor " s c wife ; nor seek thy neighbor 's life . +27 Which is my word to the a Gentile , that soon it may go to the b Jew , of whom the Lamanites are a c remnant , that they may believe the gospel , and look not for a d Messiah to come who has already come . +28 And again , I command thee that thou shalt a pray b vocally as well as in thy heart ; yea , before the world as well as in secret , in public as well as in private . +29 And thou shalt a declare glad tidings , yea , b publish it upon the mountains , and upon every high place , and among every people that thou shalt be permitted to see . +30 And thou shalt do it with all humility , a trusting in me , b reviling not against revilers . +31 And of a tenets thou shalt not talk , but thou shalt declare repentance and b faith on the Savior , and c remission of sins by d baptism , and by e fire , yea , even the f Holy Ghost . +32 Behold , this is a great and the last a commandment which I shall give unto you concerning this matter ; for this shall suffice for thy daily walk , even unto the end of thy life . 
+33 And misery thou shalt receive if thou wilt slight these a counsels , yea , even the destruction of thyself and property . +34 a Impart a portion of thy property , yea , even part of thy lands , and all save the support of thy b family . +35 Pay the a debt thou hast b contracted with the printer . +release thyself from c bondage . +37 And a speak freely to all ; yea , preach , exhort , declare the b truth , even with a loud voice , with a sound of rejoicing , crying - Hosanna , hosanna , blessed be the name of the Lord God ! +38 a Pray always , and I will b pour out my Spirit upon you , and great shall be your blessing - yea , even more than if you should obtain c treasures of earth and corruptibleness to the extent thereof . +39 Behold , canst thou read this without a rejoicing and lifting up thy heart for b gladness ? +40 Or canst thou run about longer as a a blind guide ? +41 Or canst thou be a humble and meek , and conduct thyself wisely before me ? +revelation given to Joseph Smith the Prophet and Sidney Rigdon , at or near Fayette , New York , December 1830 . HC 1 : 128 @-@ 131 . +at this time the Prophet was engaged almost daily in making a translation of the Bible . +the translation was begun as early as June 1830 , and both Oliver Cowdery and John Whitmer had served as scribes . +since they had now been called to other duties , Sidney Rigdon was called by divine appointment to serve as the Prophet 's scribe in this work ( verse 20 ) . +as a preface to his record of this revelation the Prophet wrote : " in December Sidney Rigdon came [ from Ohio ] to inquire of the Lord , and with him came Edward Partridge . . +1 Listen to the voice of the a Lord your God , even b Alpha and Omega , the beginning and the end , whose c course is one d eternal round , the e same today as yesterday , and forever . 
+2 I am Jesus Christ , the Son of God , who was a crucified for the sins of the world , even as many as will b believe on my name , that they may become the c sons of God , even d one in e me as I am f one in the Father , as the Father is one in me , that we may be one . +3 Behold , verily , verily , I say unto my servant Sidney , I have looked upon thee and thy works . +I have a heard thy prayers , and prepared thee for a greater work . +4 Thou art blessed , for thou shalt do great things . +behold thou wast sent forth , even as a John , to prepare the way before me , and before b Elijah which should come , and thou knewest it not . +6 But now I give unto thee a commandment , that thou shalt a baptize by water , and they shall receive the b Holy Ghost by the laying on of the c hands , even as the apostles of old . +7 And it shall come to pass that there shall be a great work in the land , even among the a Gentiles , for their b folly and their abominations shall be made manifest in the eyes of all people . +8 For I am God , and mine arm is not a shortened ; and I will show b miracles , c signs , and wonders , unto all those who d believe on my name . +9 And whoso shall ask it in my name in a faith , they shall b cast out c devils ; they shall heal the d sick ; they shall cause the blind to receive their e sight , and the deaf to hear , and the dumb to speak , and the lame to walk . +11 But a without faith shall not anything be shown forth except b desolations upon c Babylon , the same which has made d all nations drink of the wine of the wrath of her e fornication . +12 And there are a none that doeth good except those who are ready to b receive the fulness of my gospel , which I have sent forth unto this generation . 
+14 And their arm shall be my arm , and I will be their a shield and their buckler ; and I will gird up their loins , and they shall fight manfully for me ; and their b enemies shall be under their feet ; and I will let c fall the d sword in their behalf , and by the e fire of mine indignation will I preserve them . +16 And they shall learn the parable of the a fig -tree , for even now already summer is nigh . +18 And I have given unto him the a keys of the mystery of those things which have been b sealed , even things which were from the c foundation of the world , and the things which shall come from this time until the time of my coming , if he d abide in me , and if not , e another will I plant in his stead . +19 Wherefore , watch over him that his faith fail not , and it shall be given by the a Comforter , the b Holy Ghost , that knoweth all things . +21 For they will hear my a voice , and shall b see me , and shall not be c asleep , and shall d abide the day of my e coming ; for they shall be f purified , even as I am pure . +22 And now I say unto you , a tarry with him , and he shall journey with you ; forsake him not , and surely these things shall be fulfilled . +23 And a inasmuch as ye do not write , behold , it shall be b given unto him to prophesy ; and thou shalt preach my gospel and call on c the holy prophets to prove his words , as they shall be given him . +25 And a Israel shall be b saved in mine own due time ; and by the c keys which I have given shall they be led , and no more be confounded at all . +26 a Lift up your hearts and be glad , your b redemption draweth nigh . +27 Fear not , little a flock , the b kingdom is yours until I come . +revelation given through Joseph Smith the Prophet , on the bank of the Missouri River , McIlwaine 's Bend , August 12 , 1831 . HC 1 : 202 @-@ 205 . +on their return trip to Kirtland the Prophet and ten elders had traveled down the Missouri River in canoes . 
+on the third day of the journey many dangers were experienced . +elder William W. Phelps , in daylight vision , saw the destroyer riding in power upon the face of the waters . +1 @-@ 12 , The Lord has decreed many destructions upon the waters ; 13 @-@ 22 , The waters were cursed by John , and the destroyer rideth upon their face ; 23 @-@ 29 , Some have power to command the waters ; 30 @-@ 35 , Elders are to journey two by two and preach the gospel ; 36 @-@ 39 , They are to prepare for the coming of the Son of Man . +1 Behold , and hearken unto the voice of him who has all a power , who is from everlasting to everlasting , even b Alpha and Omega , the beginning and the end . +3 But verily I say unto you , that it is not needful for this whole company of mine elders to be moving swiftly upon the waters , whilst the inhabitants on either side are perishing in unbelief . +5 For I , the Lord , have decreed in mine anger many destructions upon the waters ; yea , and especially upon these waters . +6 Nevertheless , all flesh is in mine hand , and he that is faithful among you shall not a perish by the waters . +7 Wherefore , it is expedient that my servant Sidney Gilbert and my servant a William W. Phelps be in haste upon their errand and mission . +9 But now , verily I say , it behooveth me that ye should part . +10 And inasmuch as they are a faithful they shall be preserved , and I , the Lord , will be b with them . +11 And let the residue take that which is needful for clothing . +12 Let my servant Sidney Gilbert take that which is not needful with him , as you shall agree . +13 And now , behold , for your a good I gave unto you a b commandment concerning these things ; and I , the Lord , will reason with you as with men in days of old . +14 Behold , I , the Lord , in the beginning blessed the a waters ; but in the last days , by the mouth of my servant John , I b cursed the waters . +15 Wherefore , the days will come that no flesh shall be safe upon the waters . 
+16 And it shall be said in days to come that none is able to go up to the land of Zion upon the waters , but he that is upright in heart . +17 And , as I , the Lord , in the beginning a cursed the land , even so in the last days have I b blessed it , in its time , for the use of my saints , that they may partake the fatness thereof . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . +Frau Präsidentin , zur Geschäftsordnung . diff --git a/subword-nmt/subword_nmt/tests/test_bpe.py b/subword-nmt/subword_nmt/tests/test_bpe.py new file mode 100755 index 0000000000000000000000000000000000000000..d8c84857b4aa22903907ae3d217bf9468b168c88 --- /dev/null +++ b/subword-nmt/subword_nmt/tests/test_bpe.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals +import unittest +import codecs + +import os,sys,inspect +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +parentdir = os.path.dirname(currentdir) +sys.path.insert(0,parentdir) + +from learn_bpe import learn_bpe +from apply_bpe import BPE + + +class TestBPELearnMethod(unittest.TestCase): + + def test_learn_bpe(self): + infile = codecs.open(os.path.join(currentdir,'data','corpus.en'), encoding='utf-8') + outfile = codecs.open(os.path.join(currentdir,'data','bpe.out'), 'w', encoding='utf-8') + learn_bpe(infile, outfile, 1000) + infile.close() + outfile.close() + + outlines = 
open(os.path.join(currentdir,'data','bpe.out')) + reflines = open(os.path.join(currentdir,'data','bpe.ref')) + + for line, line2 in zip(outlines, reflines): + self.assertEqual(line, line2) + + outlines.close() + reflines.close() + +class TestBPESegmentMethod(unittest.TestCase): + + def setUp(self): + + with codecs.open(os.path.join(currentdir,'data','bpe.ref'), encoding='utf-8') as bpefile: + self.bpe = BPE(bpefile) + + self.infile = codecs.open(os.path.join(currentdir,'data','corpus.en'), encoding='utf-8') + self.reffile = codecs.open(os.path.join(currentdir,'data','corpus.bpe.ref.en'), encoding='utf-8') + + def tearDown(self): + + self.infile.close() + self.reffile.close() + + def test_apply_bpe(self): + + for line, ref in zip(self.infile, self.reffile): + out = self.bpe.process_line(line) + self.assertEqual(out, ref) + + def test_trailing_whitespace(self): + """BPE.proces_line() preserves leading and trailing whitespace""" + + orig = ' iron cement \n' + exp = ' ir@@ on c@@ ement \n' + + out = self.bpe.process_line(orig) + self.assertEqual(out, exp) + + def test_utf8_whitespace(self): + """UTF-8 whitespace is treated as normal character, not word boundary""" + + orig = 'iron\xa0cement\n' + exp = 'ir@@ on@@ \xa0@@ c@@ ement\n' + + out = self.bpe.process_line(orig) + self.assertEqual(out, exp) + + def test_empty_line(self): + + orig = '\n' + exp = '\n' + + out = self.bpe.process_line(orig) + self.assertEqual(out, exp) + +if __name__ == '__main__': + unittest.main() diff --git a/subword-nmt/subword_nmt/tests/test_glossaries.py b/subword-nmt/subword_nmt/tests/test_glossaries.py new file mode 100755 index 0000000000000000000000000000000000000000..2ff7da19fb00a8b8c9e7d33a67d6db4f0c72ef6c --- /dev/null +++ b/subword-nmt/subword_nmt/tests/test_glossaries.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import unittest +import mock + +import os,sys,inspect +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 
parentdir = os.path.dirname(currentdir)
# Make the package directory importable so `apply_bpe` resolves when this
# file is run directly from the tests directory.
sys.path.insert(0, parentdir)

from apply_bpe import isolate_glossary, BPE


class TestIsolateGlossaryFunction(unittest.TestCase):
    """Checks for the module-level isolate_glossary() with a literal glossary."""

    def setUp(self):
        self.glossary = 'like'

    def _run_test_case(self, test_case):
        """Assert that isolate_glossary() maps test_case[0] to test_case[1]."""
        orig, expected = test_case
        out = isolate_glossary(orig, self.glossary)
        self.assertEqual(out, expected)

    def test_empty_string(self):
        self._run_test_case(('', ['']))

    def test_no_glossary(self):
        self._run_test_case(('word', ['word']))

    def test_isolated_glossary(self):
        self._run_test_case(('like', ['like']))

    def test_word_one_side(self):
        self._run_test_case(('likeword', ['like', 'word']))

    def test_words_both_sides(self):
        self._run_test_case(('wordlikeword', ['word', 'like', 'word']))

    def test_back_to_back_glossary(self):
        self._run_test_case(('likelike', ['like', 'like']))

    def test_multiple_glossaries(self):
        self._run_test_case(
            ('wordlikewordlike', ['word', 'like', 'word', 'like']))


class TestBPEIsolateGlossariesMethod(unittest.TestCase):
    """Checks for BPE._isolate_glossaries() with several literal glossaries."""

    def setUp(self):
        # A MagicMock stands in for the BPE codes file, so no fixture file
        # has to be read to construct the object.
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        """Assert that _isolate_glossaries() maps test_case[0] to test_case[1]."""
        orig, expected = test_case
        out = self.bpe._isolate_glossaries(orig)
        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeUSAwordManuelManuelwordUSA'
        exp = ['word', 'like', 'USA', 'word', 'Manuel', 'Manuel', 'word', 'USA']
        self._run_test_case((orig, exp))


class TestRegexIsolateGlossaries(unittest.TestCase):
    """Checks for BPE._isolate_glossaries() with regular-expression glossaries."""

    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        # Raw strings: '\w' and '\d' are not valid Python string escapes.
        # The tag delimiters are required: a bare r'\w*' matches a whole
        # alphanumeric word, so it could never separate 'wordlike' from the
        # spans that the expectation below keeps apart.
        glossaries = [r"<country>\w*</country>", r"<name>\w*</name>", r"\d+"]
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        """Assert that _isolate_glossaries() maps test_case[0] to test_case[1]."""
        orig, expected = test_case
        out = self.bpe._isolate_glossaries(orig)
        self.assertEqual(out, expected)

    def test_regex_glossaries(self):
        orig = ('wordlike<country>USA</country>word10001word'
                '<name>Manuel</name>word<country>USA</country>')
        exp = ['wordlike', '<country>USA</country>', 'word', '10001', 'word',
               '<name>Manuel</name>', 'word', '<country>USA</country>']
        self._run_test_case((orig, exp))


def encode_mock(segment, x2, x3, x4, x5, x6, x7, glosses, dropout):
    """Stand-in for apply_bpe.encode used by TestBPESegmentMethod.

    A segment matched by the compiled `glosses` pattern is returned whole as
    a 1-tuple; any other segment is cut into two halves at len(segment) // 2.
    The remaining parameters are accepted only to mirror the call signature
    and are ignored.
    """
    if glosses.match(segment):
        return (segment,)
    midpoint = len(segment) // 2
    return (segment[:midpoint], segment[midpoint:])


class TestBPESegmentMethod(unittest.TestCase):
    """Checks BPE.segment() with apply_bpe.encode replaced by encode_mock."""

    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    @mock.patch('apply_bpe.encode', side_effect=encode_mock)
    def _run_test_case(self, test_case, encode_function):
        """Assert that segment() maps test_case[0] to test_case[1].

        `encode_function` is the mock injected by the patch decorator; it is
        not used directly.
        """
        orig, expected = test_case
        out = self.bpe.segment(orig)

        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeword likeManuelword'
        exp = 'wo@@ rd@@ like@@ wo@@ rd like@@ Manuel@@ wo@@ rd'
        self._run_test_case((orig, exp))


if __name__ == '__main__':
    unittest.main()