Spaces: Runtime error
harveen committed · 74fc30d · Parent(s): e96a01b
Harveen | Adding code
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
- IndicTrans_training.ipynb +752 -0
- LICENSE +21 -0
- api.py +86 -0
- app.py +36 -0
- apply_bpe_traindevtest_notag.sh +41 -0
- apply_single_bpe_traindevtest_notag.sh +40 -0
- binarize_training_exp.sh +24 -0
- compute_bleu.sh +28 -0
- indicTrans_Finetuning.ipynb +0 -0
- indicTrans_python_interface.ipynb +462 -0
- indic_nlp_library/LICENSE +9 -0
- indic_nlp_library/README.md +142 -0
- indic_nlp_library/contrib/README.md +7 -0
- indic_nlp_library/contrib/correct_moses_tokenizer.py +29 -0
- indic_nlp_library/contrib/hindi_to_kannada_transliterator.py +62 -0
- indic_nlp_library/contrib/indic_scraper_project_sample.ipynb +569 -0
- indic_nlp_library/docs/Makefile +153 -0
- indic_nlp_library/docs/cmd.rst +8 -0
- indic_nlp_library/docs/code.rst +5 -0
- indic_nlp_library/docs/conf.py +242 -0
- indic_nlp_library/docs/index.rst +22 -0
- indic_nlp_library/docs/indicnlp.MD +122 -0
- indic_nlp_library/docs/indicnlp.cli.rst +11 -0
- indic_nlp_library/docs/indicnlp.morph.rst +11 -0
- indic_nlp_library/docs/indicnlp.normalize.rst +15 -0
- indic_nlp_library/docs/indicnlp.pdf +0 -0
- indic_nlp_library/docs/indicnlp.rst +47 -0
- indic_nlp_library/docs/indicnlp.script.rst +26 -0
- indic_nlp_library/docs/indicnlp.syllable.rst +11 -0
- indic_nlp_library/docs/indicnlp.tokenize.rst +26 -0
- indic_nlp_library/docs/indicnlp.transliterate.rst +34 -0
- indic_nlp_library/docs/make.bat +35 -0
- indic_nlp_library/docs/modules.rst +7 -0
- indic_nlp_library/indicnlp/__init__.py +10 -0
- indic_nlp_library/indicnlp/cli/__init__.py +0 -0
- indic_nlp_library/indicnlp/cli/cliparser.py +266 -0
- indic_nlp_library/indicnlp/common.py +58 -0
- indic_nlp_library/indicnlp/langinfo.py +488 -0
- indic_nlp_library/indicnlp/loader.py +35 -0
- indic_nlp_library/indicnlp/morph/__init__.py +0 -0
- indic_nlp_library/indicnlp/morph/unsupervised_morph.py +142 -0
- indic_nlp_library/indicnlp/normalize/__init__.py +0 -0
- indic_nlp_library/indicnlp/normalize/indic_normalize.py +984 -0
- indic_nlp_library/indicnlp/script/__init__.py +0 -0
- indic_nlp_library/indicnlp/script/english_script.py +154 -0
- indic_nlp_library/indicnlp/script/indic_scripts.py +301 -0
- indic_nlp_library/indicnlp/script/phonetic_sim.py +59 -0
- indic_nlp_library/indicnlp/syllable/__init__.py +0 -0
- indic_nlp_library/indicnlp/syllable/syllabifier.py +302 -0
- indic_nlp_library/indicnlp/test/__init__.py +0 -0
IndicTrans_training.ipynb
ADDED
@@ -0,0 +1,752 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "view-in-github"
+   },
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/gowtham1997/indicTrans-1/blob/main/IndicTrans_training.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "FdyHSnoj7Iun",
+    "outputId": "d0624c60-68c4-470f-9ade-c517e3296044"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/content/training\n"
+     ]
+    }
+   ],
+   "source": [
+    "# create a separate folder to store everything\n",
+    "!mkdir training\n",
+    "%cd training"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "y55OfxBz8QeP",
+    "outputId": "6d0ab016-0f96-4671-ddee-f06b50506dcd"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cloning into 'indicTrans'...\n",
+      "remote: Enumerating objects: 432, done.\u001b[K\n",
+      "remote: Counting objects: 100% (139/139), done.\u001b[K\n",
+      "remote: Compressing objects: 100% (34/34), done.\u001b[K\n",
+      "remote: Total 432 (delta 122), reused 105 (delta 105), pack-reused 293\u001b[K\n",
+      "Receiving objects: 100% (432/432), 1.43 MiB | 14.11 MiB/s, done.\n",
+      "Resolving deltas: 100% (248/248), done.\n",
+      "/content/training/indicTrans\n",
+      "Cloning into 'indic_nlp_library'...\n",
+      "remote: Enumerating objects: 1325, done.\u001b[K\n",
+      "remote: Counting objects: 100% (147/147), done.\u001b[K\n",
+      "remote: Compressing objects: 100% (103/103), done.\u001b[K\n",
+      "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n",
+      "Receiving objects: 100% (1325/1325), 9.57 MiB | 10.51 MiB/s, done.\n",
+      "Resolving deltas: 100% (688/688), done.\n",
+      "Cloning into 'indic_nlp_resources'...\n",
+      "remote: Enumerating objects: 133, done.\u001b[K\n",
+      "remote: Counting objects: 100% (7/7), done.\u001b[K\n",
+      "remote: Compressing objects: 100% (7/7), done.\u001b[K\n",
+      "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n",
+      "Receiving objects: 100% (133/133), 149.77 MiB | 34.05 MiB/s, done.\n",
+      "Resolving deltas: 100% (51/51), done.\n",
+      "Checking out files: 100% (28/28), done.\n",
+      "Cloning into 'subword-nmt'...\n",
+      "remote: Enumerating objects: 580, done.\u001b[K\n",
+      "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
+      "remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
+      "remote: Total 580 (delta 0), reused 1 (delta 0), pack-reused 576\u001b[K\n",
+      "Receiving objects: 100% (580/580), 237.41 KiB | 5.28 MiB/s, done.\n",
+      "Resolving deltas: 100% (349/349), done.\n",
+      "/content/training\n"
+     ]
+    }
+   ],
+   "source": [
+    "# clone the repo for running finetuning\n",
+    "!git clone https://github.com/AI4Bharat/indicTrans.git\n",
+    "%cd indicTrans\n",
+    "# clone requirements repositories\n",
+    "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n",
+    "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n",
+    "!git clone https://github.com/rsennrich/subword-nmt.git\n",
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "ziWWl-1a8SMw",
+    "outputId": "d7908a62-9573-4693-e7cb-44aeeebaaa15"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reading package lists... Done\n",
+      "Setting up tree (1.7.0-5) ...\n",
+      "Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.3\n",
+      "Cloning into 'fairseq'...\n",
+      "Obtaining file:///content/training/fairseq\n",
+      "Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n",
+      "/content/training\n"
+     ]
+    }
+   ],
+   "source": [
+    "! sudo apt install tree\n",
+    "\n",
+    "# Install the necessary libraries\n",
+    "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n",
+    "# Install fairseq from source\n",
+    "!git clone https://github.com/pytorch/fairseq.git\n",
+    "%cd fairseq\n",
+    "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n",
+    "!pip install --editable ./\n",
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "tmfGYkd58UiO",
+    "outputId": "3b83bcf6-bbbf-4e49-c2bb-7d0fb999297d"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "^C\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "--2021-12-18 21:31:57--  https://storage.googleapis.com/samanantar-public/benchmarks.zip\n",
+      "Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.160.144, 216.58.196.176, 142.250.71.16, ...\n",
+      "Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.160.144|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 7301872 (7.0M) [application/zip]\n",
+      "Saving to: 'benchmarks.zip'\n",
+      "\n",
+      "2021-12-18 21:32:01 (3.64 MB/s) - 'benchmarks.zip' saved [7301872/7301872]\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Archive: samanatar-en-indic-v0.2.zip\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  End-of-central-directory signature not found. Either this file is not\n",
+      "  a zipfile, or it constitutes one disk of a multi-part archive. In the\n",
+      "  latter case the central directory and zipfile comment will be found on\n",
+      "  the last disk(s) of this archive.\n",
+      "unzip: cannot find zipfile directory in one of samanatar-en-indic-v0.2.zip or\n",
+      "       samanatar-en-indic-v0.2.zip.zip, and cannot find samanatar-en-indic-v0.2.zip.ZIP, period.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Archive: benchmarks.zip\n",
+      "   creating: benchmarks/\n",
+      "   creating: benchmarks/pmi/\n",
+      "   creating: benchmarks/wat2021-devtest/\n",
+      "   creating: benchmarks/wat2020-devtest/\n",
+      "   creating: benchmarks/ufal-ta/\n",
+      "   creating: benchmarks/wmt-news/\n"
+     ]
+    }
+   ],
+   "source": [
+    "## for the latest samanantar dataset v0.3 -> please use this link: https://storage.googleapis.com/samanantar-public/V0.3/source_wise_splits.zip\n",
+    "# This v0.3 dataset has source wise splits to indicate where the data has been collected from\n",
+    "# For preprocessing simplicity we will use v0.2 (which just uses raw text files without source information) in this tutorial\n",
+    "# \n",
+    "# \n",
+    "# let's now download the indictrans data v0.2 dataset\n",
+    "! wget https://storage.googleapis.com/samanantar-public/V0.2/data/en2indic/samanatar-en-indic-v0.2.zip\n",
+    "\n",
+    "\n",
+    "\n",
+    "# let's also download the benchmarks for dev and test set\n",
+    "\n",
+    "! wget https://storage.googleapis.com/samanantar-public/benchmarks.zip\n",
+    "\n",
+    "# training data is organized as en-X folders where each folder contains two text files containing parallel data for en-X lang pair.\n",
+    "\n",
+    "# final_data\n",
+    "# ├── en-as\n",
+    "# │   ├── train.as\n",
+    "# │   └── train.en\n",
+    "# ├── en-bn\n",
+    "# │   ├── train.bn\n",
+    "# │   └── train.en\n",
+    "# ├── en-gu\n",
+    "# │   ├── train.en\n",
+    "# │   └── train.gu\n",
+    "# ├── en-hi\n",
+    "# │   ├── train.en\n",
+    "# │   └── train.hi\n",
+    "# ├── en-kn\n",
+    "# │   ├── train.en\n",
+    "# │   └── train.kn\n",
+    "# ├── en-ml\n",
+    "# │   ├── train.en\n",
+    "# │   └── train.ml\n",
+    "# ├── en-mr\n",
+    "# │   ├── train.en\n",
+    "# │   └── train.mr\n",
+    "# ├── en-or\n",
+    "# │   ├── train.en\n",
+    "# │   └── train.or\n",
+    "# ├── en-pa\n",
+    "# │   ├── train.en\n",
+    "# │   └── train.pa\n",
+    "# ├── en-ta\n",
+    "# │   ├── train.en\n",
+    "# │   └── train.ta\n",
+    "# └── en-te\n",
+    "#     ├── train.en\n",
+    "#     └── train.te\n",
+    "\n",
+    "\n",
+    "! unzip samanatar-en-indic-v0.2.zip\n",
+    "\n",
+    "# benchmarks folder consists of all the benchmarks we report in the paper - pmi, ufal-ta, wat2020, wat2021, wmt-news\n",
+    "\n",
+    "! unzip benchmarks.zip"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "MR_2GQoa84Jn"
+   },
+   "outputs": [],
+   "source": [
+    "# create an experiment dir to store train data, devtest data.\n",
+    "# This folder will also store vocabulary files (created with subword_nmt for bpe), fairseq bin files (for training), model checkpoints.\n",
+    "\n",
+    "# for this example we will be training indic to en translation model. We will name our exp_dir as indic-en-exp\n",
+    "! mkdir indic-en-exp\n",
+    "# copying all the train folders to exp_dir\n",
+    "! cp -r final_data/* indic-en-exp\n",
+    "\n",
+    "! mkdir -p indic-en-exp/devtest\n",
+    "\n",
+    "# copying all benchmarks to devtest folder in exp_dir\n",
+    "! cp -r benchmarks/* indic-en-exp/devtest\n",
+    "\n",
+    "# folder to store combined devtest data (based on the domains you want to test, you can combine multiple benchmarks dev datasets, remove duplicates)\n",
+    "! mkdir -p indic-en-exp/devtest/all\n",
+    "\n",
+    "# in this tutorial, for simplicity, we will just use wat2020 devtest for dev and test set\n",
+    "! cp -r indic-en-exp/devtest/wat2020-devtest/* indic-en-exp/devtest/all\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "lorcT8wkFPtQ"
+   },
+   "outputs": [],
+   "source": [
+    "% cd indicTrans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "vhvYXUc1FaVn"
+   },
+   "outputs": [],
+   "source": [
+    "# prepare_data_joint_training.sh takes experiment dir, src_lang, tgt_lang as input\n",
+    "# This does preprocessing, building vocab, binarization for joint training\n",
+    "\n",
+    "# The learning and applying vocabulary will take a while if the dataset is huge. To make it faster, run it on a multicore system\n",
+    "\n",
+    "! bash prepare_data_joint_training.sh '../indic-en-exp' 'indic' 'en'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "p1i3fRQzF2-x"
+   },
+   "outputs": [],
+   "source": [
+    "# Training the model\n",
+    "\n",
+    "# please refer to the fairseq documentation to know more about each of these options (https://fairseq.readthedocs.io/en/latest/command_line_tools.html)\n",
+    "\n",
+    "\n",
+    "# some notable args:\n",
+    "# --max-update -> maximum update steps the model will be trained for\n",
+    "# --arch=transformer_4x -> we use a custom transformer model and name it transformer_4x (4 times the parameter size of transformer base)\n",
+    "# --user-dir -> we define the custom transformer arch in the model_configs folder and pass it as an argument to user-dir for fairseq to register this architecture\n",
+    "# --lr -> learning rate. From our limited experiments, we find that lower learning rates like 3e-5 work best for finetuning.\n",
+    "# --max-tokens -> this is max tokens per batch. You should limit to lower values if you get oom errors.\n",
+    "# --update-freq -> gradient accumulation steps\n",
+    "\n",
+    "\n",
+    "!( fairseq-train ../indic-en-exp/final_bin \\\n",
+    "--max-source-positions=210 \\\n",
+    "--max-target-positions=210 \\\n",
+    "--max-update=<max_updates> \\\n",
+    "--save-interval=1 \\\n",
+    "--arch=transformer_4x \\\n",
+    "--criterion=label_smoothed_cross_entropy \\\n",
+    "--source-lang=SRC \\\n",
+    "--lr-scheduler=inverse_sqrt \\\n",
+    "--target-lang=TGT \\\n",
+    "--label-smoothing=0.1 \\\n",
+    "--optimizer adam \\\n",
+    "--adam-betas \"(0.9, 0.98)\" \\\n",
+    "--clip-norm 1.0 \\\n",
+    "--warmup-init-lr 1e-07 \\\n",
+    "--lr 0.0005 \\\n",
+    "--warmup-updates 4000 \\\n",
+    "--dropout 0.2 \\\n",
+    "--save-dir ../indic-en-exp/model \\\n",
+    "--keep-last-epochs 5 \\\n",
+    "--patience 5 \\\n",
+    "--skip-invalid-size-inputs-valid-test \\\n",
+    "--fp16 \\\n",
+    "--user-dir model_configs \\\n",
+    "--wandb-project <wandb_project_name> \\\n",
+    "--update-freq=<grad_accumulation_steps> \\\n",
+    "--distributed-world-size <num_gpus> \\\n",
+    "--max-tokens <max_tokens_in_a_batch> )"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "authorship_tag": "ABX9TyO6AA5gXphZ5kJ6h+dgeSqb",
+   "collapsed_sections": [],
+   "include_colab_link": true,
+   "name": "IndicTrans_training.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
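
Note: the fairseq-train call in the last cell above leaves several values as placeholders (<max_updates>, <wandb_project_name>, <grad_accumulation_steps>, <num_gpus>, <max_tokens_in_a_batch>). A minimal sketch of how those placeholders might be filled in and the same command launched from Python is shown below; the concrete numbers and the wandb project name are illustrative assumptions, not values taken from this commit.

    import subprocess

    # Illustrative values only -- tune these to your hardware and run (not part of the commit).
    max_updates = 1000000                      # maximum number of update steps
    wandb_project = "indictrans-finetuning"    # hypothetical wandb project name
    grad_accumulation_steps = 2                # --update-freq
    num_gpus = 1                               # --distributed-world-size
    max_tokens = 16384                         # tokens per batch; lower this on OOM errors

    cmd = [
        "fairseq-train", "../indic-en-exp/final_bin",
        "--max-source-positions=210", "--max-target-positions=210",
        f"--max-update={max_updates}",
        "--save-interval=1", "--arch=transformer_4x",
        "--criterion=label_smoothed_cross_entropy",
        "--source-lang=SRC", "--target-lang=TGT",
        "--lr-scheduler=inverse_sqrt", "--label-smoothing=0.1",
        "--optimizer", "adam", "--adam-betas", "(0.9, 0.98)",
        "--clip-norm", "1.0", "--warmup-init-lr", "1e-07",
        "--lr", "0.0005", "--warmup-updates", "4000", "--dropout", "0.2",
        "--save-dir", "../indic-en-exp/model",
        "--keep-last-epochs", "5", "--patience", "5",
        "--skip-invalid-size-inputs-valid-test", "--fp16",
        "--user-dir", "model_configs",
        "--wandb-project", wandb_project,
        f"--update-freq={grad_accumulation_steps}",
        "--distributed-world-size", str(num_gpus),
        "--max-tokens", str(max_tokens),
    ]
    subprocess.run(cmd, check=True)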
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 Gowtham.R
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
api.py
ADDED
@@ -0,0 +1,86 @@
+import time
+
+from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
+from inference.engine import Model
+from flask import Flask, request
+from flask import jsonify
+from flask_cors import CORS, cross_origin
+import webvtt
+from io import StringIO
+
+
+app = Flask(__name__)
+cors = CORS(app)
+app.config['CORS_HEADERS'] = 'Content-Type'
+
+indic2en_model = Model(expdir='../models/v3/indic-en')
+en2indic_model = Model(expdir='../models/v3/en-indic')
+m2m_model = Model(expdir='../models/m2m')
+
+language_dict = {
+    'Assamese': 'as',
+    'Hindi' : 'hi',
+    'Marathi' : 'mr',
+    'Tamil' : 'ta',
+    'Bengali' : 'bn',
+    'Kannada' : 'kn',
+    'Oriya' : 'or',
+    'Telugu' : 'te',
+    'Gujarati' : 'gu',
+    'Malayalam' : 'ml',
+    'Punjabi' : 'pa',
+}
+
+def get_inference_params():
+    model_type = request.form['model_type']
+    source_language = request.form['source_language']
+    target_language = request.form['target_language']
+
+    if model_type == 'indic-en':
+        model = indic2en_model
+        source_lang = language_dict[source_language]
+        assert target_language == 'English'
+        target_lang = 'en'
+    elif model_type == 'en-indic':
+        model = en2indic_model
+        assert source_language == 'English'
+        source_lang = 'en'
+        target_lang = language_dict[target_language]
+    elif model_type == 'm2m':
+        model = m2m_model
+        source_lang = language_dict[source_language]
+        target_lang = language_dict[target_language]
+
+    return model, source_lang, target_lang
+
+@app.route('/', methods=['GET'])
+def main():
+    return "IndicTrans API"
+
+@app.route("/translate", methods=['POST'])
+@cross_origin()
+def infer_indic_en():
+    model, source_lang, target_lang = get_inference_params()
+    source_text = request.form['text']
+
+    start_time = time.time()
+    target_text = model.translate_paragraph(source_text, source_lang, target_lang)
+    end_time = time.time()
+    return {'text':target_text, 'duration':round(end_time-start_time, 2)}
+
+@app.route("/translate_vtt", methods=['POST'])
+@cross_origin()
+def infer_vtt_indic_en():
+    model, source_lang, target_lang = get_inference_params()
+    source_text = request.form['text']
+    captions = webvtt.read_buffer(StringIO(source_text))
+    source_sentences = [caption.text.replace('\r', '').replace('\n', ' ') for caption in captions]
+
+    start_time = time.time()
+    target_sentences = model.batch_translate(source_sentences, source_lang, target_lang)
+    end_time = time.time()
+
+    for i in range(len(target_sentences)):
+        captions[i].text = target_sentences[i]
+
+    return {'text': captions.content, 'duration':round(end_time-start_time, 2)}
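A quick way to exercise the `/translate` route defined above is a plain HTTP POST. This is a minimal sketch, not part of the commit: the host and port (Flask's default `localhost:5000`) and the sample sentence are assumptions, and it presumes the three model directories referenced in `api.py` are in place.

```python
import requests

# Hypothetical local endpoint; api.py itself does not pin a host or port.
resp = requests.post(
    "http://localhost:5000/translate",
    data={
        "model_type": "en-indic",        # 'indic-en', 'en-indic' or 'm2m', as in get_inference_params()
        "source_language": "English",
        "target_language": "Hindi",
        "text": "How are you today?",
    },
)
print(resp.json())  # api.py returns {'text': ..., 'duration': ...}
```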
app.py
ADDED
@@ -0,0 +1,36 @@
+import os
+#import gradio as gr
+
+os.system('wget -q https://storage.googleapis.com/vakyaansh-open-models/translation_models/en-indic.zip')
+os.system('unzip /home/user/app/en-indic.zip')
+os.system('pip uninstall -y numpy')
+os.system('pip install numpy')
+#os.system('pip uninstall -y numba')
+#os.system('pip install numba==0.53')
+
+from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
+import gradio as grd
+from inference.engine import Model
+indic2en_model = Model(expdir='en-indic')
+
+INDIC = {"Assamese": "as", "Bengali": "bn", "Gujarati": "gu", "Hindi": "hi","Kannada": "kn","Malayalam": "ml", "Marathi": "mr", "Odia": "or","Punjabi": "pa","Tamil": "ta", "Telugu" : "te"}
+
+
+def translate(text, lang):
+    return indic2en_model.translate_paragraph(text, 'en', INDIC[lang])
+
+
+
+languages = list(INDIC.keys())
+
+#print(translate('helo how are you'))
+ddwn = grd.inputs.Dropdown(languages, type="value", default="Hindi", label="Select Target Language")
+txt = grd.inputs.Textbox( lines=5, placeholder="Enter Text to translate", default="", label="Enter Text in English")
+txt_ouptut = grd.outputs.Textbox(type="auto", label="Translated text in Target Language")
+
+example=[['I want to translate this sentence in Hindi','Hindi'],
+         ['I am feeling very good today.', 'Bengali']]
+
+supp = ','.join(languages)
+iface = grd.Interface(fn=translate, inputs=[txt,ddwn] , outputs=txt_ouptut, title='Translation for 11 Indic Languages', description = 'This is a demo based on IndicTrans. Languages Supported: '+supp, article = 'Original repo [link](https://github.com/AI4Bharat/indicTrans) by AI4Bharat. <b>Note: This space can only perform translation from English to Indic languages. Support for other combinations will be provided soon.', examples=example)
+iface.launch(enable_queue=True)
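Because `translate` in `app.py` is an ordinary function wrapping `translate_paragraph`, it can be smoke-tested without launching the Gradio UI. A minimal sketch, assuming the `en-indic` checkpoint has been unzipped as the script does above; the example sentence is illustrative only.

```python
# Direct call to the helper defined in app.py, bypassing the Gradio interface.
# 'lang' must be one of the display names in the INDIC dict, e.g. "Hindi".
print(translate("The weather is pleasant today.", "Hindi"))
```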
apply_bpe_traindevtest_notag.sh
ADDED
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+expdir=$1  # EXPDIR
+
+SUBWORD_NMT_DIR="subword-nmt"
+
+data_dir="$expdir/data"
+mkdir -p $expdir/bpe
+
+for dset in `echo train dev test`
+do
+    echo $dset
+    in_dset_dir="$data_dir/$dset"
+    out_dset_dir="$expdir/bpe/$dset"
+    # out_dset_dir="$expdir/final/$dset"
+    echo "Apply joint vocab to SRC corpus"
+    # for very large datasets, use gnu-parallel to speed up applying bpe
+    # uncomment the below line if the apply bpe is slow
+
+    # parallel --pipe --keep-order \
+    python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+        -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
+        --vocabulary $expdir/vocab/vocab.SRC \
+        --vocabulary-threshold 5 \
+        --num-workers "-1" \
+        < $in_dset_dir.SRC \
+        > $out_dset_dir.SRC
+    echo "Apply joint vocab to TGT corpus"
+
+    # for very large datasets, use gnu-parallel to speed up applying bpe
+    # uncomment the below line if the apply bpe is slow
+
+    # parallel --pipe --keep-order \
+    python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+        -c $expdir/vocab/bpe_codes.32k.SRC_TGT \
+        --vocabulary $expdir/vocab/vocab.TGT \
+        --vocabulary-threshold 5 \
+        --num-workers "-1" \
+        < $in_dset_dir.TGT \
+        > $out_dset_dir.TGT
+done
apply_single_bpe_traindevtest_notag.sh
ADDED
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+expdir=$1  # EXPDIR
+
+SUBWORD_NMT_DIR="subword-nmt"
+
+data_dir="$expdir/data"
+mkdir -p $expdir/bpe
+
+for dset in `echo train dev test`
+do
+    echo $dset
+    in_dset_dir="$data_dir/$dset"
+    out_dset_dir="$expdir/bpe/$dset"
+    # out_dset_dir="$expdir/final/$dset"
+    echo "Apply to SRC corpus"
+    # for very large datasets, use gnu-parallel to speed up applying bpe
+    # uncomment the below line if the apply bpe is slow
+
+    # parallel --pipe --keep-order \
+    python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+        -c $expdir/vocab/bpe_codes.32k.SRC \
+        --vocabulary $expdir/vocab/vocab.SRC \
+        --vocabulary-threshold 5 \
+        --num-workers "-1" \
+        < $in_dset_dir.SRC \
+        > $out_dset_dir.SRC
+    echo "Apply to TGT corpus"
+    # for very large datasets, use gnu-parallel to speed up applying bpe
+    # uncomment the below line if the apply bpe is slow
+
+    # parallel --pipe --keep-order \
+    python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
+        -c $expdir/vocab/bpe_codes.32k.TGT \
+        --vocabulary $expdir/vocab/vocab.TGT \
+        --vocabulary-threshold 5 \
+        --num-workers "-1" \
+        < $in_dset_dir.TGT \
+        > $out_dset_dir.TGT
+done
binarize_training_exp.sh
ADDED
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+exp_dir=$1
+src_lang=$2
+tgt_lang=$3
+
+# use cpu_count to get num_workers instead of setting it manually when running in different
+# instances
+num_workers=`python -c "import multiprocessing; print(multiprocessing.cpu_count())"`
+
+data_dir=$exp_dir/final
+out_data_dir=$exp_dir/final_bin
+
+rm -rf $out_data_dir
+
+fairseq-preprocess \
+    --source-lang $src_lang --target-lang $tgt_lang \
+    --trainpref $data_dir/train \
+    --validpref $data_dir/dev \
+    --testpref $data_dir/test \
+    --destdir $out_data_dir \
+    --workers $num_workers \
+    --thresholdtgt 5 \
+    --thresholdsrc 5
compute_bleu.sh
ADDED
@@ -0,0 +1,28 @@
+pred_fname=$1
+ref_fname=$2
+src_lang=$3
+tgt_lang=$4
+
+# we compute and report tokenized bleu scores.
+# For computing BLEU scores, systems should output detokenized outputs. Your MT system might be doing it out of the box if you are using SentencePiece - nothing to do in that case.
+# If you are using BPE then:
+# 1. For English, you can use MosesDetokenizer (either the scripts in moses or the sacremoses python package)
+# 2. For Indian languages, you can use the IndicNLP library detokenizer (note: please don't skip this step, since detok/tokenizer are not guaranteed to be reversible).
+# ^ both 1. and 2. are scripts/postprocess_translate.py
+
+
+# For computing BLEU, we use sacrebleu:
+# For English output: sacrebleu reffile < outputfile. This internally tokenizes using mteval-v13a
+# For Indian language output, we need tokenized output and reference since we don't know how well the sacrebleu tokenizer works for Indic input.
+# Hence we tokenize both preds and target files with IndicNLP tokenizer and then run: sacrebleu --tokenize none reffile < outputfile
+if [ $tgt_lang == 'en' ]; then
+    # indic to en models
+    sacrebleu $ref_fname < $pred_fname
+else
+    # indicnlp tokenize predictions and reference files before evaluation
+    input_size=`python scripts/preprocess_translate.py $ref_fname $ref_fname.tok $tgt_lang`
+    input_size=`python scripts/preprocess_translate.py $pred_fname $pred_fname.tok $tgt_lang`
+
+    # since we are tokenizing with indicnlp separately, we are setting tokenize to none here
+    sacrebleu --tokenize none $ref_fname.tok < $pred_fname.tok
+fi
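The same evaluation can be reproduced from Python via sacrebleu's API, which may be more convenient inside a notebook. A minimal sketch, assuming hypotheses and references have already been detokenized/tokenized as the comments above describe; the sample segments are placeholders.

```python
import sacrebleu

# Pre-tokenized hypothesis/reference segments (for Indic targets these would come
# from scripts/preprocess_translate.py, i.e. the *.tok files used in the script above).
hyps = ["this is a sample translation output"]
refs = ["this is a sample reference translation"]

# Mirrors `sacrebleu --tokenize none ref.tok < pred.tok` from compute_bleu.sh.
bleu = sacrebleu.corpus_bleu(hyps, [refs], tokenize="none")
print(bleu.score)
```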
indicTrans_Finetuning.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
indicTrans_python_interface.ipynb
ADDED
@@ -0,0 +1,462 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "view-in-github"
+   },
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/gowtham1997/indicTrans-1/blob/main/indicTrans_python_interface.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "CjfzxXZLHed_",
+    "outputId": "69a66b95-41b2-4413-82d1-0caacbddb3f3"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cloning into 'indicTrans-1'...\n",
+      "remote: Enumerating objects: 486, done.\u001b[K\n",
+      "remote: Counting objects: 100% (189/189), done.\u001b[K\n",
+      "remote: Compressing objects: 100% (67/67), done.\u001b[K\n",
+      "remote: Total 486 (delta 154), reused 134 (delta 121), pack-reused 297\u001b[K\n",
+      "Receiving objects: 100% (486/486), 1.48 MiB | 17.61 MiB/s, done.\n",
+      "Resolving deltas: 100% (281/281), done.\n",
+      "/content/indicTrans\n",
+      "Cloning into 'indic_nlp_library'...\n",
+      "remote: Enumerating objects: 1325, done.\u001b[K\n",
+      "remote: Counting objects: 100% (147/147), done.\u001b[K\n",
+      "remote: Compressing objects: 100% (103/103), done.\u001b[K\n",
+      "remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178\u001b[K\n",
+      "Receiving objects: 100% (1325/1325), 9.57 MiB | 13.55 MiB/s, done.\n",
+      "Resolving deltas: 100% (688/688), done.\n",
+      "Cloning into 'indic_nlp_resources'...\n",
+      "remote: Enumerating objects: 133, done.\u001b[K\n",
+      "remote: Counting objects: 100% (7/7), done.\u001b[K\n",
+      "remote: Compressing objects: 100% (7/7), done.\u001b[K\n",
+      "remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126\u001b[K\n",
+      "Receiving objects: 100% (133/133), 149.77 MiB | 33.48 MiB/s, done.\n",
+      "Resolving deltas: 100% (51/51), done.\n",
+      "Checking out files: 100% (28/28), done.\n",
+      "Cloning into 'subword-nmt'...\n",
+      "remote: Enumerating objects: 580, done.\u001b[K\n",
+      "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
+      "remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
+      "remote: Total 580 (delta 0), reused 1 (delta 0), pack-reused 576\u001b[K\n",
+      "Receiving objects: 100% (580/580), 237.41 KiB | 18.26 MiB/s, done.\n",
+      "Resolving deltas: 100% (349/349), done.\n",
+      "/content\n"
+     ]
+    }
+   ],
+   "source": [
+    "# clone the repo for running evaluation\n",
+    "!git clone https://github.com/AI4Bharat/indicTrans.git\n",
+    "%cd indicTrans\n",
+    "# clone requirements repositories\n",
+    "!git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git\n",
+    "!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git\n",
+    "!git clone https://github.com/rsennrich/subword-nmt.git\n",
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "IeYW2BJhlJvx",
+    "outputId": "3357bc85-44d8-43b0-8c64-eef9f18be716"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
88 |
+
"Collecting sacremoses\n",
|
89 |
+
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)\n",
|
90 |
+
"\r\u001b[K |▍ | 10kB 14.0MB/s eta 0:00:01\r\u001b[K |▊ | 20kB 18.8MB/s eta 0:00:01\r\u001b[K |█ | 30kB 22.5MB/s eta 0:00:01\r\u001b[K |█▌ | 40kB 25.7MB/s eta 0:00:01\r\u001b[K |█▉ | 51kB 27.6MB/s eta 0:00:01\r\u001b[K |██▏ | 61kB 29.2MB/s eta 0:00:01\r\u001b[K |██▋ | 71kB 27.3MB/s eta 0:00:01\r\u001b[K |███ | 81kB 27.7MB/s eta 0:00:01\r\u001b[K |███▎ | 92kB 28.8MB/s eta 0:00:01\r\u001b[K |███▋ | 102kB 29.9MB/s eta 0:00:01\r\u001b[K |████ | 112kB 29.9MB/s eta 0:00:01\r\u001b[K |████▍ | 122kB 29.9MB/s eta 0:00:01\r\u001b[K |████▊ | 133kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▏ | 143kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▌ | 153kB 29.9MB/s eta 0:00:01\r\u001b[K |█████▉ | 163kB 29.9MB/s eta 0:00:01\r\u001b[K |██████▎ | 174kB 29.9MB/s eta 0:00:01\r\u001b[K |██████▋ | 184kB 29.9MB/s eta 0:00:01\r\u001b[K |███████ | 194kB 29.9MB/s eta 0:00:01\r\u001b[K |███████▎ | 204kB 29.9MB/s eta 0:00:01\r\u001b[K |███████▊ | 215kB 29.9MB/s eta 0:00:01\r\u001b[K |████████ | 225kB 29.9MB/s eta 0:00:01\r\u001b[K |████████▍ | 235kB 29.9MB/s eta 0:00:01\r\u001b[K |████████▉ | 245kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▏ | 256kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▌ | 266kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████▉ | 276kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████▎ | 286kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████▋ | 296kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████ | 307kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████▍ | 317kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████▊ | 327kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████ | 337kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████▌ | 348kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████▉ | 358kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████▏ | 368kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████▌ | 378kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████ | 389kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████▎ | 399kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 409kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████ | 419kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████▍ | 430kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████▊ | 440kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████ | 450kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████▌ | 460kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████▉ | 471kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████▏ | 481kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████▋ | 491kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████ | 501kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████▎ | 512kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████▊ | 522kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████ | 532kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████▍ | 542kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████▊ | 552kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▏ | 563kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▌ | 573kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████▉ | 583kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████▎ | 593kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████▋ | 604kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 614kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████▎ | 624kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████▊ | 634kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████ | 645kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▍ | 655kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▉ | 665kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████▏ | 675kB 29.9MB/s eta 0:00:01\r\u001b[K 
|████████████████████████▌ | 686kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 696kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▎ | 706kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▋ | 716kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████ | 727kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████▍ | 737kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████▊ | 747kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████ | 757kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████▌ | 768kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████▉ | 778kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████▏ | 788kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████▌ | 798kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████ | 808kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 819kB 29.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▋ | 829kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████ | 839kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▍ | 849kB 29.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▊ | 860kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▏| 870kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▌| 880kB 29.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▉| 890kB 29.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 901kB 29.9MB/s \n",
|
91 |
+
"\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (1.1.5)\n",
|
92 |
+
"Collecting mock\n",
|
93 |
+
" Downloading https://files.pythonhosted.org/packages/5c/03/b7e605db4a57c0f6fba744b11ef3ddf4ddebcada35022927a2b5fc623fdf/mock-4.0.3-py3-none-any.whl\n",
|
94 |
+
"Collecting sacrebleu\n",
|
95 |
+
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)\n",
|
96 |
+
"\u001b[K |████████████████████████████████| 61kB 7.5MB/s \n",
|
97 |
+
"\u001b[?25hCollecting tensorboardX\n",
|
98 |
+
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/42/36/2b147652c40c3a858efa0afbf7b8236fae968e88ff530511a4cfa299a506/tensorboardX-2.3-py2.py3-none-any.whl (124kB)\n",
|
99 |
+
"\u001b[K |████████████████████████████████| 133kB 47.5MB/s \n",
|
100 |
+
"\u001b[?25hRequirement already satisfied: pyarrow in /usr/local/lib/python3.7/dist-packages (3.0.0)\n",
|
101 |
+
"Collecting indic-nlp-library\n",
|
102 |
+
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/84/d4/495bb43b88a2a6d04b09c29fc5115f24872af74cd8317fe84026abd4ddb1/indic_nlp_library-0.81-py3-none-any.whl (40kB)\n",
|
103 |
+
"\u001b[K |████████████████████████████████| 40kB 5.2MB/s \n",
|
104 |
+
"\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from sacremoses) (4.41.1)\n",
|
105 |
+
"Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.15.0)\n",
|
106 |
+
"Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses) (1.0.1)\n",
|
107 |
+
"Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from sacremoses) (2019.12.20)\n",
|
108 |
+
"Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses) (7.1.2)\n",
|
109 |
+
"Requirement already satisfied: numpy>=1.15.4 in /usr/local/lib/python3.7/dist-packages (from pandas) (1.19.5)\n",
|
110 |
+
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas) (2.8.1)\n",
|
111 |
+
"Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas) (2018.9)\n",
|
112 |
+
"Collecting portalocker==2.0.0\n",
|
113 |
+
" Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl\n",
|
114 |
+
"Requirement already satisfied: protobuf>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorboardX) (3.12.4)\n",
|
115 |
+
"Collecting sphinx-rtd-theme\n",
|
116 |
+
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/ac/24/2475e8f83519b54b2148d4a56eb1111f9cec630d088c3ffc214492c12107/sphinx_rtd_theme-0.5.2-py2.py3-none-any.whl (9.1MB)\n",
|
117 |
+
"\u001b[K |████████████████████████████████| 9.2MB 42.0MB/s \n",
|
118 |
+
"\u001b[?25hCollecting morfessor\n",
|
119 |
+
" Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl\n",
|
120 |
+
"Collecting sphinx-argparse\n",
|
121 |
+
" Downloading https://files.pythonhosted.org/packages/06/2b/dfad6a1831c3aeeae25d8d3d417224684befbf45e10c7f2141631616a6ed/sphinx-argparse-0.2.5.tar.gz\n",
|
122 |
+
"Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.8.0->tensorboardX) (57.0.0)\n",
|
123 |
+
"Requirement already satisfied: sphinx in /usr/local/lib/python3.7/dist-packages (from sphinx-rtd-theme->indic-nlp-library) (1.8.5)\n",
|
124 |
+
"Collecting docutils<0.17\n",
|
125 |
+
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl (548kB)\n",
|
126 |
+
"\u001b[K |████████████████████████████████| 552kB 31.5MB/s \n",
|
127 |
+
"\u001b[?25hRequirement already satisfied: sphinxcontrib-websupport in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.4)\n",
|
128 |
+
"Requirement already satisfied: snowballstemmer>=1.1 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.1.0)\n",
|
129 |
+
"Requirement already satisfied: Jinja2>=2.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.11.3)\n",
|
130 |
+
"Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (20.9)\n",
|
131 |
+
"Requirement already satisfied: alabaster<0.8,>=0.7 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (0.7.12)\n",
|
132 |
+
"Requirement already satisfied: imagesize in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (1.2.0)\n",
|
133 |
+
"Requirement already satisfied: Pygments>=2.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.6.1)\n",
|
134 |
+
"Requirement already satisfied: requests>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.23.0)\n",
|
135 |
+
"Requirement already satisfied: babel!=2.0,>=1.3 in /usr/local/lib/python3.7/dist-packages (from sphinx->sphinx-rtd-theme->indic-nlp-library) (2.9.1)\n",
|
136 |
+
"Requirement already satisfied: sphinxcontrib-serializinghtml in /usr/local/lib/python3.7/dist-packages (from sphinxcontrib-websupport->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.1.5)\n",
|
137 |
+
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2>=2.3->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.0.1)\n",
|
138 |
+
"Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.4.7)\n",
|
139 |
+
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2.10)\n",
|
140 |
+
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (3.0.4)\n",
|
141 |
+
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (1.24.3)\n",
|
142 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.0.0->sphinx->sphinx-rtd-theme->indic-nlp-library) (2021.5.30)\n",
|
143 |
+
"Building wheels for collected packages: sphinx-argparse\n",
|
144 |
+
" Building wheel for sphinx-argparse (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
145 |
+
" Created wheel for sphinx-argparse: filename=sphinx_argparse-0.2.5-cp37-none-any.whl size=11552 sha256=d8cbdca000085e2e2c122c305bb21aa76a9600012ded8e06c300e03d1c4d1e32\n",
|
146 |
+
" Stored in directory: /root/.cache/pip/wheels/2a/18/1b/4990a1859da4edc77ab312bc2986c08d2733fb5713d06e44f5\n",
|
147 |
+
"Successfully built sphinx-argparse\n",
|
148 |
+
"\u001b[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.\u001b[0m\n",
|
149 |
+
"Installing collected packages: sacremoses, mock, portalocker, sacrebleu, tensorboardX, docutils, sphinx-rtd-theme, morfessor, sphinx-argparse, indic-nlp-library\n",
|
150 |
+
" Found existing installation: docutils 0.17.1\n",
|
151 |
+
" Uninstalling docutils-0.17.1:\n",
|
152 |
+
" Successfully uninstalled docutils-0.17.1\n",
|
153 |
+
"Successfully installed docutils-0.16 indic-nlp-library-0.81 mock-4.0.3 morfessor-2.0.6 portalocker-2.0.0 sacrebleu-1.5.1 sacremoses-0.0.45 sphinx-argparse-0.2.5 sphinx-rtd-theme-0.5.2 tensorboardX-2.3\n",
|
154 |
+
"Collecting mosestokenizer\n",
|
155 |
+
" Downloading https://files.pythonhosted.org/packages/4b/b3/c0af235b16c4f44a2828ef017f7947d1262b2646e440f85c6a2ff26a8c6f/mosestokenizer-1.1.0.tar.gz\n",
|
156 |
+
"Collecting subword-nmt\n",
|
157 |
+
" Downloading https://files.pythonhosted.org/packages/74/60/6600a7bc09e7ab38bc53a48a20d8cae49b837f93f5842a41fe513a694912/subword_nmt-0.3.7-py2.py3-none-any.whl\n",
|
158 |
+
"Requirement already satisfied: docopt in /usr/local/lib/python3.7/dist-packages (from mosestokenizer) (0.6.2)\n",
|
159 |
+
"Collecting openfile\n",
|
160 |
+
" Downloading https://files.pythonhosted.org/packages/93/e6/805db6867faacb488b44ba8e0829ef4de151dd0499f3c5da5f4ad11698a7/openfile-0.0.7-py3-none-any.whl\n",
|
161 |
+
"Collecting uctools\n",
|
162 |
+
" Downloading https://files.pythonhosted.org/packages/04/cb/70ed842d9a43460eedaa11f7503b4ab6537b43b63f0d854d59d8e150fac1/uctools-1.3.0.tar.gz\n",
|
163 |
+
"Collecting toolwrapper\n",
|
164 |
+
" Downloading https://files.pythonhosted.org/packages/41/7b/34bf8fb69426d8a18bcc61081e9d126f4fcd41c3c832072bef39af1602cd/toolwrapper-2.1.0.tar.gz\n",
|
165 |
+
"Building wheels for collected packages: mosestokenizer, uctools, toolwrapper\n",
|
166 |
+
" Building wheel for mosestokenizer (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
167 |
+
" Created wheel for mosestokenizer: filename=mosestokenizer-1.1.0-cp37-none-any.whl size=49120 sha256=4fc04046040e73bd5d13c606ebbfc65ac38c7d073f7fc0b0e4cc1d4215b595f3\n",
|
168 |
+
" Stored in directory: /root/.cache/pip/wheels/a2/e7/48/48d5e0f9c0cd5def2dfd7cb8543945f906448ed1313de24a29\n",
|
169 |
+
" Building wheel for uctools (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
170 |
+
" Created wheel for uctools: filename=uctools-1.3.0-cp37-none-any.whl size=6163 sha256=c5a865107c59f98c4da5d18ddc754fa141ab494574187281de1502561c6a004e\n",
|
171 |
+
" Stored in directory: /root/.cache/pip/wheels/06/b6/8f/935d5bf5bca85d47c6f5ec31641879bba057d336ab36b1e773\n",
|
172 |
+
" Building wheel for toolwrapper (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
173 |
+
" Created wheel for toolwrapper: filename=toolwrapper-2.1.0-cp37-none-any.whl size=3356 sha256=41a3e12078d5681e8467701735208d880ba588b0f5dbfb3b99c4e04bc643eccc\n",
|
174 |
+
" Stored in directory: /root/.cache/pip/wheels/84/ea/29/e02f3b855bf19344972092873a1091b329309bbc3d3d0cbaef\n",
|
175 |
+
"Successfully built mosestokenizer uctools toolwrapper\n",
|
176 |
+
"Installing collected packages: openfile, uctools, toolwrapper, mosestokenizer, subword-nmt\n",
|
177 |
+
"Successfully installed mosestokenizer-1.1.0 openfile-0.0.7 subword-nmt-0.3.7 toolwrapper-2.1.0 uctools-1.3.0\n",
|
178 |
+
"Cloning into 'fairseq'...\n",
|
179 |
+
"remote: Enumerating objects: 28410, done.\u001b[K\n",
|
180 |
+
"remote: Counting objects: 100% (229/229), done.\u001b[K\n",
|
181 |
+
"remote: Compressing objects: 100% (127/127), done.\u001b[K\n",
|
182 |
+
"remote: Total 28410 (delta 114), reused 187 (delta 99), pack-reused 28181\u001b[K\n",
|
183 |
+
"Receiving objects: 100% (28410/28410), 11.96 MiB | 24.16 MiB/s, done.\n",
|
184 |
+
"Resolving deltas: 100% (21310/21310), done.\n",
|
185 |
+
"/content/fairseq\n",
|
186 |
+
"Obtaining file:///content/fairseq\n",
|
187 |
+
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
|
188 |
+
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
|
189 |
+
" Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n",
|
190 |
+
" Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n",
|
191 |
+
"Requirement already satisfied: cffi in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.14.5)\n",
|
192 |
+
"Collecting hydra-core<1.1\n",
|
193 |
+
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)\n",
|
194 |
+
"\u001b[K |████████████████████████████████| 133kB 11.6MB/s \n",
|
195 |
+
"\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (4.41.1)\n",
|
196 |
+
"Collecting omegaconf<2.1\n",
|
197 |
+
" Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl\n",
|
198 |
+
"Requirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (2019.12.20)\n",
|
199 |
+
"Requirement already satisfied: numpy; python_version >= \"3.7\" in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.19.5)\n",
|
200 |
+
"Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.9.0+cu102)\n",
|
201 |
+
"Requirement already satisfied: sacrebleu>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (1.5.1)\n",
|
202 |
+
"Requirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from fairseq==1.0.0a0+f887152) (0.29.23)\n",
|
203 |
+
"Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi->fairseq==1.0.0a0+f887152) (2.20)\n",
|
204 |
+
"Collecting antlr4-python3-runtime==4.8\n",
|
205 |
+
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)\n",
|
206 |
+
"\u001b[K |████████████████████████████████| 112kB 33.5MB/s \n",
|
207 |
+
"\u001b[?25hRequirement already satisfied: importlib-resources; python_version < \"3.9\" in /usr/local/lib/python3.7/dist-packages (from hydra-core<1.1->fairseq==1.0.0a0+f887152) (5.1.4)\n",
|
208 |
+
"Collecting PyYAML>=5.1.*\n",
|
209 |
+
"\u001b[?25l Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)\n",
|
210 |
+
"\u001b[K |████████████████████████████████| 645kB 30.2MB/s \n",
|
211 |
+
"\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from omegaconf<2.1->fairseq==1.0.0a0+f887152) (3.7.4.3)\n",
|
212 |
+
"Requirement already satisfied: portalocker==2.0.0 in /usr/local/lib/python3.7/dist-packages (from sacrebleu>=1.4.12->fairseq==1.0.0a0+f887152) (2.0.0)\n",
|
213 |
+
"Requirement already satisfied: zipp>=3.1.0; python_version < \"3.10\" in /usr/local/lib/python3.7/dist-packages (from importlib-resources; python_version < \"3.9\"->hydra-core<1.1->fairseq==1.0.0a0+f887152) (3.4.1)\n",
|
214 |
+
"Building wheels for collected packages: antlr4-python3-runtime\n",
|
215 |
+
" Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
216 |
+
" Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-cp37-none-any.whl size=141231 sha256=69960f774a6fdb385fed1a63fb02ae50b57299408cfd6fb33be60d686be878b7\n",
|
217 |
+
" Stored in directory: /root/.cache/pip/wheels/e3/e2/fa/b78480b448b8579ddf393bebd3f47ee23aa84c89b6a78285c8\n",
|
218 |
+
"Successfully built antlr4-python3-runtime\n",
|
219 |
+
"Installing collected packages: antlr4-python3-runtime, PyYAML, omegaconf, hydra-core, fairseq\n",
|
220 |
+
" Found existing installation: PyYAML 3.13\n",
|
221 |
+
" Uninstalling PyYAML-3.13:\n",
|
222 |
+
" Successfully uninstalled PyYAML-3.13\n",
|
223 |
+
" Running setup.py develop for fairseq\n",
|
224 |
+
"Successfully installed PyYAML-5.4.1 antlr4-python3-runtime-4.8 fairseq hydra-core-1.0.6 omegaconf-2.0.6\n",
|
225 |
+
"/content\n"
|
+     ]
+    }
+   ],
+   "source": [
+    "# Install the necessary libraries\n",
+    "!pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow indic-nlp-library\n",
+    "! pip install mosestokenizer subword-nmt\n",
+    "# Install fairseq from source\n",
+    "!git clone https://github.com/pytorch/fairseq.git\n",
+    "%cd fairseq\n",
+    "# !git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d\n",
+    "!pip install --editable ./\n",
+    "\n",
+    "%cd .."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "TktUu9NW_PLq"
+   },
+   "outputs": [],
+   "source": [
+    "# this step is only required if you are running the code on colab\n",
+    "# restart the runtime after running prev cell (to update). See this -> https://stackoverflow.com/questions/57838013/modulenotfounderror-after-successful-pip-install-in-google-colaboratory\n",
+    "\n",
+    "# this import will not work without restarting runtime\n",
+    "from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "E_4JxNdRlPQB",
+    "outputId": "82ab5e2f-d560-4f4e-bf3f-f1ca0a8d31b8"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2021-06-27 12:43:16-- https://storage.googleapis.com/samanantar-public/V0.2/models/indic-en.zip\n",
+      "Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.13.240, 172.217.15.80, 142.251.33.208, ...\n",
+      "Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.13.240|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 4551079075 (4.2G) [application/zip]\n",
+      "Saving to: ‘indic-en.zip’\n",
+      "\n",
+      "indic-en.zip 100%[===================>] 4.24G 28.8MB/s in 83s \n",
+      "\n",
+      "2021-06-27 12:44:39 (52.1 MB/s) - ‘indic-en.zip’ saved [4551079075/4551079075]\n",
+      "\n",
+      "Archive: indic-en.zip\n",
+      " creating: indic-en/\n",
+      " creating: indic-en/vocab/\n",
+      " inflating: indic-en/vocab/bpe_codes.32k.SRC \n",
+      " inflating: indic-en/vocab/vocab.SRC \n",
+      " inflating: indic-en/vocab/vocab.TGT \n",
+      " inflating: indic-en/vocab/bpe_codes.32k.TGT \n",
+      " creating: indic-en/final_bin/\n",
+      " inflating: indic-en/final_bin/dict.TGT.txt \n",
+      " inflating: indic-en/final_bin/dict.SRC.txt \n",
+      " creating: indic-en/model/\n",
+      " inflating: indic-en/model/checkpoint_best.pt \n",
+      "/content/indicTrans\n"
+     ]
+    }
+   ],
+   "source": [
+    "# download the indictrans model\n",
+    "\n",
+    "\n",
+    "# downloading the indic-en model\n",
+    "!wget https://storage.googleapis.com/samanantar-public/V0.3/models/indic-en.zip\n",
+    "!unzip indic-en.zip\n",
+    "\n",
+    "# downloading the en-indic model\n",
+    "# !wget https://storage.googleapis.com/samanantar-public/V0.3/models/en-indic.zip\n",
+    "# !unzip en-indic.zip\n",
+    "\n",
+    "# # downloading the indic-indic model\n",
+    "# !wget https://storage.googleapis.com/samanantar-public/V0.3/models/m2m.zip\n",
+    "# !unzip m2m.zip\n",
+    "\n",
+    "%cd indicTrans"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "yTnWbHqY01-B",
+    "outputId": "0d075f51-097b-46ad-aade-407a4437aa62"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Initializing vocab and bpe\n",
+      "Initializing model for translation\n"
+     ]
+    }
+   ],
+   "source": [
+    "from indicTrans.inference.engine import Model\n",
+    "\n",
+    "indic2en_model = Model(expdir='../indic-en')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "QTp2NOgQ__sB",
+    "outputId": "e015a71e-8206-4e1d-cb3e-11ecb4d44f76"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 3/3 [00:00<00:00, 1225.21it/s]\n",
+      "/usr/local/lib/python3.7/dist-packages/torch/_tensor.py:575: UserWarning: floor_divide is deprecated, and will be removed in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values.\n",
+      "To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)\n",
+      " return torch.floor_divide(self, other)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "['He seems to know us.',\n",
+       " 'I couldnt find it anywhere.',\n",
+       " 'If someone in your neighbourhood develops these symptoms, staying at home can help prevent the spread of the coronavirus infection.']"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {
+      "tags": []
+     },
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ta_sents = ['அவனுக்கு நம்மைப் தெரியும் என்று தோன்றுகிறது',\n",
+    "            \"இது எங்கே இருக்கு என்று என்னால் கண்டுபிடிக்க முடியவில்லை.\",\n",
+    "            'உங்களுக்கு உங்கள் அருகில் இருக்கும் ஒருவருக்கோ இத்தகைய அறிகுறிகள் தென்பட்டால், வீட்டிலேயே இருப்பது, கொரோனா வைரஸ் தொற்று பிறருக்கு வராமல் தடுக்க உதவும்.']\n",
+    "\n",
+    "\n",
+    "indic2en_model.batch_translate(ta_sents, 'ta', 'en')\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 68
+    },
+    "id": "VFXrCNZGEN7Z",
+    "outputId": "f72aad17-1cc0-4774-a7ee-5b3a5d954de3"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 4/4 [00:00<00:00, 1496.76it/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.google.colaboratory.intrinsic+json": {
+       "type": "string"
+      },
+      "text/plain": [
+       "'The pandemic has resulted in worldwide social and economic disruption. The world is facing the worst recession since the global financial crisis. This led to the postponement or cancellation of sporting, religious, political and cultural events. Due to the fear, there was shortage of supply as more people purchased items like masks, sanitizers etc.'"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {
+      "tags": []
+     },
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "ta_paragraph = \"\"\"இத்தொற்றுநோய் உலகளாவிய சமூக மற்றும் பொருளாதார சீர்குலைவை ஏற்படுத்தியுள்ளது.இதனால் பெரும் பொருளாதார மந்தநிலைக்குப் பின்னர் உலகளவில் மிகப்பெரிய மந்தநிலை ஏற்பட்டுள்ளது. இது விளையாட்டு,மத, அரசியல் மற்றும் கலாச்சார நிகழ்வுகளை ஒத்திவைக்க அல்லது ரத்து செய்ய வழிவகுத்தது.\n",
+    "அச்சம் காரணமாக முகக்கவசம், கிருமிநாசினி உள்ளிட்ட பொருட்களை அதிக நபர்கள் வாங்கியதால் விநியோகப் பற்றாக்குறை ஏற்பட்டது.\"\"\"\n",
+    "\n",
+    "indic2en_model.translate_paragraph(ta_paragraph, 'ta', 'en')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Hi_D7s_VIjis"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "authorship_tag": "ABX9TyM3t8oQYMhBUuq4/Pyhcr0+",
+   "collapsed_sections": [],
+   "include_colab_link": true,
+   "name": "indicTrans_python_interface.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
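Outside Colab, the notebook's core steps reduce to a few lines of Python. A minimal sketch, assuming the indicTrans repository is on `sys.path` and the `indic-en` model directory has been unzipped next to it, exactly as in the cells above; paths are otherwise assumptions.

```python
from indicTrans.inference.engine import Model  # same import as in the notebook

# Load the Indic->English checkpoint unzipped from indic-en.zip.
indic2en_model = Model(expdir='../indic-en')

# Sentence-level and paragraph-level translation, as demonstrated above.
print(indic2en_model.batch_translate(['அவனுக்கு நம்மைப் தெரியும் என்று தோன்றுகிறது'], 'ta', 'en'))
print(indic2en_model.translate_paragraph('அவனுக்கு நம்மைப் தெரியும் என்று தோன்றுகிறது', 'ta', 'en'))
```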
indic_nlp_library/LICENSE
ADDED
@@ -0,0 +1,9 @@
+MIT License
+
+Copyright (c) 2013-present Anoop Kunchukuttan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
indic_nlp_library/README.md
ADDED
@@ -0,0 +1,142 @@
+# Indic NLP Library
+
+The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. Indian languages share a lot of similarity in terms of script, phonology, language syntax, etc. and this library is an attempt to provide a general solution to very commonly required toolsets for Indian language text.
+
+The library provides the following functionalities:
+
+- Text Normalization
+- Script Information
+- Word Tokenization and Detokenization
+- Sentence Splitting
+- Word Segmentation
+- Syllabification
+- Script Conversion
+- Romanization
+- Indicization
+- Transliteration
+- Translation
+
+The data resources required by the Indic NLP Library are hosted in a different repository. These resources are required for some modules. You can download from the [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) project.
+
+**If you are interested in Indian language NLP resources, you should check the [Indic NLP Catalog](https://github.com/indicnlpweb/indicnlp_catalog) for pointers.**
+
+## Pre-requisites
+
+- Python 3.x
+- (For Python 2.x version check the tag `PYTHON_2.7_FINAL_JAN_2019`. Not actively supporting Python 2.x anymore, but will try to maintain as much compatibility as possible)
+- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)
+- [Urduhack](https://github.com/urduhack/urduhack): Needed only if Urdu normalization is required. It has other dependencies like Tensorflow.
+- Other dependencies are listed in setup.py
+
+
+## Configuration
+
+- Installation from pip:
+
+    `pip install indic-nlp-library`
+
+- If you want to use the project from the github repo, add the project to the Python Path:
+
+    - Clone this repository
+    - Install dependencies: `pip install -r requirements.txt`
+    - Run: `export PYTHONPATH=$PYTHONPATH:<project base directory>`
+
+- In either case, export the path to the _Indic NLP Resources_ directory
+
+    Run: `export INDIC_RESOURCES_PATH=<path to Indic NLP resources>`
+
+## Usage
+
+You can use the Python API to access all the features of the library. Many of the most common operations are also accessible via a unified commandline API.
+
+### Getting Started
+
+Check [this IPython Notebook](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples.ipynb) for examples to use the Python API.
+- You can find the Python 2.x Notebook [here](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples_2_7.ipynb)
+
+### Documentation
+
+You can find detailed documentation [HERE](https://indic-nlp-library.readthedocs.io/en/latest)
+
+This documents the Python API as well as the commandline reference.
+
+## Citing
+
+If you use this library, please include the following citation:
+
+```
+@misc{kunchukuttan2020indicnlp,
+  author = "Anoop Kunchukuttan",
+  title = "{The IndicNLP Library}",
+  year = "2020",
+  howpublished={\url{https://github.com/anoopkunchukuttan/indic_nlp_library/blob/master/docs/indicnlp.pdf}}
+}
+```
+You can find the document [HERE](docs/indicnlp.pdf)
+
+## Website
+
+`http://anoopkunchukuttan.github.io/indic_nlp_library`
+
+## Author
+Anoop Kunchukuttan ([email protected])
+
+## Companies, Organizations, Projects using IndicNLP Library
+
+- [AI4Bharat-IndicNLPSuite](https://indicnlp.ai4bharat.org)
+- [The Classical Language Toolkit](http://cltk.org)
+- [Microsoft NLP Recipes](https://github.com/microsoft/nlp-recipes)
+- [Facebook M2M-100](https://github.com/pytorch/fairseq/tree/master/examples/m2m_100)
+
+## Revision Log
+
+
+0.81 : 26 May 2021
+
+ - Bug fix in version number extraction
+
+0.80 : 24 May 2021
+
+ - Improved sentence splitting
+ - Bug fixes
+ - Support for Urdu Normalizer
+
+0.71 : 03 Sep 2020
+
+ - Improved documentation
+ - Bug fixes
+
+0.7 : 02 Apr 2020:
+
+ - Unified commandline
+ - Improved documentation
+ - Added setup.py
+
+0.6 : 16 Dec 2019:
+
+ - New romanizer and indicizer
+ - Script Unifiers
+ - Improved script normalizers
+ - Added contrib directory for sample uses
+ - changed to MIT license
+
+0.5 : 03 Jun 2019:
+
+ - Improved word tokenizer to handle dates and numbers.
+ - Added sentence splitter that can handle common prefixes/honorofics and uses some heuristics.
+ - Added detokenizer
+ - Added acronym transliterator that can convert English acronyms to Brahmi-derived scripts
+
+0.4 : 28 Jan 2019: Ported to Python 3, and lots of feature additions since last release; primarily around script information, script similarity and syllabification.
+
+0.3 : 21 Oct 2014: Supports morph-analysis between Indian languages
+
+0.2 : 13 Jun 2014: Supports transliteration between Indian languages and tokenization of Indian languages
+
+0.1 : 12 Mar 2014: Initial version. Supports text normalization.
+
+## LICENSE
+
+Indic NLP Library is released under the MIT license
+
+
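The library's Python API follows the pattern used by the contrib scripts later in this commit. A minimal sketch, assuming a local checkout of `indic_nlp_resources` (the path below is an assumption) and combining normalization with script conversion; the Hindi sentence is illustrative only.

```python
from indicnlp import common, loader
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate

# Point the library at the separately downloaded indic_nlp_resources checkout
# (equivalent to exporting INDIC_RESOURCES_PATH as described above).
common.set_resources_path("indic_nlp_resources")
loader.load()

# Normalize a Hindi sentence, then convert its script to Kannada,
# mirroring contrib/hindi_to_kannada_transliterator.py below.
normalizer = indic_normalize.IndicNormalizerFactory().get_normalizer("hi")
text = normalizer.normalize("यह एक उदाहरण वाक्य है")
print(unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, "hi", "kn"))
```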
indic_nlp_library/contrib/README.md
ADDED
@@ -0,0 +1,7 @@
+# Contrib
+
+Contains additional utilities and applications using Indic NLP library core
+
+- `indic_scraper_project_sample.ipynb`: A simple pipeline for building monolingual corpora for Indian languages from crawled web content, Wikipedia, etc. An extensible framework which allows incorporation of website specific extractors, whereas generic NLP tasks like tokenization, sentence splitting, normalization, etc. are handled by the framework.
+- `correct_moses_tokenizer.py`: This script corrects the incorrect tokenization done by Moses tokenizer. The Moses tokenizer splits on nukta and halant characters.
+- `hindi_to_kannada_transliterator.py`: This script transliterates Hindi to Kannada. It removes/remaps characters only found in Hindi. It also adds halanta to words ending with consonant - as is the convention in Kannada.
indic_nlp_library/contrib/correct_moses_tokenizer.py
ADDED
@@ -0,0 +1,29 @@
+import sys
+from indicnlp import langinfo
+from indicnlp import loader
+
+if __name__ == '__main__':
+    """
+    This script corrects the incorrect tokenization done by Moses tokenizer.
+    The Moses tokenizer splits on nukta and halant characters
+    Usage: python correct_moses_tokenizer.py <infname> <outfname> <langcode>
+    """
+
+    loader.load()
+
+    infname=sys.argv[1]
+    outfname=sys.argv[2]
+    lang=sys.argv[3]
+
+    halant_char=langinfo.offset_to_char(langinfo.HALANTA_OFFSET,lang)
+    nukta_char=langinfo.offset_to_char(langinfo.NUKTA_OFFSET,lang)
+
+    with open(infname,'r',encoding='utf-8') as infile, \
+         open(outfname,'w',encoding='utf-8') as outfile:
+        for line in infile:
+            outfile.write(
+                line.replace(
+                    ' {} '.format(halant_char), halant_char).replace(
+                    ' {} '.format(nukta_char), nukta_char).replace(
+                    ' {}{}'.format(nukta_char,halant_char),'{}{}'.format(nukta_char,halant_char))
+            )
indic_nlp_library/contrib/hindi_to_kannada_transliterator.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
from indicnlp import common
|
3 |
+
common.set_resources_path(INDIC_NLP_RESOURCES)
|
4 |
+
|
5 |
+
from indicnlp import loader
|
6 |
+
from indicnlp.normalize import indic_normalize
|
7 |
+
from indicnlp.transliterate import unicode_transliterate
|
8 |
+
|
9 |
+
if __name__ == '__main__':
|
10 |
+
"""
|
11 |
+
This script transliterates Hindi to Kannada. It removes/remaps
|
12 |
+
characters only found in Hindi. It also adds halanta to words ending
|
13 |
+
with consonant - as is the convention in Kannada
|
14 |
+
"""
|
15 |
+
|
16 |
+
infname=sys.argv[1] # one sentence/word per line. Sentences should be space-tokenized
|
17 |
+
outfname=sys.argv[2]
|
18 |
+
loader.load()
|
19 |
+
|
20 |
+
normalizer_factory=indic_normalize.IndicNormalizerFactory()
|
21 |
+
normalizer=normalizer_factory.get_normalizer('hi')
|
22 |
+
|
23 |
+
with open(infname,'r',encoding='utf-8') as infile, \
|
24 |
+
open(outfname,'w',encoding='utf-8') as outfile:
|
25 |
+
for line in infile:
|
26 |
+
line=line.strip()
|
27 |
+
line=normalizer.normalize(line)
|
28 |
+
|
29 |
+
## replace chandrabindus with anusvara
|
30 |
+
line=line.replace('\u0900','\u0902')
|
31 |
+
line=line.replace('\u0901','\u0902')
|
32 |
+
|
33 |
+
### replace chandra e and o diacritics with e and o respectively
|
34 |
+
#line=line.replace('\u0945','\u0947')
|
35 |
+
#line=line.replace('\u0949','\u094b')
|
36 |
+
|
37 |
+
### replace chandra e and o diacritics with a diacritic
|
38 |
+
## this seems to be general usage
|
39 |
+
line=line.replace('\u0945','\u093e')
|
40 |
+
line=line.replace('\u0949','\u093e')
|
41 |
+
|
42 |
+
## remove nukta
|
43 |
+
line=line.replace('\u093c','')
|
44 |
+
|
45 |
+
## add halant if word ends with consonant
|
46 |
+
#if isc.is_consonant(isc.get_phonetic_feature_vector(line[-1],'hi')):
|
47 |
+
# line=line+'\u094d'
|
48 |
+
words=line.split(' ')
|
49 |
+
outwords=[]
|
50 |
+
for word in line.split(' '):
|
51 |
+
if word!='' and isc.is_consonant(isc.get_phonetic_feature_vector(word[-1],'hi')):
|
52 |
+
word=word+'\u094d'
|
53 |
+
outwords.append(word)
|
54 |
+
line=' '.join(outwords)
|
55 |
+
|
56 |
+
|
57 |
+
## script conversion
|
58 |
+
line=unicode_transliterate.UnicodeIndicTransliterator.transliterate(line,'hi','kn')
|
59 |
+
|
60 |
+
outfile.write(line+'\n')
|
61 |
+
|
62 |
+
|
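The script-conversion step at the end of the transliterator above can also be run in isolation. A minimal sketch, assuming the Indic NLP resources are configured (via `INDIC_RESOURCES_PATH` or `common.set_resources_path`) and using an illustrative input string:

```python
# Minimal sketch: Devanagari (hi) to Kannada (kn) script conversion, the same call the
# transliterator script makes after its normalization and character remapping steps.
# Assumes the indic_nlp_resources path has been configured before loader.load().
from indicnlp import loader
from indicnlp.transliterate import unicode_transliterate

loader.load()
text = 'नमस्ते'  # illustrative input
print(unicode_transliterate.UnicodeIndicTransliterator.transliterate(text, 'hi', 'kn'))
```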
indic_nlp_library/contrib/indic_scraper_project_sample.ipynb
ADDED
@@ -0,0 +1,569 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"# Pre-requisites\n",
|
8 |
+
"\n",
|
9 |
+
"- Python 3.5+\n",
|
10 |
+
"- Python packages: \n",
|
11 |
+
" - `pip install bs4 pandas mmh3`\n",
|
12 |
+
"- [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library)\n",
|
13 |
+
"- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)"
|
14 |
+
]
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"cell_type": "markdown",
|
18 |
+
"metadata": {},
|
19 |
+
"source": [
|
20 |
+
"# Initialize the Indic NLP Library\n",
|
21 |
+
"\n",
|
22 |
+
"Run the cell below to initialize the Indic NLP Library"
|
23 |
+
]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"cell_type": "code",
|
27 |
+
"execution_count": null,
|
28 |
+
"metadata": {},
|
29 |
+
"outputs": [],
|
30 |
+
"source": [
|
31 |
+
"# The path to the local git repo for Indic NLP Library\n",
|
32 |
+
"INDIC_NLP_LIB_HOME=\"/disk1/src/indic_nlp_library\"\n",
|
33 |
+
"\n",
|
34 |
+
"# The path to the local git repo for Indic NLP Resources\n",
|
35 |
+
"INDIC_NLP_RESOURCES=\"/disk1/src/indic_nlp_resources\"\n",
|
36 |
+
"\n",
|
37 |
+
"import sys\n",
|
38 |
+
"sys.path.append('{}/src'.format(INDIC_NLP_LIB_HOME))\n",
|
39 |
+
"\n",
|
40 |
+
"from indicnlp import common\n",
|
41 |
+
"common.set_resources_path(INDIC_NLP_RESOURCES)\n",
|
42 |
+
"\n",
|
43 |
+
"from indicnlp import loader\n",
|
44 |
+
"loader.load()"
|
45 |
+
]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"cell_type": "code",
|
49 |
+
"execution_count": null,
|
50 |
+
"metadata": {},
|
51 |
+
"outputs": [],
|
52 |
+
"source": [
|
53 |
+
"from bs4 import BeautifulSoup\n",
|
54 |
+
"import os\n",
|
55 |
+
"import string\n",
|
56 |
+
"import indicnlp\n",
|
57 |
+
"from indicnlp.tokenize import indic_tokenize\n",
|
58 |
+
"from indicnlp.normalize import indic_normalize\n",
|
59 |
+
"from indicnlp.transliterate import unicode_transliterate\n",
|
60 |
+
"from indicnlp.tokenize import sentence_tokenize\n",
|
61 |
+
"import re\n",
|
62 |
+
"import collections\n",
|
63 |
+
"import random\n",
|
64 |
+
"import mmh3"
|
65 |
+
]
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"cell_type": "markdown",
|
69 |
+
"metadata": {},
|
70 |
+
"source": [
|
71 |
+
"# Common Functions"
|
72 |
+
]
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"cell_type": "code",
|
76 |
+
"execution_count": null,
|
77 |
+
"metadata": {},
|
78 |
+
"outputs": [],
|
79 |
+
"source": [
|
80 |
+
"def preprocess_sent(text,lang,normalizer):\n",
|
81 |
+
" \"\"\"\n",
|
82 |
+
" Pre-process text (normalization and tokenization)\n",
|
83 |
+
" \n",
|
84 |
+
" text: text string to preprocess\n",
|
85 |
+
" lang: language code (2-letter ISO code)\n",
|
86 |
+
" normalizer: normalizer object for language\n",
|
87 |
+
" \n",
|
88 |
+
" returns the processed text string\n",
|
89 |
+
" \"\"\"\n",
|
90 |
+
" return ' '.join(indic_tokenize.trivial_tokenize(normalizer.normalize(text.replace('\\n',' ')),lang)) \n",
|
91 |
+
"\n",
|
92 |
+
"def sent_split(text,lang):\n",
|
93 |
+
" \"\"\"\n",
|
94 |
+
" Sentence splitter\n",
|
95 |
+
" \n",
|
96 |
+
" text: text to sentence split \n",
|
97 |
+
" lang: language\n",
|
98 |
+
" \n",
|
99 |
+
" returns list of sentences \n",
|
100 |
+
" \"\"\"\n",
|
101 |
+
" return sentence_tokenize.sentence_split(text,lang)\n",
|
102 |
+
"\n",
|
103 |
+
"def extract_all_content(indir,lang,\n",
|
104 |
+
" article_extract_fn,\n",
|
105 |
+
" preprocess_fn=preprocess_sent,\n",
|
106 |
+
" narticles=-1,\n",
|
107 |
+
" start_artid=0):\n",
|
108 |
+
" \"\"\"\n",
|
109 |
+
" This method reads all files from the input directory, extracts text content from each file,\n",
|
110 |
+
" and pre-processes the text. This method is a generator. \n",
|
111 |
+
" For each sentence, the method yields a tuple of the format: \n",
|
112 |
+
" \n",
|
113 |
+
" (artid, fname, paraid, sentid, processed_text)\n",
|
114 |
+
" \n",
|
115 |
+
" indir: path to input directoryo containing files to be parsed \n",
|
116 |
+
" \n",
|
117 |
+
" lang: language to the files in the input directory\n",
|
118 |
+
" \n",
|
119 |
+
" article_extract_fn: the function to extract text content from each file. \n",
|
120 |
+
" Signature of the function: get_article_contents(fname,lang,encoding) \n",
|
121 |
+
" `fname` is name of the file, `lang` is langcode, \n",
|
122 |
+
" `encoding` is text-encoding (default=utf-8). \n",
|
123 |
+
" The function yields a tuple (paraid, sentid, extracted_text) \n",
|
124 |
+
" for each sentence.\n",
|
125 |
+
" \n",
|
126 |
+
" preprocess_fn: pre-processing function to apply to the extracted text. \n",
|
127 |
+
" The function takes a string as input and returns processed string as output.\n",
|
128 |
+
" \n",
|
129 |
+
" narticles: extract and process the first `narticles` from input directory. \n",
|
130 |
+
" if narticles=-1 (default), all files are extracted\n",
|
131 |
+
" \n",
|
132 |
+
" start_artid: the start of the article id to assign to extracted articles (default=0)\n",
|
133 |
+
" \n",
|
134 |
+
" \"\"\"\n",
|
135 |
+
"\n",
|
136 |
+
" fnames = os.listdir(indir)\n",
|
137 |
+
" if narticles>0:\n",
|
138 |
+
" fnames=fnames[:narticles]\n",
|
139 |
+
" nsent=0\n",
|
140 |
+
"\n",
|
141 |
+
" normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
|
142 |
+
" normalizer=normalizer_factory.get_normalizer(lang)\n",
|
143 |
+
" \n",
|
144 |
+
" print('Number of articles: {}'.format(len(fnames)))\n",
|
145 |
+
" for artid, fname in enumerate(fnames,start_artid):\n",
|
146 |
+
"# print(fname)\n",
|
147 |
+
" if artid%100 == 0:\n",
|
148 |
+
" print('({}|{})'.format(artid,nsent),end=' ... ')\n",
|
149 |
+
" \n",
|
150 |
+
" try:\n",
|
151 |
+
" fpath=os.sep.join([indir,fname])\n",
|
152 |
+
" for paraid, sentid, sent in article_extract_fn(fpath,lang):\n",
|
153 |
+
" nsent+=1\n",
|
154 |
+
" yield( ( artid, fname, paraid, sentid, preprocess_fn(sent,lang,normalizer) ) )\n",
|
155 |
+
" except:\n",
|
156 |
+
" print('Cannot parse {}'.format(fname))\n",
|
157 |
+
" \n",
|
158 |
+
"def write_corpus(corpus_iterator,content_fname,article_mapping_fname,delimiter=' ||| ', encoding='utf-8'):\n",
|
159 |
+
" \"\"\"\n",
|
160 |
+
" Writes the extracted corpus to a file. The extracted data is organized in terms of articles, paragraphs \n",
|
161 |
+
" and sentences. The following is the format of the output file: \n",
|
162 |
+
" - one line per sentence\n",
|
163 |
+
" - format of line: article_id, para_id, sent_id, sentence\n",
|
164 |
+
" In addition to the content file mention, a metadata file which maps the article id to the filename is also written. \n",
|
165 |
+
" \n",
|
166 |
+
" corpus_iterator: iterator over the corpus, yielding tuple (artid, fname, paraid, sentid, processed_text). \n",
|
167 |
+
" The function `extract_all_content` yields a generator in this format. \n",
|
168 |
+
" content_fname: output content file to write the extracted data to in the format mentioned above\n",
|
169 |
+
" article_mapping_fname: output metadata file to write article id to filename mapping.\n",
|
170 |
+
" delimiter=' ||| ': delimiter for the content file. The default delimiter is the same \n",
|
171 |
+
" as used in the Moses phrase table\n",
|
172 |
+
" encoding: text encoding default - 'utf-8'\n",
|
173 |
+
" \n",
|
174 |
+
" \"\"\"\n",
|
175 |
+
" \n",
|
176 |
+
" artid_name_mapping={}\n",
|
177 |
+
" with open(content_fname,'w',encoding=encoding) as contentfile:\n",
|
178 |
+
" for artid, fname, paraid, sentid, text in corpus_iterator:\n",
|
179 |
+
" contentfile.write(delimiter.join([str(artid), str(paraid), str(sentid), text]) + '\\n')\n",
|
180 |
+
" artid_name_mapping[artid]=fname\n",
|
181 |
+
"\n",
|
182 |
+
" with open(article_mapping_fname,'w',encoding=encoding) as artmappingfile:\n",
|
183 |
+
" for artid, name in sorted(artid_name_mapping.items(),key=lambda x: x[0]):\n",
|
184 |
+
" artmappingfile.write('{} {} {}\\n'.format(artid,delimiter,name))\n",
|
185 |
+
"\n",
|
186 |
+
"def convert_txt_to_csv_format(infname, outfname, encoding='utf-8'):\n",
|
187 |
+
" \"\"\"\n",
|
188 |
+
" convert txt file to csv format. This method is used when the text file is directly available.\n",
|
189 |
+
" The input file has one sentence per line. Assumed to be preprocessed (tokenized, normalized)\n",
|
190 |
+
" \n",
|
191 |
+
" \"\"\"\n",
|
192 |
+
" with open(infname,'r',encoding=encoding) as infile, \\\n",
|
193 |
+
" open(outfname,'w',encoding=encoding) as outfile: \n",
|
194 |
+
" for i, line in enumerate(infile):\n",
|
195 |
+
" outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,line.strip()))\n",
|
196 |
+
" \n",
|
197 |
+
"def preprocess_convert_txt_to_csv_format(infname, outfname, lang, encoding='utf-8'):\n",
|
198 |
+
" \"\"\"\n",
|
199 |
+
" Convert raw text file to csv format\n",
|
200 |
+
" \"\"\"\n",
|
201 |
+
" \n",
|
202 |
+
" normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
|
203 |
+
" normalizer=normalizer_factory.get_normalizer(lang)\n",
|
204 |
+
" \n",
|
205 |
+
" with open(infname,'r',encoding=encoding) as infile, \\\n",
|
206 |
+
" open(outfname,'w',encoding=encoding) as outfile: \n",
|
207 |
+
" i=0\n",
|
208 |
+
" for line in infile:\n",
|
209 |
+
" sents = sent_split(line.strip(),lang)\n",
|
210 |
+
" for sent in sents:\n",
|
211 |
+
" outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,\n",
|
212 |
+
" preprocess_sent(sent.strip(), lang, normalizer)) )\n",
|
213 |
+
" i=i+1\n",
|
214 |
+
"\n",
|
215 |
+
"def print_txt(infnames, outfname, encoding='utf-8'):\n",
|
216 |
+
" \"\"\"\n",
|
217 |
+
" Extract only the text from the content csv file. The output file has one sentence per file.\n",
|
218 |
+
" \"\"\"\n",
|
219 |
+
" with open(outfname,'w',encoding=encoding) as outfile: \n",
|
220 |
+
" for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
|
221 |
+
" with open(infname,'r',encoding=encoding) as infile:\n",
|
222 |
+
" for i, line in enumerate(infile):\n",
|
223 |
+
" fields=line.strip().split('|||')\n",
|
224 |
+
" if len(fields) >=4:\n",
|
225 |
+
" outfile.write('{}\\n'.format(fields[3].strip()))\n",
|
226 |
+
" \n",
|
227 |
+
"# def dedup_and_print_txt(infnames, outfname, encoding='utf-8'):\n",
|
228 |
+
" \n",
|
229 |
+
"# total=0\n",
|
230 |
+
"# unique=0\n",
|
231 |
+
"# hash_codes=set()\n",
|
232 |
+
" \n",
|
233 |
+
"# with open(outfname,'w',encoding=encoding) as outfile: \n",
|
234 |
+
"# for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
|
235 |
+
"# with open(infname,'r',encoding=encoding) as infile:\n",
|
236 |
+
"# for i, line in enumerate(infile):\n",
|
237 |
+
"# fields=line.strip().split('|||')\n",
|
238 |
+
"# if len(fields) >=4:\n",
|
239 |
+
"# sent=fields[3].strip()\n",
|
240 |
+
"# total+=1\n",
|
241 |
+
"# hs=hash(sent)\n",
|
242 |
+
"# if hs not in hash_codes:\n",
|
243 |
+
"# outfile.write('{}\\n'.format(sent))\n",
|
244 |
+
"# hash_codes.add(hs)\n",
|
245 |
+
"# unique+=1\n",
|
246 |
+
" \n",
|
247 |
+
"# print('Total: {}'.format(total))\n",
|
248 |
+
"# print('Unique: {}'.format(unique))\n",
|
249 |
+
"\n",
|
250 |
+
"def dedup_shuffle_and_print_txt(infnames, outfname, max_buf_size=100000,encoding='utf-8'):\n",
|
251 |
+
" \"\"\"\n",
|
252 |
+
" The method creates a sentence level corpora from multiple content csv files.\n",
|
253 |
+
" All sentences are extracted, they are de-duplicated using murmurhash and shuffled\n",
|
254 |
+
" before writing the entire corpus to the output file. The output file has one sentence per line.\n",
|
255 |
+
"\n",
|
256 |
+
" \"\"\"\n",
|
257 |
+
" \n",
|
258 |
+
" total=0\n",
|
259 |
+
" unique=0\n",
|
260 |
+
" hash_codes=set()\n",
|
261 |
+
" sent_buffer=[]\n",
|
262 |
+
" \n",
|
263 |
+
" with open(outfname,'w',encoding=encoding) as outfile: \n",
|
264 |
+
" for infname in filter(lambda x: os.path.isfile(x),infnames):\n",
|
265 |
+
" print('Processing: {}'.format(infname))\n",
|
266 |
+
" with open(infname,'r',encoding=encoding) as infile:\n",
|
267 |
+
" for i, line in enumerate(infile):\n",
|
268 |
+
" fields=line.strip().split('|||')\n",
|
269 |
+
" if len(fields) >=4:\n",
|
270 |
+
" sent=fields[3].strip()\n",
|
271 |
+
" total+=1\n",
|
272 |
+
"# hs=hash(sent)\n",
|
273 |
+
" hs=mmh3.hash128(sent)\n",
|
274 |
+
" if hs not in hash_codes:\n",
|
275 |
+
"# outfile.write('{}\\n'.format(sent))\n",
|
276 |
+
" sent_buffer.append(sent)\n",
|
277 |
+
" hash_codes.add(hs)\n",
|
278 |
+
" unique+=1\n",
|
279 |
+
" if len(sent_buffer)>=max_buf_size:\n",
|
280 |
+
" random.shuffle(sent_buffer)\n",
|
281 |
+
" for sent in sent_buffer: \n",
|
282 |
+
" outfile.write('{}\\n'.format(sent))\n",
|
283 |
+
" sent_buffer.clear()\n",
|
284 |
+
" \n",
|
285 |
+
" if len(sent_buffer)>0:\n",
|
286 |
+
" random.shuffle(sent_buffer)\n",
|
287 |
+
" for sent in sent_buffer: \n",
|
288 |
+
" outfile.write('{}\\n'.format(sent))\n",
|
289 |
+
" sent_buffer.clear() \n",
|
290 |
+
" \n",
|
291 |
+
" print('Total: {}'.format(total))\n",
|
292 |
+
" print('Unique: {}'.format(unique))\n",
|
293 |
+
"\n",
|
294 |
+
"def extract_wikiextractor_file(infname, outfname, lang, \n",
|
295 |
+
" encoding='utf-8', delimiter=' ||| ', preprocess_fn=preprocess_sent):\n",
|
296 |
+
" \"\"\"\n",
|
297 |
+
" Extract text content into a content csv file from wikipedia article page. \n",
|
298 |
+
" The wikipedia article page is the output from `wikiextractor` [https://github.com/attardi/wikiextractor] \n",
|
299 |
+
" \n",
|
300 |
+
" \"\"\"\n",
|
301 |
+
" normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
|
302 |
+
" normalizer=normalizer_factory.get_normalizer(lang)\n",
|
303 |
+
" \n",
|
304 |
+
" with open(infname,'r',encoding=encoding) as infile, \\\n",
|
305 |
+
" open(outfname,'w',encoding=encoding) as outfile: \n",
|
306 |
+
" artid=-1\n",
|
307 |
+
" paraid=0\n",
|
308 |
+
" for line in infile:\n",
|
309 |
+
" if line.find('<doc')==0:\n",
|
310 |
+
" artid+=1\n",
|
311 |
+
" paraid=0\n",
|
312 |
+
" continue\n",
|
313 |
+
" if line.find('</doc')==0:\n",
|
314 |
+
" continue\n",
|
315 |
+
" if len(line.strip())>0:\n",
|
316 |
+
" for sentid, sent in enumerate(sent_split(line.strip(),lang)):\n",
|
317 |
+
" sent=sent.strip()\n",
|
318 |
+
" if sent!='':\n",
|
319 |
+
" sent = preprocess_fn(sent,lang,normalizer)\n",
|
320 |
+
" outfile.write(delimiter.join([str(artid), str(paraid), str(sentid), sent]) + '\\n')\n",
|
321 |
+
" paraid+=1\n",
|
322 |
+
"\n",
|
323 |
+
" \n",
|
324 |
+
"def extract_leipzig_corpus(infname,outfname,lang,encoding='utf-8'):\n",
|
325 |
+
" \"\"\"\n",
|
326 |
+
" Extractor for files form the Leipzig corpus\n",
|
327 |
+
" [http://wortschatz.uni-leipzig.de/en/download/]\n",
|
328 |
+
" \n",
|
329 |
+
" \"\"\"\n",
|
330 |
+
" normalizer_factory=indic_normalize.IndicNormalizerFactory()\n",
|
331 |
+
" normalizer=normalizer_factory.get_normalizer(lang) \n",
|
332 |
+
"\n",
|
333 |
+
" with open(infname,'r',encoding=encoding) as infile, \\\n",
|
334 |
+
" open(outfname,'w',encoding=encoding) as outfile: \n",
|
335 |
+
" for i, line in enumerate(infile):\n",
|
336 |
+
" outfile.write('0 ||| 0 ||| {} ||| {}\\n'.format(i,preprocess_sent(line,lang,normalizer))) \n",
|
337 |
+
" \n",
|
338 |
+
"def dataset_stats(fname):\n",
|
339 |
+
" \"\"\"\n",
|
340 |
+
" Extracts dataset statistics from the final extracted file. This input file contains\n",
|
341 |
+
" one sentence per line. The sentences are tokenized.\n",
|
342 |
+
" \"\"\"\n",
|
343 |
+
"\n",
|
344 |
+
" all_puncs=set(string.punctuation+'\\u0964\\u0965')\n",
|
345 |
+
" \n",
|
346 |
+
" sent_count=0\n",
|
347 |
+
" token_cnt=0\n",
|
348 |
+
" true_token_cnt=0\n",
|
349 |
+
" tokens=set()\n",
|
350 |
+
" \n",
|
351 |
+
" with open(fname,'r',encoding='utf-8') as infile:\n",
|
352 |
+
" for line in infile:\n",
|
353 |
+
" sent_count+=1\n",
|
354 |
+
" a=line.strip().split(' ')\n",
|
355 |
+
" token_cnt+=len(a)\n",
|
356 |
+
" b=list(filter(lambda x: x not in all_puncs,a))\n",
|
357 |
+
" true_token_cnt+=len(b)\n",
|
358 |
+
" tokens.update(b)\n",
|
359 |
+
" \n",
|
360 |
+
" print('== Stats ==')\n",
|
361 |
+
" print('Sent count: {}'.format(sent_count))\n",
|
362 |
+
" print('Token count: {}'.format(token_cnt))\n",
|
363 |
+
" print('True Token count: {}'.format(true_token_cnt))\n",
|
364 |
+
" print('Unique Token count: {}'.format(len(tokens)))\n"
|
365 |
+
]
|
366 |
+
},
|
367 |
+
{
|
368 |
+
"cell_type": "markdown",
|
369 |
+
"metadata": {},
|
370 |
+
"source": [
|
371 |
+
"# Marathi"
|
372 |
+
]
|
373 |
+
},
|
374 |
+
{
|
375 |
+
"cell_type": "markdown",
|
376 |
+
"metadata": {},
|
377 |
+
"source": [
|
378 |
+
"## Wikipedia"
|
379 |
+
]
|
380 |
+
},
|
381 |
+
{
|
382 |
+
"cell_type": "markdown",
|
383 |
+
"metadata": {},
|
384 |
+
"source": [
|
385 |
+
"### Wikipedia extraction commands using wikiextractor\n",
|
386 |
+
"\n",
|
387 |
+
"```\n",
|
388 |
+
"### This uses WikiExtractor (https://github.com/attardi/wikiextractor)\n",
|
389 |
+
"\n",
|
390 |
+
"x=/disk1/crawl_project/ta/wikipedia\n",
|
391 |
+
"mkdir $x\n",
|
392 |
+
"cd $x\n",
|
393 |
+
"wget https://dumps.wikimedia.org/tawiki/20190501/tawiki-20190501-pages-articles-multistream.xml.bz2\n",
|
394 |
+
"cd /disk1/src/wikiextractor\n",
|
395 |
+
"python3 WikiExtractor.py -cb 250k -o $x/extracted $x/tawiki-20190501-pages-articles-multistream.xml.bz2\n",
|
396 |
+
"cd -\n",
|
397 |
+
"find extracted -name '*bz2' -exec bunzip2 -c {} \\; > text.xml\n",
|
398 |
+
"rm text.xml\n",
|
399 |
+
"rm tawiki-20190501-pages-articles-multistream.xml.bz2\n",
|
400 |
+
"rm -rf extracted\n",
|
401 |
+
"```"
|
402 |
+
]
|
403 |
+
},
|
404 |
+
{
|
405 |
+
"cell_type": "markdown",
|
406 |
+
"metadata": {},
|
407 |
+
"source": [
|
408 |
+
"mrwiki-20190401-pages-articles-multistream.xml.bz2\n",
|
409 |
+
"\n",
|
410 |
+
"INFO: Finished 1-process extraction of 53715 articles in 123.6s (434.7 art/s)\n",
|
411 |
+
"\n",
|
412 |
+
"INFO: total of page: 102025, total of articl page: 53715; total of used articl page: 53715"
|
413 |
+
]
|
414 |
+
},
|
415 |
+
{
|
416 |
+
"cell_type": "markdown",
|
417 |
+
"metadata": {},
|
418 |
+
"source": [
|
419 |
+
"### Post-processing output generated by wikiextractor"
|
420 |
+
]
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"cell_type": "code",
|
424 |
+
"execution_count": null,
|
425 |
+
"metadata": {},
|
426 |
+
"outputs": [],
|
427 |
+
"source": [
|
428 |
+
"## tex.xml is extracted as shown in commanfs above\n",
|
429 |
+
"extract_wikiextractor_file('text.xml',\n",
|
430 |
+
" 'content_fname1.csv',\n",
|
431 |
+
" 'mr')"
|
432 |
+
]
|
433 |
+
},
|
434 |
+
{
|
435 |
+
"cell_type": "markdown",
|
436 |
+
"metadata": {},
|
437 |
+
"source": [
|
438 |
+
"## Loksatta"
|
439 |
+
]
|
440 |
+
},
|
441 |
+
{
|
442 |
+
"cell_type": "markdown",
|
443 |
+
"metadata": {},
|
444 |
+
"source": [
|
445 |
+
"**Extractor function for Marathi Loksatta page**"
|
446 |
+
]
|
447 |
+
},
|
448 |
+
{
|
449 |
+
"cell_type": "code",
|
450 |
+
"execution_count": null,
|
451 |
+
"metadata": {},
|
452 |
+
"outputs": [],
|
453 |
+
"source": [
|
454 |
+
"def get_article_contents_mr_loksatta(fname,lang,encoding='utf-8'):\n",
|
455 |
+
" with open(fname,'r',encoding=encoding) as infile: \n",
|
456 |
+
" soup = BeautifulSoup(infile)\n",
|
457 |
+
" for elem in soup.find_all('div'):\n",
|
458 |
+
" if 'itemprop' in elem.attrs and 'articleBody' in elem['itemprop']:\n",
|
459 |
+
" filtered_paras=list(filter(lambda x: x.name=='p' and len(x.attrs)==0,elem.children))\n",
|
460 |
+
" paraid=0\n",
|
461 |
+
" for blockid, block in enumerate(filtered_paras):\n",
|
462 |
+
"# print('Para: {}'.format(blockid))\n",
|
463 |
+
"# print(list(block.strings))\n",
|
464 |
+
" text=' '.join(block.strings)\n",
|
465 |
+
" if blockid==0 and text.find(':')>=0 and text.find(':')<20:\n",
|
466 |
+
" text=':'.join(text.split(':')[1:])\n",
|
467 |
+
" for para_text in text.split('\\n'): \n",
|
468 |
+
" for sentid, sent in enumerate(sent_split(para_text,lang)):\n",
|
469 |
+
" sent=sent.strip()\n",
|
470 |
+
" if sent!='':\n",
|
471 |
+
" # print('{}: {}'.format(sentid, sent))\n",
|
472 |
+
" yield((paraid,sentid,sent))\n",
|
473 |
+
" # yield((paraid,sentid,preprocess_sent(sent,'ml',normalizer)))\n",
|
474 |
+
" # print() \n",
|
475 |
+
" paraid+=1"
|
476 |
+
]
|
477 |
+
},
|
478 |
+
{
|
479 |
+
"cell_type": "markdown",
|
480 |
+
"metadata": {},
|
481 |
+
"source": [
|
482 |
+
"**Extracting data from crawled HTML files**"
|
483 |
+
]
|
484 |
+
},
|
485 |
+
{
|
486 |
+
"cell_type": "code",
|
487 |
+
"execution_count": null,
|
488 |
+
"metadata": {},
|
489 |
+
"outputs": [],
|
490 |
+
"source": [
|
491 |
+
"lang='mr'\n",
|
492 |
+
"posts_dir='directory_containing_crawled_html_pages'\n",
|
493 |
+
"content_fname='content_fname2.csv'\n",
|
494 |
+
"article_mapping_fname='article_mapping_fname'\n",
|
495 |
+
"get_article_contents=get_article_contents_mr_loksatta\n",
|
496 |
+
"narticles=-1"
|
497 |
+
]
|
498 |
+
},
|
499 |
+
{
|
500 |
+
"cell_type": "code",
|
501 |
+
"execution_count": null,
|
502 |
+
"metadata": {},
|
503 |
+
"outputs": [],
|
504 |
+
"source": [
|
505 |
+
"write_corpus(\n",
|
506 |
+
" extract_all_content(posts_dir, lang, article_extract_fn=get_article_contents,narticles=narticles),\n",
|
507 |
+
" content_fname,\n",
|
508 |
+
" article_mapping_fname\n",
|
509 |
+
" )"
|
510 |
+
]
|
511 |
+
},
|
512 |
+
{
|
513 |
+
"cell_type": "markdown",
|
514 |
+
"metadata": {},
|
515 |
+
"source": [
|
516 |
+
"## Aggregating all crawled data"
|
517 |
+
]
|
518 |
+
},
|
519 |
+
{
|
520 |
+
"cell_type": "code",
|
521 |
+
"execution_count": null,
|
522 |
+
"metadata": {},
|
523 |
+
"outputs": [],
|
524 |
+
"source": [
|
525 |
+
"### aggregating, de-duplicating and shuffling all the data \n",
|
526 |
+
"dedup_shuffle_and_print_txt([ 'content_fname1.csv', 'content_fname2.csv' ], 'output_fname.txt' )\n",
|
527 |
+
"### extract dataset statistics\n",
|
528 |
+
"dataset_stats('output_fname.txt')"
|
529 |
+
]
|
530 |
+
}
|
531 |
+
],
|
532 |
+
"metadata": {
|
533 |
+
"kernelspec": {
|
534 |
+
"display_name": "Python 3",
|
535 |
+
"language": "python",
|
536 |
+
"name": "python3"
|
537 |
+
},
|
538 |
+
"language_info": {
|
539 |
+
"codemirror_mode": {
|
540 |
+
"name": "ipython",
|
541 |
+
"version": 3
|
542 |
+
},
|
543 |
+
"file_extension": ".py",
|
544 |
+
"mimetype": "text/x-python",
|
545 |
+
"name": "python",
|
546 |
+
"nbconvert_exporter": "python",
|
547 |
+
"pygments_lexer": "ipython3",
|
548 |
+
"version": "3.6.7"
|
549 |
+
},
|
550 |
+
"toc": {
|
551 |
+
"base_numbering": 1,
|
552 |
+
"nav_menu": {
|
553 |
+
"height": "703px",
|
554 |
+
"width": "326px"
|
555 |
+
},
|
556 |
+
"number_sections": true,
|
557 |
+
"sideBar": true,
|
558 |
+
"skip_h1_title": false,
|
559 |
+
"title_cell": "Table of Contents",
|
560 |
+
"title_sidebar": "Contents",
|
561 |
+
"toc_cell": false,
|
562 |
+
"toc_position": {},
|
563 |
+
"toc_section_display": true,
|
564 |
+
"toc_window_display": false
|
565 |
+
}
|
566 |
+
},
|
567 |
+
"nbformat": 4,
|
568 |
+
"nbformat_minor": 2
|
569 |
+
}
|
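For context, a hedged sketch of how the notebook's `sent_split` and `preprocess_sent` helpers combine on raw text. It assumes the notebook's initialization and "Common Functions" cells have been executed (so the helpers exist and the Indic NLP resources are loaded); the Marathi sample string is illustrative only.

```python
# Minimal sketch, to be run inside the notebook after its setup cells: split a raw
# paragraph into sentences, then normalize and tokenize each sentence.
from indicnlp.normalize import indic_normalize

lang = 'mr'
normalizer = indic_normalize.IndicNormalizerFactory().get_normalizer(lang)

paragraph = 'पहिले वाक्य येथे आहे. दुसरे वाक्य येथे आहे.'  # illustrative text
for sent in sent_split(paragraph, lang):            # defined in the notebook
    print(preprocess_sent(sent, lang, normalizer))  # defined in the notebook
```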
indic_nlp_library/docs/Makefile
ADDED
@@ -0,0 +1,153 @@
1 |
+
# Makefile for Sphinx documentation
|
2 |
+
#
|
3 |
+
|
4 |
+
# You can set these variables from the command line.
|
5 |
+
SPHINXOPTS =
|
6 |
+
SPHINXBUILD = sphinx-build
|
7 |
+
PAPER =
|
8 |
+
BUILDDIR = _build
|
9 |
+
|
10 |
+
# Internal variables.
|
11 |
+
PAPEROPT_a4 = -D latex_paper_size=a4
|
12 |
+
PAPEROPT_letter = -D latex_paper_size=letter
|
13 |
+
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
14 |
+
# the i18n builder cannot share the environment and doctrees with the others
|
15 |
+
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
|
16 |
+
|
17 |
+
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
|
18 |
+
|
19 |
+
help:
|
20 |
+
@echo "Please use \`make <target>' where <target> is one of"
|
21 |
+
@echo " html to make standalone HTML files"
|
22 |
+
@echo " dirhtml to make HTML files named index.html in directories"
|
23 |
+
@echo " singlehtml to make a single large HTML file"
|
24 |
+
@echo " pickle to make pickle files"
|
25 |
+
@echo " json to make JSON files"
|
26 |
+
@echo " htmlhelp to make HTML files and a HTML help project"
|
27 |
+
@echo " qthelp to make HTML files and a qthelp project"
|
28 |
+
@echo " devhelp to make HTML files and a Devhelp project"
|
29 |
+
@echo " epub to make an epub"
|
30 |
+
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
|
31 |
+
@echo " latexpdf to make LaTeX files and run them through pdflatex"
|
32 |
+
@echo " text to make text files"
|
33 |
+
@echo " man to make manual pages"
|
34 |
+
@echo " texinfo to make Texinfo files"
|
35 |
+
@echo " info to make Texinfo files and run them through makeinfo"
|
36 |
+
@echo " gettext to make PO message catalogs"
|
37 |
+
@echo " changes to make an overview of all changed/added/deprecated items"
|
38 |
+
@echo " linkcheck to check all external links for integrity"
|
39 |
+
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
|
40 |
+
|
41 |
+
clean:
|
42 |
+
-rm -rf $(BUILDDIR)/*
|
43 |
+
|
44 |
+
html:
|
45 |
+
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
46 |
+
@echo
|
47 |
+
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
|
48 |
+
|
49 |
+
dirhtml:
|
50 |
+
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
|
51 |
+
@echo
|
52 |
+
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
|
53 |
+
|
54 |
+
singlehtml:
|
55 |
+
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
|
56 |
+
@echo
|
57 |
+
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
|
58 |
+
|
59 |
+
pickle:
|
60 |
+
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
|
61 |
+
@echo
|
62 |
+
@echo "Build finished; now you can process the pickle files."
|
63 |
+
|
64 |
+
json:
|
65 |
+
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
|
66 |
+
@echo
|
67 |
+
@echo "Build finished; now you can process the JSON files."
|
68 |
+
|
69 |
+
htmlhelp:
|
70 |
+
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
|
71 |
+
@echo
|
72 |
+
@echo "Build finished; now you can run HTML Help Workshop with the" \
|
73 |
+
".hhp project file in $(BUILDDIR)/htmlhelp."
|
74 |
+
|
75 |
+
qthelp:
|
76 |
+
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
|
77 |
+
@echo
|
78 |
+
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
|
79 |
+
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
|
80 |
+
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/IndicNLPLibrary.qhcp"
|
81 |
+
@echo "To view the help file:"
|
82 |
+
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/IndicNLPLibrary.qhc"
|
83 |
+
|
84 |
+
devhelp:
|
85 |
+
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
|
86 |
+
@echo
|
87 |
+
@echo "Build finished."
|
88 |
+
@echo "To view the help file:"
|
89 |
+
@echo "# mkdir -p $$HOME/.local/share/devhelp/IndicNLPLibrary"
|
90 |
+
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/IndicNLPLibrary"
|
91 |
+
@echo "# devhelp"
|
92 |
+
|
93 |
+
epub:
|
94 |
+
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
|
95 |
+
@echo
|
96 |
+
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
|
97 |
+
|
98 |
+
latex:
|
99 |
+
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
100 |
+
@echo
|
101 |
+
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
|
102 |
+
@echo "Run \`make' in that directory to run these through (pdf)latex" \
|
103 |
+
"(use \`make latexpdf' here to do that automatically)."
|
104 |
+
|
105 |
+
latexpdf:
|
106 |
+
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
107 |
+
@echo "Running LaTeX files through pdflatex..."
|
108 |
+
$(MAKE) -C $(BUILDDIR)/latex all-pdf
|
109 |
+
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
110 |
+
|
111 |
+
text:
|
112 |
+
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
|
113 |
+
@echo
|
114 |
+
@echo "Build finished. The text files are in $(BUILDDIR)/text."
|
115 |
+
|
116 |
+
man:
|
117 |
+
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
|
118 |
+
@echo
|
119 |
+
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
|
120 |
+
|
121 |
+
texinfo:
|
122 |
+
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
123 |
+
@echo
|
124 |
+
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
|
125 |
+
@echo "Run \`make' in that directory to run these through makeinfo" \
|
126 |
+
"(use \`make info' here to do that automatically)."
|
127 |
+
|
128 |
+
info:
|
129 |
+
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
130 |
+
@echo "Running Texinfo files through makeinfo..."
|
131 |
+
make -C $(BUILDDIR)/texinfo info
|
132 |
+
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
|
133 |
+
|
134 |
+
gettext:
|
135 |
+
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
|
136 |
+
@echo
|
137 |
+
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
|
138 |
+
|
139 |
+
changes:
|
140 |
+
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
|
141 |
+
@echo
|
142 |
+
@echo "The overview file is in $(BUILDDIR)/changes."
|
143 |
+
|
144 |
+
linkcheck:
|
145 |
+
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
|
146 |
+
@echo
|
147 |
+
@echo "Link check complete; look for any errors in the above output " \
|
148 |
+
"or in $(BUILDDIR)/linkcheck/output.txt."
|
149 |
+
|
150 |
+
doctest:
|
151 |
+
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
|
152 |
+
@echo "Testing of doctests in the sources finished, look at the " \
|
153 |
+
"results in $(BUILDDIR)/doctest/output.txt."
|
indic_nlp_library/docs/cmd.rst
ADDED
@@ -0,0 +1,8 @@
1 |
+
Commandline
|
2 |
+
===========
|
3 |
+
|
4 |
+
.. argparse::
|
5 |
+
:module: indicnlp.cli.cliparser
|
6 |
+
:func: get_parser
|
7 |
+
:prog: cliparser.py
|
8 |
+
|
indic_nlp_library/docs/code.rst
ADDED
@@ -0,0 +1,5 @@
1 |
+
Auto Generated Documentation
|
2 |
+
============================
|
3 |
+
|
4 |
+
.. automodule:: indicnlp.langinfo indicnlp.common
|
5 |
+
:members:
|
indic_nlp_library/docs/conf.py
ADDED
@@ -0,0 +1,242 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
#
|
3 |
+
# Indic NLP Library documentation build configuration file, created by
|
4 |
+
# sphinx-quickstart on Tue Nov 3 01:50:37 2015.
|
5 |
+
#
|
6 |
+
# This file is execfile()d with the current directory set to its containing dir.
|
7 |
+
#
|
8 |
+
# Note that not all possible configuration values are present in this
|
9 |
+
# autogenerated file.
|
10 |
+
#
|
11 |
+
# All configuration values have a default; values that are commented out
|
12 |
+
# serve to show the default.
|
13 |
+
|
14 |
+
import sys, os
|
15 |
+
|
16 |
+
# If extensions (or modules to document with autodoc) are in another directory,
|
17 |
+
# add these directories to sys.path here. If the directory is relative to the
|
18 |
+
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
19 |
+
sys.path.insert(0, os.path.abspath('..'))
|
20 |
+
|
21 |
+
# -- General configuration -----------------------------------------------------
|
22 |
+
|
23 |
+
# If your documentation needs a minimal Sphinx version, state it here.
|
24 |
+
#needs_sphinx = '1.0'
|
25 |
+
|
26 |
+
# Add any Sphinx extension module names here, as strings. They can be extensions
|
27 |
+
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
|
28 |
+
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', 'sphinxarg.ext']
|
29 |
+
|
30 |
+
# Add any paths that contain templates here, relative to this directory.
|
31 |
+
templates_path = ['_templates']
|
32 |
+
|
33 |
+
# The suffix of source filenames.
|
34 |
+
source_suffix = '.rst'
|
35 |
+
|
36 |
+
# The encoding of source files.
|
37 |
+
#source_encoding = 'utf-8-sig'
|
38 |
+
|
39 |
+
# The master toctree document.
|
40 |
+
master_doc = 'index'
|
41 |
+
|
42 |
+
# General information about the project.
|
43 |
+
project = 'Indic NLP Library'
|
44 |
+
copyright = '2015, Anoop Kunchukuttan'
|
45 |
+
|
46 |
+
# The version info for the project you're documenting, acts as replacement for
|
47 |
+
# |version| and |release|, also used in various other places throughout the
|
48 |
+
# built documents.
|
49 |
+
#
|
50 |
+
# The short X.Y version.
|
51 |
+
version = '0.2'
|
52 |
+
# The full version, including alpha/beta/rc tags.
|
53 |
+
release = '0.2'
|
54 |
+
|
55 |
+
# The language for content autogenerated by Sphinx. Refer to documentation
|
56 |
+
# for a list of supported languages.
|
57 |
+
#language = None
|
58 |
+
|
59 |
+
# There are two options for replacing |today|: either, you set today to some
|
60 |
+
# non-false value, then it is used:
|
61 |
+
#today = ''
|
62 |
+
# Else, today_fmt is used as the format for a strftime call.
|
63 |
+
#today_fmt = '%B %d, %Y'
|
64 |
+
|
65 |
+
# List of patterns, relative to source directory, that match files and
|
66 |
+
# directories to ignore when looking for source files.
|
67 |
+
exclude_patterns = ['_build']
|
68 |
+
|
69 |
+
# The reST default role (used for this markup: `text`) to use for all documents.
|
70 |
+
#default_role = None
|
71 |
+
|
72 |
+
# If true, '()' will be appended to :func: etc. cross-reference text.
|
73 |
+
#add_function_parentheses = True
|
74 |
+
|
75 |
+
# If true, the current module name will be prepended to all description
|
76 |
+
# unit titles (such as .. function::).
|
77 |
+
#add_module_names = True
|
78 |
+
|
79 |
+
# If true, sectionauthor and moduleauthor directives will be shown in the
|
80 |
+
# output. They are ignored by default.
|
81 |
+
#show_authors = False
|
82 |
+
|
83 |
+
# The name of the Pygments (syntax highlighting) style to use.
|
84 |
+
pygments_style = 'sphinx'
|
85 |
+
|
86 |
+
# A list of ignored prefixes for module index sorting.
|
87 |
+
#modindex_common_prefix = []
|
88 |
+
|
89 |
+
|
90 |
+
# -- Options for HTML output ---------------------------------------------------
|
91 |
+
|
92 |
+
# The theme to use for HTML and HTML Help pages. See the documentation for
|
93 |
+
# a list of builtin themes.
|
94 |
+
html_theme = 'sphinx_rtd_theme'
|
95 |
+
|
96 |
+
# Theme options are theme-specific and customize the look and feel of a theme
|
97 |
+
# further. For a list of options available for each theme, see the
|
98 |
+
# documentation.
|
99 |
+
#html_theme_options = {}
|
100 |
+
|
101 |
+
# Add any paths that contain custom themes here, relative to this directory.
|
102 |
+
#html_theme_path = []
|
103 |
+
|
104 |
+
# The name for this set of Sphinx documents. If None, it defaults to
|
105 |
+
# "<project> v<release> documentation".
|
106 |
+
#html_title = None
|
107 |
+
|
108 |
+
# A shorter title for the navigation bar. Default is the same as html_title.
|
109 |
+
#html_short_title = None
|
110 |
+
|
111 |
+
# The name of an image file (relative to this directory) to place at the top
|
112 |
+
# of the sidebar.
|
113 |
+
#html_logo = None
|
114 |
+
|
115 |
+
# The name of an image file (within the static path) to use as favicon of the
|
116 |
+
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
117 |
+
# pixels large.
|
118 |
+
#html_favicon = None
|
119 |
+
|
120 |
+
# Add any paths that contain custom static files (such as style sheets) here,
|
121 |
+
# relative to this directory. They are copied after the builtin static files,
|
122 |
+
# so a file named "default.css" will overwrite the builtin "default.css".
|
123 |
+
html_static_path = ['_static']
|
124 |
+
|
125 |
+
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
126 |
+
# using the given strftime format.
|
127 |
+
#html_last_updated_fmt = '%b %d, %Y'
|
128 |
+
|
129 |
+
# If true, SmartyPants will be used to convert quotes and dashes to
|
130 |
+
# typographically correct entities.
|
131 |
+
#html_use_smartypants = True
|
132 |
+
|
133 |
+
# Custom sidebar templates, maps document names to template names.
|
134 |
+
#html_sidebars = {}
|
135 |
+
|
136 |
+
# Additional templates that should be rendered to pages, maps page names to
|
137 |
+
# template names.
|
138 |
+
#html_additional_pages = {}
|
139 |
+
|
140 |
+
# If false, no module index is generated.
|
141 |
+
#html_domain_indices = True
|
142 |
+
|
143 |
+
# If false, no index is generated.
|
144 |
+
#html_use_index = True
|
145 |
+
|
146 |
+
# If true, the index is split into individual pages for each letter.
|
147 |
+
#html_split_index = False
|
148 |
+
|
149 |
+
# If true, links to the reST sources are added to the pages.
|
150 |
+
#html_show_sourcelink = True
|
151 |
+
|
152 |
+
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
|
153 |
+
#html_show_sphinx = True
|
154 |
+
|
155 |
+
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
|
156 |
+
#html_show_copyright = True
|
157 |
+
|
158 |
+
# If true, an OpenSearch description file will be output, and all pages will
|
159 |
+
# contain a <link> tag referring to it. The value of this option must be the
|
160 |
+
# base URL from which the finished HTML is served.
|
161 |
+
#html_use_opensearch = ''
|
162 |
+
|
163 |
+
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
164 |
+
#html_file_suffix = None
|
165 |
+
|
166 |
+
# Output file base name for HTML help builder.
|
167 |
+
htmlhelp_basename = 'IndicNLPLibrarydoc'
|
168 |
+
|
169 |
+
|
170 |
+
# -- Options for LaTeX output --------------------------------------------------
|
171 |
+
|
172 |
+
latex_elements = {
|
173 |
+
# The paper size ('letterpaper' or 'a4paper').
|
174 |
+
#'papersize': 'letterpaper',
|
175 |
+
|
176 |
+
# The font size ('10pt', '11pt' or '12pt').
|
177 |
+
#'pointsize': '10pt',
|
178 |
+
|
179 |
+
# Additional stuff for the LaTeX preamble.
|
180 |
+
#'preamble': '',
|
181 |
+
}
|
182 |
+
|
183 |
+
# Grouping the document tree into LaTeX files. List of tuples
|
184 |
+
# (source start file, target name, title, author, documentclass [howto/manual]).
|
185 |
+
latex_documents = [
|
186 |
+
('index', 'IndicNLPLibrary.tex', 'Indic NLP Library Documentation',
|
187 |
+
'Anoop Kunchukuttan', 'manual'),
|
188 |
+
]
|
189 |
+
|
190 |
+
# The name of an image file (relative to this directory) to place at the top of
|
191 |
+
# the title page.
|
192 |
+
#latex_logo = None
|
193 |
+
|
194 |
+
# For "manual" documents, if this is true, then toplevel headings are parts,
|
195 |
+
# not chapters.
|
196 |
+
#latex_use_parts = False
|
197 |
+
|
198 |
+
# If true, show page references after internal links.
|
199 |
+
#latex_show_pagerefs = False
|
200 |
+
|
201 |
+
# If true, show URL addresses after external links.
|
202 |
+
#latex_show_urls = False
|
203 |
+
|
204 |
+
# Documents to append as an appendix to all manuals.
|
205 |
+
#latex_appendices = []
|
206 |
+
|
207 |
+
# If false, no module index is generated.
|
208 |
+
#latex_domain_indices = True
|
209 |
+
|
210 |
+
|
211 |
+
# -- Options for manual page output --------------------------------------------
|
212 |
+
|
213 |
+
# One entry per manual page. List of tuples
|
214 |
+
# (source start file, name, description, authors, manual section).
|
215 |
+
man_pages = [
|
216 |
+
('index', 'indicnlplibrary', 'Indic NLP Library Documentation',
|
217 |
+
['Anoop Kunchukuttan'], 1)
|
218 |
+
]
|
219 |
+
|
220 |
+
# If true, show URL addresses after external links.
|
221 |
+
#man_show_urls = False
|
222 |
+
|
223 |
+
|
224 |
+
# -- Options for Texinfo output ------------------------------------------------
|
225 |
+
|
226 |
+
# Grouping the document tree into Texinfo files. List of tuples
|
227 |
+
# (source start file, target name, title, author,
|
228 |
+
# dir menu entry, description, category)
|
229 |
+
texinfo_documents = [
|
230 |
+
('index', 'IndicNLPLibrary', 'Indic NLP Library Documentation',
|
231 |
+
'Anoop Kunchukuttan', 'IndicNLPLibrary', 'NLP library for Indian languages',
|
232 |
+
'NLP'),
|
233 |
+
]
|
234 |
+
|
235 |
+
# Documents to append as an appendix to all manuals.
|
236 |
+
#texinfo_appendices = []
|
237 |
+
|
238 |
+
# If false, no module index is generated.
|
239 |
+
#texinfo_domain_indices = True
|
240 |
+
|
241 |
+
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
242 |
+
#texinfo_show_urls = 'footnote'
|
indic_nlp_library/docs/index.rst
ADDED
@@ -0,0 +1,22 @@
1 |
+
.. Indic NLP Library documentation master file, created by
|
2 |
+
sphinx-quickstart on Tue Nov 3 01:50:37 2015.
|
3 |
+
You can adapt this file completely to your liking, but it should at least
|
4 |
+
contain the root `toctree` directive.
|
5 |
+
|
6 |
+
:github_url: https://github.com/anoopkunchukuttan/indic_nlp_library
|
7 |
+
|
8 |
+
.. toctree::
|
9 |
+
:maxdepth: 2
|
10 |
+
:caption: Packages
|
11 |
+
|
12 |
+
indicnlp
|
13 |
+
|
14 |
+
.. toctree::
|
15 |
+
:maxdepth: 2
|
16 |
+
:caption: Commandline
|
17 |
+
|
18 |
+
cmd
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
|
indic_nlp_library/docs/indicnlp.MD
ADDED
@@ -0,0 +1,122 @@
1 |
+
# Indic NLP Library
|
2 |
+
## A unified approach to NLP for Indian languages
|
3 |
+
|
4 |
+
### Anoop Kunchukuttan (`[email protected]`)
|
5 |
+
|
6 |
+
The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. Indian languages share a lot of similarity in terms of script, phonology, language syntax, etc. and this library is an attempt to provide a general solution to very commonly required toolsets for Indian language text.
|
7 |
+
|
8 |
+
The library provides the following functionalities:
|
9 |
+
|
10 |
+
- Text Normalization
|
11 |
+
- Script Information
|
12 |
+
- Word Tokenization and Detokenization
|
13 |
+
- Sentence Splitting
|
14 |
+
- Word Segmentation
|
15 |
+
- Syllabification
|
16 |
+
- Script Conversion
|
17 |
+
- Romanization
|
18 |
+
- Indicization
|
19 |
+
- Transliteration
|
20 |
+
- Translation
|
21 |
+
|
22 |
+
The data resources required by the Indic NLP Library are hosted in a different repository. These resources are required for some modules. You can download from the [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) project.
|
23 |
+
|
24 |
+
**If you are interested in Indian language NLP resources, you should check the [Indic NLP Catalog](https://github.com/anoopkunchukuttan/indic_nlp_library) for pointers.**
|
25 |
+
|
26 |
+
## Pre-requisites
|
27 |
+
|
28 |
+
- Python 3.x
|
29 |
+
- (For Python 2.x version check the tag `PYTHON_2.7_FINAL_JAN_2019`. Not actively supporting Python 2.x anymore, but will try to maintain as much compatibility as possible)
|
30 |
+
- [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources)
|
31 |
+
- Other dependencies are listed in setup.py
|
32 |
+
|
33 |
+
|
34 |
+
## Configuration
|
35 |
+
|
36 |
+
- Installation from pip:
|
37 |
+
|
38 |
+
`pip install indic-nlp-library`
|
39 |
+
|
40 |
+
- If you want to use the project from the github repo, add the project to the Python Path:
|
41 |
+
|
42 |
+
- Clone this repository
|
43 |
+
- Install dependencies: `pip install -r requirements.txt`
|
44 |
+
- Run: `export PYTHONPATH=$PYTHONPATH:<project base directory>`
|
45 |
+
|
46 |
+
- In either case, export the path to the _Indic NLP Resources_ directory
|
47 |
+
|
48 |
+
Run: `export INDIC_RESOURCES_PATH=<path to Indic NLP resources>`
|
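Equivalently, the resources path can be set from Python before loading the library; a minimal sketch (the path below is a placeholder for a local clone of Indic NLP Resources):

```python
# Minimal sketch: configure the resources path programmatically instead of exporting
# INDIC_RESOURCES_PATH. Replace the placeholder path with your local checkout.
from indicnlp import common, loader

common.set_resources_path('/path/to/indic_nlp_resources')
loader.load()
```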
49 |
+
|
50 |
+
## Usage
|
51 |
+
|
52 |
+
You can use the Python API to access all the features of the library. Many of the most common operations are also accessible via a unified commandline API.
|
53 |
+
|
54 |
+
### Getting Started
|
55 |
+
|
56 |
+
Check [this IPython Notebook](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples.ipynb) for examples to use the Python API.
|
57 |
+
- You can find the Python 2.x Notebook [here](http://nbviewer.ipython.org/url/anoopkunchukuttan.github.io/indic_nlp_library/doc/indic_nlp_examples_2_7.ipynb)
|
58 |
+
|
59 |
+
### Documentation
|
60 |
+
|
61 |
+
You can find detailed documentation [HERE](https://indic-nlp-library.readthedocs.io/en/latest)
|
62 |
+
|
63 |
+
This documents the Python API as well as the commandline reference.
|
64 |
+
|
65 |
+
## Citing
|
66 |
+
|
67 |
+
If you use this library, please include the following citation:
|
68 |
+
|
69 |
+
```
|
70 |
+
@unpublished{kunchukuttan2020indicnlp,
|
71 |
+
author = "Anoop Kunchukuttan",
|
72 |
+
title = "The IndicNLP Library",
|
73 |
+
year = "2020",
|
74 |
+
}
|
75 |
+
```
|
76 |
+
You can find the document [HERE](docs/indicnlp.pdf)
|
77 |
+
|
78 |
+
## Website
|
79 |
+
|
80 |
+
`http://anoopkunchukuttan.github.io/indic_nlp_library`
|
81 |
+
|
82 |
+
## Author
|
83 |
+
Anoop Kunchukuttan ([[email protected]]([email protected]))
|
84 |
+
|
85 |
+
## Version: 0.7
|
86 |
+
|
87 |
+
## Revision Log
|
88 |
+
|
89 |
+
0.7 : 02 Apr 2020:
|
90 |
+
|
91 |
+
- Unified commandline
|
92 |
+
- Improved documentation
|
93 |
+
- Added setup.py
|
94 |
+
|
95 |
+
0.6 : 16 Dec 2019:
|
96 |
+
|
97 |
+
- New romanizer and indicizer
|
98 |
+
- Script Unifiers
|
99 |
+
- Improved script normalizers
|
100 |
+
- Added contrib directory for sample uses
|
101 |
+
- changed to MIT license
|
102 |
+
|
103 |
+
0.5 : 03 Jun 2019:
|
104 |
+
|
105 |
+
- Improved word tokenizer to handle dates and numbers.
|
106 |
+
- Added sentence splitter that can handle common prefixes/honorifics and uses some heuristics.
|
107 |
+
- Added detokenizer
|
108 |
+
- Added acronym transliterator that can convert English acronyms to Brahmi-derived scripts
|
109 |
+
|
110 |
+
0.4 : 28 Jan 2019: Ported to Python 3, and lots of feature additions since last release; primarily around script information, script similarity and syllabification.
|
111 |
+
|
112 |
+
0.3 : 21 Oct 2014: Supports morph-analysis between Indian languages
|
113 |
+
|
114 |
+
0.2 : 13 Jun 2014: Supports transliteration between Indian languages and tokenization of Indian languages
|
115 |
+
|
116 |
+
0.1 : 12 Mar 2014: Initial version. Supports text normalization.
|
117 |
+
|
118 |
+
## LICENSE
|
119 |
+
|
120 |
+
Indic NLP Library is released under the MIT license
|
121 |
+
|
122 |
+
|
indic_nlp_library/docs/indicnlp.cli.rst
ADDED
@@ -0,0 +1,11 @@
1 |
+
cli Package
|
2 |
+
=============
|
3 |
+
|
4 |
+
:mod:`cliparser` Module
|
5 |
+
--------------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.cli.cliparser
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
indic_nlp_library/docs/indicnlp.morph.rst
ADDED
@@ -0,0 +1,11 @@
1 |
+
morph Package
|
2 |
+
=============
|
3 |
+
|
4 |
+
:mod:`unsupervised_morph` Module
|
5 |
+
--------------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.morph.unsupervised_morph
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
indic_nlp_library/docs/indicnlp.normalize.rst
ADDED
@@ -0,0 +1,15 @@
1 |
+
normalize Package
|
2 |
+
=================
|
3 |
+
|
4 |
+
:mod:`indic_normalize` Module
|
5 |
+
-----------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.normalize.indic_normalize
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
12 |
+
.. autoclass:: indicnlp.normalize.indic_normalize.
|
13 |
+
:members:
|
14 |
+
:undoc-members:
|
15 |
+
:show-inheritance:
|
indic_nlp_library/docs/indicnlp.pdf
ADDED
Binary file (38.1 kB). View file
|
|
indic_nlp_library/docs/indicnlp.rst
ADDED
@@ -0,0 +1,47 @@
1 |
+
indicnlp Package
|
2 |
+
================
|
3 |
+
|
4 |
+
:mod:`common` Module
|
5 |
+
--------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.common
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
12 |
+
:mod:`langinfo` Module
|
13 |
+
----------------------
|
14 |
+
|
15 |
+
.. automodule:: indicnlp.langinfo
|
16 |
+
:members:
|
17 |
+
:undoc-members:
|
18 |
+
:show-inheritance:
|
19 |
+
|
20 |
+
:mod:`loader` Module
|
21 |
+
--------------------
|
22 |
+
|
23 |
+
.. automodule:: indicnlp.loader
|
24 |
+
:members:
|
25 |
+
:undoc-members:
|
26 |
+
:show-inheritance:
|
27 |
+
|
28 |
+
Subpackages
|
29 |
+
-----------
|
30 |
+
|
31 |
+
.. toctree::
|
32 |
+
|
33 |
+
indicnlp.cli
|
34 |
+
indicnlp.morph
|
35 |
+
indicnlp.normalize
|
36 |
+
indicnlp.script
|
37 |
+
indicnlp.syllable
|
38 |
+
indicnlp.tokenize
|
39 |
+
indicnlp.transliterate
|
40 |
+
|
41 |
+
Indices and tables
|
42 |
+
==================
|
43 |
+
|
44 |
+
* :ref:`genindex`
|
45 |
+
* :ref:`modindex`
|
46 |
+
* :ref:`search`
|
47 |
+
|
indic_nlp_library/docs/indicnlp.script.rst
ADDED
@@ -0,0 +1,26 @@
1 |
+
script Package
|
2 |
+
==============
|
3 |
+
|
4 |
+
:mod:`indic_scripts` Module
|
5 |
+
---------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.script.indic_scripts
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
12 |
+
:mod:`english_script` Module
|
13 |
+
----------------------------
|
14 |
+
|
15 |
+
.. automodule:: indicnlp.script.english_script
|
16 |
+
:members:
|
17 |
+
:undoc-members:
|
18 |
+
:show-inheritance:
|
19 |
+
|
20 |
+
:mod:`phonetic_sim` Module
|
21 |
+
---------------------------
|
22 |
+
|
23 |
+
.. automodule:: indicnlp.script.phonetic_sim
|
24 |
+
:members:
|
25 |
+
:undoc-members:
|
26 |
+
:show-inheritance:
|
indic_nlp_library/docs/indicnlp.syllable.rst
ADDED
@@ -0,0 +1,11 @@
|
1 |
+
syllable Package
|
2 |
+
================
|
3 |
+
|
4 |
+
:mod:`syllabifier` Module
|
5 |
+
---------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.syllable.syllabifier
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
indic_nlp_library/docs/indicnlp.tokenize.rst
ADDED
@@ -0,0 +1,26 @@
1 |
+
tokenize Package
|
2 |
+
================
|
3 |
+
|
4 |
+
:mod:`indic_tokenize` Module
|
5 |
+
----------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.tokenize.indic_tokenize
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
12 |
+
:mod:`indic_detokenize` Module
|
13 |
+
------------------------------
|
14 |
+
|
15 |
+
.. automodule:: indicnlp.tokenize.indic_detokenize
|
16 |
+
:members:
|
17 |
+
:undoc-members:
|
18 |
+
:show-inheritance:
|
19 |
+
|
20 |
+
:mod:`sentence_tokenize` Module
|
21 |
+
-------------------------------
|
22 |
+
|
23 |
+
.. automodule:: indicnlp.tokenize.sentence_tokenize
|
24 |
+
:members:
|
25 |
+
:undoc-members:
|
26 |
+
:show-inheritance:
|
indic_nlp_library/docs/indicnlp.transliterate.rst
ADDED
@@ -0,0 +1,34 @@
1 |
+
transliterate Package
|
2 |
+
=====================
|
3 |
+
|
4 |
+
:mod:`sinhala_transliterator` Module
|
5 |
+
------------------------------------
|
6 |
+
|
7 |
+
.. automodule:: indicnlp.transliterate.sinhala_transliterator
|
8 |
+
:members:
|
9 |
+
:undoc-members:
|
10 |
+
:show-inheritance:
|
11 |
+
|
12 |
+
:mod:`unicode_transliterate` Module
|
13 |
+
-----------------------------------
|
14 |
+
|
15 |
+
.. automodule:: indicnlp.transliterate.unicode_transliterate
|
16 |
+
:members:
|
17 |
+
:undoc-members:
|
18 |
+
:show-inheritance:
|
19 |
+
|
20 |
+
:mod:`acronym_transliterator` Module
|
21 |
+
-----------------------------------
|
22 |
+
|
23 |
+
.. automodule:: indicnlp.transliterate.acronym_transliterator
|
24 |
+
:members:
|
25 |
+
:undoc-members:
|
26 |
+
:show-inheritance:
|
27 |
+
|
28 |
+
:mod:`script_unifier` Module
|
29 |
+
-----------------------------------
|
30 |
+
|
31 |
+
.. automodule:: indicnlp.transliterate.script_unifier
|
32 |
+
:members:
|
33 |
+
:undoc-members:
|
34 |
+
:show-inheritance:
|
indic_nlp_library/docs/make.bat
ADDED
@@ -0,0 +1,35 @@
1 |
+
@ECHO OFF
|
2 |
+
|
3 |
+
pushd %~dp0
|
4 |
+
|
5 |
+
REM Command file for Sphinx documentation
|
6 |
+
|
7 |
+
if "%SPHINXBUILD%" == "" (
|
8 |
+
set SPHINXBUILD=sphinx-build
|
9 |
+
)
|
10 |
+
set SOURCEDIR=.
|
11 |
+
set BUILDDIR=_build
|
12 |
+
|
13 |
+
if "%1" == "" goto help
|
14 |
+
|
15 |
+
%SPHINXBUILD% >NUL 2>NUL
|
16 |
+
if errorlevel 9009 (
|
17 |
+
echo.
|
18 |
+
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
19 |
+
echo.installed, then set the SPHINXBUILD environment variable to point
|
20 |
+
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
21 |
+
echo.may add the Sphinx directory to PATH.
|
22 |
+
echo.
|
23 |
+
echo.If you don't have Sphinx installed, grab it from
|
24 |
+
echo.http://sphinx-doc.org/
|
25 |
+
exit /b 1
|
26 |
+
)
|
27 |
+
|
28 |
+
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
29 |
+
goto end
|
30 |
+
|
31 |
+
:help
|
32 |
+
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
33 |
+
|
34 |
+
:end
|
35 |
+
popd
|
indic_nlp_library/docs/modules.rst
ADDED
@@ -0,0 +1,7 @@
1 |
+
indicnlp
|
2 |
+
========
|
3 |
+
|
4 |
+
.. toctree::
|
5 |
+
:maxdepth: 4
|
6 |
+
|
7 |
+
indicnlp
|
indic_nlp_library/indicnlp/__init__.py
ADDED
@@ -0,0 +1,10 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
|
4 |
+
try:
|
5 |
+
from .version import __version__ # noqa
|
6 |
+
except ImportError:
|
7 |
+
version_txt = os.path.join(os.path.dirname(__file__), "version.txt")
|
8 |
+
with open(version_txt) as f:
|
9 |
+
__version__ = f.read().strip()
|
10 |
+
|
indic_nlp_library/indicnlp/cli/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/cli/cliparser.py
ADDED
@@ -0,0 +1,266 @@
1 |
+
import argparse
|
2 |
+
import sys
|
3 |
+
|
4 |
+
from indicnlp import loader
|
5 |
+
from indicnlp.tokenize import indic_tokenize
|
6 |
+
from indicnlp.tokenize import indic_detokenize
|
7 |
+
from indicnlp.normalize import indic_normalize
|
8 |
+
from indicnlp.morph import unsupervised_morph
|
9 |
+
from indicnlp.tokenize import sentence_tokenize
|
10 |
+
from indicnlp.syllable import syllabifier
|
11 |
+
from indicnlp.transliterate import unicode_transliterate
|
12 |
+
from indicnlp.transliterate import script_unifier
|
13 |
+
|
14 |
+
DEFAULT_ENCODING='utf-8'
|
15 |
+
|
16 |
+
def run_detokenize(args):
|
17 |
+
for line in args.infile:
|
18 |
+
args.outfile.write(indic_detokenize.trivial_detokenize(line,args.lang))
|
19 |
+
|
20 |
+
def run_tokenize(args):
|
21 |
+
for line in args.infile:
|
22 |
+
args.outfile.write(' '.join(
|
23 |
+
indic_tokenize.trivial_tokenize(line,args.lang)))
|
24 |
+
|
25 |
+
def run_sentence_split(args):
|
26 |
+
text=' '.join([ l.replace('\n','').replace('\r','') for l in args.infile])
|
27 |
+
outlines=sentence_tokenize.sentence_split(text,args.lang)
|
28 |
+
for line in outlines:
|
29 |
+
args.outfile.write(line+'\n')
|
30 |
+
|
31 |
+
def run_normalize(args):
|
32 |
+
|
33 |
+
# TODO: add more options to cli
|
34 |
+
remove_nuktas=False
|
35 |
+
normalize_nasals='do_nothing'
|
36 |
+
|
37 |
+
# create normalizer
|
38 |
+
factory=indic_normalize.IndicNormalizerFactory()
|
39 |
+
normalizer=factory.get_normalizer(args.lang,
|
40 |
+
remove_nuktas=remove_nuktas,
|
41 |
+
nasals_mode=normalize_nasals)
|
42 |
+
|
43 |
+
# DO normalization
|
44 |
+
for line in args.infile:
|
45 |
+
normalized_line=normalizer.normalize(line)
|
46 |
+
args.outfile.write(normalized_line)
|
47 |
+
|
48 |
+
def run_morph(args):
|
49 |
+
|
50 |
+
add_marker=False
|
51 |
+
analyzer=unsupervised_morph.UnsupervisedMorphAnalyzer(args.lang,add_marker)
|
52 |
+
for line in args.infile:
|
53 |
+
morph_tokens=analyzer.morph_analyze_document(line.strip().split(' '))
|
54 |
+
args.outfile.write(' '.join(morph_tokens) + '\n')
|
55 |
+
|
56 |
+
def run_syllabify(args):
|
57 |
+
for line in args.infile:
|
58 |
+
new_line = ' '.join(
|
59 |
+
[ ' '.join(syllabifier.orthographic_syllabify(w,args.lang))
|
60 |
+
for w in line.strip().split(' ') ]
|
61 |
+
)
|
62 |
+
args.outfile.write(new_line+'\n')
|
63 |
+
|
64 |
+
def run_wc(args):
|
65 |
+
# if args.l==False and args.w==False and args.c==False:
|
66 |
+
# args.l, args.w, args.c= True, True, True
|
67 |
+
|
68 |
+
nl=0
|
69 |
+
nw=0
|
70 |
+
nc=0
|
71 |
+
|
72 |
+
for line in args.infile:
|
73 |
+
nl+=1
|
74 |
+
nw+=len(line.strip(' ').split(' '))
|
75 |
+
nc+=len(line)
|
76 |
+
|
77 |
+
print('{} {} {}'.format(nl,nw,nc))
|
78 |
+
|
79 |
+
def run_indic2roman(args):
|
80 |
+
for line in args.infile:
|
81 |
+
transliterated_line=unicode_transliterate.ItransTransliterator.to_itrans(
|
82 |
+
line,args.lang)
|
83 |
+
args.outfile.write(transliterated_line)
|
84 |
+
|
85 |
+
def run_roman2indic(args):
|
86 |
+
for line in args.infile:
|
87 |
+
transliterated_line=unicode_transliterate.ItransTransliterator.from_itrans(
|
88 |
+
line,args.lang)
|
89 |
+
args.outfile.write(transliterated_line)
|
90 |
+
|
91 |
+
def run_script_unify(args):
|
92 |
+
|
93 |
+
unifier=None
|
94 |
+
|
95 |
+
if args.mode=='aggressive':
|
96 |
+
unifier=script_unifier.AggressiveScriptUnifier(nasals_mode='to_anusvaara_relaxed', common_lang=args.common_lang)
|
97 |
+
|
98 |
+
elif args.mode=='basic':
|
99 |
+
unifier=script_unifier.BasicScriptUnifier(nasals_mode='do_nothing',
|
100 |
+
common_lang=args.common_lang)
|
101 |
+
|
102 |
+
elif args.mode=='naive':
|
103 |
+
unifier=script_unifier.NaiveScriptUnifier(common_lang=args.common_lang)
|
104 |
+
|
105 |
+
assert(unifier is not None)
|
106 |
+
|
107 |
+
for line in args.infile:
|
108 |
+
transliterated_line=unifier.transform(line,args.lang)
|
109 |
+
args.outfile.write(transliterated_line)
|
110 |
+
|
111 |
+
def run_script_convert(args):
|
112 |
+
for line in args.infile:
|
113 |
+
transliterated_line=unicode_transliterate.UnicodeIndicTransliterator.transliterate(
|
114 |
+
line,args.srclang,args.tgtlang)
|
115 |
+
args.outfile.write(transliterated_line)
|
116 |
+
|
117 |
+
def add_common_monolingual_args(task_parser):
|
118 |
+
task_parser.add_argument('infile',
|
119 |
+
type=argparse.FileType('r',encoding=DEFAULT_ENCODING),
|
120 |
+
nargs='?',
|
121 |
+
default=sys.stdin,
|
122 |
+
help='Input File path',
|
123 |
+
)
|
124 |
+
task_parser.add_argument('outfile',
|
125 |
+
type=argparse.FileType('w',encoding=DEFAULT_ENCODING),
|
126 |
+
nargs='?',
|
127 |
+
default=sys.stdout,
|
128 |
+
help='Output File path',
|
129 |
+
)
|
130 |
+
task_parser.add_argument('-l', '--lang',
|
131 |
+
help='Language',
|
132 |
+
)
|
133 |
+
|
134 |
+
def add_common_bilingual_args(task_parser):
|
135 |
+
task_parser.add_argument('infile',
|
136 |
+
type=argparse.FileType('r',encoding=DEFAULT_ENCODING),
|
137 |
+
nargs='?',
|
138 |
+
default=sys.stdin,
|
139 |
+
help='Input File path',
|
140 |
+
)
|
141 |
+
task_parser.add_argument('outfile',
|
142 |
+
type=argparse.FileType('w',encoding=DEFAULT_ENCODING),
|
143 |
+
nargs='?',
|
144 |
+
default=sys.stdout,
|
145 |
+
help='Output File path',
|
146 |
+
)
|
147 |
+
task_parser.add_argument('-s', '--srclang',
|
148 |
+
help='Source Language',
|
149 |
+
)
|
150 |
+
|
151 |
+
task_parser.add_argument('-t', '--tgtlang',
|
152 |
+
help='Target Language',
|
153 |
+
)
|
154 |
+
|
155 |
+
def add_tokenize_parser(subparsers):
|
156 |
+
task_parser=subparsers.add_parser('tokenize',
|
157 |
+
help='tokenizer help')
|
158 |
+
add_common_monolingual_args(task_parser)
|
159 |
+
task_parser.set_defaults(func=run_tokenize)
|
160 |
+
|
161 |
+
def add_detokenize_parser(subparsers):
|
162 |
+
task_parser=subparsers.add_parser('detokenize',
|
163 |
+
help='de-tokenizer help')
|
164 |
+
add_common_monolingual_args(task_parser)
|
165 |
+
task_parser.set_defaults(func=run_detokenize)
|
166 |
+
|
167 |
+
def add_sentence_split_parser(subparsers):
|
168 |
+
task_parser=subparsers.add_parser('sentence_split', help='sentence split help')
|
169 |
+
add_common_monolingual_args(task_parser)
|
170 |
+
task_parser.set_defaults(func=run_sentence_split)
|
171 |
+
|
172 |
+
def add_normalize_parser(subparsers):
|
173 |
+
task_parser=subparsers.add_parser('normalize', help='normalizer help')
|
174 |
+
add_common_monolingual_args(task_parser)
|
175 |
+
task_parser.set_defaults(func=run_normalize)
|
176 |
+
|
177 |
+
def add_morph_parser(subparsers):
|
178 |
+
task_parser=subparsers.add_parser('morph', help='morph help')
|
179 |
+
add_common_monolingual_args(task_parser)
|
180 |
+
task_parser.set_defaults(func=run_morph)
|
181 |
+
|
182 |
+
def add_syllabify_parser(subparsers):
|
183 |
+
task_parser=subparsers.add_parser('syllabify', help='syllabify help')
|
184 |
+
add_common_monolingual_args(task_parser)
|
185 |
+
task_parser.set_defaults(func=run_syllabify)
|
186 |
+
|
187 |
+
def add_wc_parser(subparsers):
|
188 |
+
task_parser=subparsers.add_parser('wc', help='wc help')
|
189 |
+
|
190 |
+
task_parser.add_argument('infile',
|
191 |
+
type=argparse.FileType('r',encoding=DEFAULT_ENCODING),
|
192 |
+
nargs='?',
|
193 |
+
default=sys.stdin,
|
194 |
+
help='Input File path',
|
195 |
+
)
|
196 |
+
# task_parser.add_argument('-l', action='store_true')
|
197 |
+
# task_parser.add_argument('-w', action='store_true')
|
198 |
+
# task_parser.add_argument('-c', action='store_true')
|
199 |
+
# task_parser.set_defaults(l=False)
|
200 |
+
# task_parser.set_defaults(w=False)
|
201 |
+
# task_parser.set_defaults(c=False)
|
202 |
+
|
203 |
+
task_parser.set_defaults(func=run_wc)
|
204 |
+
|
205 |
+
def add_indic2roman_parser(subparsers):
|
206 |
+
task_parser=subparsers.add_parser('indic2roman', help='indic2roman help')
|
207 |
+
add_common_monolingual_args(task_parser)
|
208 |
+
task_parser.set_defaults(func=run_indic2roman)
|
209 |
+
|
210 |
+
def add_roman2indic_parser(subparsers):
|
211 |
+
task_parser=subparsers.add_parser('roman2indic', help='roman2indic help')
|
212 |
+
add_common_monolingual_args(task_parser)
|
213 |
+
task_parser.set_defaults(func=run_roman2indic)
|
214 |
+
|
215 |
+
def add_script_unify_parser(subparsers):
|
216 |
+
task_parser=subparsers.add_parser('script_unify', help='script_unify help')
|
217 |
+
add_common_monolingual_args(task_parser)
|
218 |
+
task_parser.add_argument('-m','--mode',
|
219 |
+
default='basic',
|
220 |
+
choices=['naive', 'basic', 'aggressive'] ,
|
221 |
+
help='Script unification mode',
|
222 |
+
)
|
223 |
+
task_parser.add_argument('-c','--common_lang',
|
224 |
+
default='hi',
|
225 |
+
help='Common language in which all languages are represented',
|
226 |
+
)
|
227 |
+
|
228 |
+
task_parser.set_defaults(func=run_script_unify)
|
229 |
+
|
230 |
+
def add_script_convert_parser(subparsers):
|
231 |
+
task_parser=subparsers.add_parser('script_convert', help='script convert help')
|
232 |
+
add_common_bilingual_args(task_parser)
|
233 |
+
task_parser.set_defaults(func=run_script_convert)
|
234 |
+
|
235 |
+
def get_parser():
|
236 |
+
parser = argparse.ArgumentParser(prog='indicnlp')
|
237 |
+
subparsers = parser.add_subparsers(help='Invoke each operation with one of the subcommands', dest='subcommand')
|
238 |
+
|
239 |
+
add_tokenize_parser(subparsers)
|
240 |
+
add_detokenize_parser(subparsers)
|
241 |
+
add_sentence_split_parser(subparsers)
|
242 |
+
add_normalize_parser(subparsers)
|
243 |
+
|
244 |
+
add_morph_parser(subparsers)
|
245 |
+
add_syllabify_parser(subparsers)
|
246 |
+
|
247 |
+
add_wc_parser(subparsers)
|
248 |
+
|
249 |
+
add_indic2roman_parser(subparsers)
|
250 |
+
add_roman2indic_parser(subparsers)
|
251 |
+
add_script_unify_parser(subparsers)
|
252 |
+
|
253 |
+
add_script_convert_parser(subparsers)
|
254 |
+
|
255 |
+
return parser
|
256 |
+
|
257 |
+
def main():
|
258 |
+
parser=get_parser()
|
259 |
+
args=parser.parse_args()
|
260 |
+
# print(args)
|
261 |
+
args.func(args)
|
262 |
+
|
263 |
+
if __name__ == '__main__':
|
264 |
+
loader.load()
|
265 |
+
main()
|
266 |
+
|
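The subcommands wired up above can also be exercised programmatically. A minimal sketch, not part of this commit: the input/output file names are hypothetical, and the Indic NLP resources are assumed to be set up so that loader.load() succeeds.

from indicnlp import loader
from indicnlp.cli import cliparser

loader.load()  # initialise resources before running any task
parser = cliparser.get_parser()
args = parser.parse_args(['tokenize', 'input_hi.txt', 'output_hi.txt', '-l', 'hi'])
args.func(args)  # dispatches to run_tokenize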
indic_nlp_library/indicnlp/common.py
ADDED
@@ -0,0 +1,58 @@
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
import os
|
10 |
+
|
11 |
+
"""
|
12 |
+
Path to the Indic NLP Resources directory
|
13 |
+
"""
|
14 |
+
INDIC_RESOURCES_PATH=''
|
15 |
+
|
16 |
+
def init():
|
17 |
+
"""
|
18 |
+
Initialize the module. The following actions are performed:
|
19 |
+
|
20 |
+
- Checks if the INDIC_RESOURCES_PATH variable is set. If not, checks if it can be initialized from the
|
21 |
+
INDIC_RESOURCES_PATH environment variable. If that fails, an exception is raised
|
22 |
+
"""
|
23 |
+
global INDIC_RESOURCES_PATH
|
24 |
+
try:
|
25 |
+
if INDIC_RESOURCES_PATH=='':
|
26 |
+
INDIC_RESOURCES_PATH=os.environ['INDIC_RESOURCES_PATH']
|
27 |
+
except Exception as e:
|
28 |
+
raise IndicNlpException('INDIC_RESOURCES_PATH not set')
|
29 |
+
|
30 |
+
if INDIC_RESOURCES_PATH=='':
|
31 |
+
raise IndicNlpException('INDIC_RESOURCES_PATH not set')
|
32 |
+
|
33 |
+
|
34 |
+
|
35 |
+
def get_resources_path():
|
36 |
+
"""
|
37 |
+
Get the path to the Indic NLP Resources directory
|
38 |
+
"""
|
39 |
+
return INDIC_RESOURCES_PATH
|
40 |
+
|
41 |
+
def set_resources_path(resources_path):
|
42 |
+
"""
|
43 |
+
Set the path to the Indic NLP Resources directory
|
44 |
+
"""
|
45 |
+
global INDIC_RESOURCES_PATH
|
46 |
+
INDIC_RESOURCES_PATH=resources_path
|
47 |
+
|
48 |
+
class IndicNlpException(Exception):
|
49 |
+
"""
|
50 |
+
Exceptions thrown by Indic NLP Library components are instances of this class.
|
51 |
+
'msg' attribute contains exception details.
|
52 |
+
"""
|
53 |
+
def __init__(self, msg):
|
54 |
+
self.msg = msg
|
55 |
+
|
56 |
+
def __str__(self):
|
57 |
+
return repr(self.msg)
|
58 |
+
|
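A minimal usage sketch for the resource-path plumbing defined above; the path is hypothetical and should point at a local copy of the indic_nlp_resources data.

from indicnlp import common

common.set_resources_path('/path/to/indic_nlp_resources')  # hypothetical path
common.init()  # would raise IndicNlpException only if no path had been set
print(common.get_resources_path())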
indic_nlp_library/indicnlp/langinfo.py
ADDED
@@ -0,0 +1,488 @@
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
## language codes
|
10 |
+
LC_TA='ta'
|
11 |
+
|
12 |
+
SCRIPT_RANGES={
|
13 |
+
'pa':[0x0a00,0x0a7f] ,
|
14 |
+
'gu':[0x0a80,0x0aff] ,
|
15 |
+
'or':[0x0b00,0x0b7f] ,
|
16 |
+
'ta':[0x0b80,0x0bff] ,
|
17 |
+
'te':[0x0c00,0x0c7f] ,
|
18 |
+
'kn':[0x0c80,0x0cff] ,
|
19 |
+
'ml':[0x0d00,0x0d7f] ,
|
20 |
+
'si':[0x0d80,0x0dff] ,
|
21 |
+
'hi':[0x0900,0x097f] ,
|
22 |
+
'mr':[0x0900,0x097f] ,
|
23 |
+
'kK':[0x0900,0x097f] ,
|
24 |
+
'sa':[0x0900,0x097f] ,
|
25 |
+
'ne':[0x0900,0x097f] ,
|
26 |
+
'sd':[0x0900,0x097f] ,
|
27 |
+
'bn':[0x0980,0x09ff] ,
|
28 |
+
'as':[0x0980,0x09ff] ,
|
29 |
+
}
|
30 |
+
|
31 |
+
DRAVIDIAN_LANGUAGES=['ta', 'te', 'kn', 'ml',]
|
32 |
+
IE_LANGUAGES=['hi', 'mr', 'kK', 'sa', 'ne', 'sd', 'bn', 'as', 'pa', 'gu', 'or', 'si', ]
|
33 |
+
DANDA_DELIM_LANGUAGES=['as','bn','hi','ne','or','pa','sa','sd']
|
34 |
+
|
35 |
+
URDU_RANGES=[
|
36 |
+
[0x0600,0x06ff],
|
37 |
+
[0x0750,0x077f],
|
38 |
+
[0xfb50,0xfdff],
|
39 |
+
[0xfe70,0xfeff],
|
40 |
+
]
|
41 |
+
|
42 |
+
COORDINATED_RANGE_START_INCLUSIVE=0
|
43 |
+
COORDINATED_RANGE_END_INCLUSIVE=0x6f
|
44 |
+
|
45 |
+
NUMERIC_OFFSET_START=0x66
|
46 |
+
NUMERIC_OFFSET_END=0x6f
|
47 |
+
|
48 |
+
HALANTA_OFFSET=0x4d
|
49 |
+
AUM_OFFSET=0x50
|
50 |
+
NUKTA_OFFSET=0x3c
|
51 |
+
|
52 |
+
RUPEE_SIGN=0x20b9
|
53 |
+
|
54 |
+
DANDA=0x0964
|
55 |
+
DOUBLE_DANDA=0x0965
|
56 |
+
|
57 |
+
#TODO: add missing fricatives and approximants
|
58 |
+
VELAR_RANGE=[0x15,0x19]
|
59 |
+
PALATAL_RANGE=[0x1a,0x1e]
|
60 |
+
RETROFLEX_RANGE=[0x1f,0x23]
|
61 |
+
DENTAL_RANGE=[0x24,0x29]
|
62 |
+
LABIAL_RANGE=[0x2a,0x2e]
|
63 |
+
|
64 |
+
# verify
|
65 |
+
VOICED_LIST=[0x17,0x18,0x1c,0x1d,0x21,0x22,0x26,0x27,0x2c,0x2d]
|
66 |
+
UNVOICED_LIST=[0x15,0x16,0x1a,0x1b,0x1f,0x20,0x24,0x25,0x2a,0x2b] #TODO: add sibilants/sonorants
|
67 |
+
ASPIRATED_LIST=[0x16,0x18,0x1b,0x1d,0x20,0x22,0x25,0x27,0x2b,0x2d]
|
68 |
+
UNASPIRATED_LIST=[0x15,0x17,0x1a,0x1c,0x1f,0x21,0x24,0x26,0x2a,0x2c]
|
69 |
+
NASAL_LIST=[0x19,0x1e,0x23,0x28,0x29,0x2d]
|
70 |
+
FRICATIVE_LIST=[0x36,0x37,0x38]
|
71 |
+
APPROXIMANT_LIST=[0x2f,0x30,0x31,0x32,0x33,0x34,0x35]
|
72 |
+
|
73 |
+
#TODO: ha has to be properly categorized
|
74 |
+
|
75 |
+
def is_danda_delim(lang):
|
76 |
+
"""
|
77 |
+
Returns True if danda/double danda is a possible delimiter for the language
|
78 |
+
"""
|
79 |
+
return lang in DANDA_DELIM_LANGUAGES
|
80 |
+
|
81 |
+
def get_offset(c,lang):
|
82 |
+
"""
|
83 |
+
Applicable to Brahmi derived Indic scripts
|
84 |
+
"""
|
85 |
+
return ord(c)-SCRIPT_RANGES[lang][0]
|
86 |
+
|
87 |
+
def offset_to_char(c,lang):
|
88 |
+
"""
|
89 |
+
Applicable to Brahmi derived Indic scripts
|
90 |
+
"""
|
91 |
+
return chr(c+SCRIPT_RANGES[lang][0])
|
92 |
+
|
93 |
+
def in_coordinated_range(c_offset):
|
94 |
+
"""
|
95 |
+
Applicable to Brahmi derived Indic scripts
|
96 |
+
"""
|
97 |
+
return (c_offset>=COORDINATED_RANGE_START_INCLUSIVE and c_offset<=COORDINATED_RANGE_END_INCLUSIVE)
|
98 |
+
|
99 |
+
def is_indiclang_char(c,lang):
|
100 |
+
"""
|
101 |
+
Applicable to Brahmi derived Indic scripts
|
102 |
+
"""
|
103 |
+
o=get_offset(c,lang)
|
104 |
+
return (o>=0 and o<=0x7f) or ord(c)==DANDA or ord(c)==DOUBLE_DANDA
|
105 |
+
|
106 |
+
# def is_vowel(c,lang):
|
107 |
+
# """
|
108 |
+
# Is the character a vowel
|
109 |
+
# """
|
110 |
+
# o=get_offset(c,lang)
|
111 |
+
# return (o>=0x04 and o<=0x14)
|
112 |
+
|
113 |
+
# def is_vowel_sign(c,lang):
|
114 |
+
# """
|
115 |
+
# Is the character a vowel sign (maatraa)
|
116 |
+
# """
|
117 |
+
# o=get_offset(c,lang)
|
118 |
+
# return (o>=0x3e and o<=0x4c)
|
119 |
+
|
120 |
+
# def is_halanta(c,lang):
|
121 |
+
# """
|
122 |
+
# Is the character the halanta character
|
123 |
+
# """
|
124 |
+
# o=get_offset(c,lang)
|
125 |
+
# return (o==HALANTA_OFFSET)
|
126 |
+
|
127 |
+
# def is_nukta(c,lang):
|
128 |
+
# """
|
129 |
+
# Is the character the halanta character
|
130 |
+
# """
|
131 |
+
# o=get_offset(c,lang)
|
132 |
+
# return (o==NUKTA_OFFSET)
|
133 |
+
|
134 |
+
# def is_aum(c,lang):
|
135 |
+
# """
|
136 |
+
# Is the character a vowel sign (maatraa)
|
137 |
+
# """
|
138 |
+
# o=get_offset(c,lang)
|
139 |
+
# return (o==AUM_OFFSET)
|
140 |
+
|
141 |
+
# def is_consonant(c,lang):
|
142 |
+
# """
|
143 |
+
# Is the character a consonant
|
144 |
+
# """
|
145 |
+
# o=get_offset(c,lang)
|
146 |
+
# return (o>=0x15 and o<=0x39)
|
147 |
+
|
148 |
+
# def is_velar(c,lang):
|
149 |
+
# """
|
150 |
+
# Is the character a velar
|
151 |
+
# """
|
152 |
+
# o=get_offset(c,lang)
|
153 |
+
# return (o>=VELAR_RANGE[0] and o<=VELAR_RANGE[1])
|
154 |
+
|
155 |
+
# def is_palatal(c,lang):
|
156 |
+
# """
|
157 |
+
# Is the character a palatal
|
158 |
+
# """
|
159 |
+
# o=get_offset(c,lang)
|
160 |
+
# return (o>=PALATAL_RANGE[0] and o<=PALATAL_RANGE[1])
|
161 |
+
|
162 |
+
# def is_retroflex(c,lang):
|
163 |
+
# """
|
164 |
+
# Is the character a retroflex
|
165 |
+
# """
|
166 |
+
# o=get_offset(c,lang)
|
167 |
+
# return (o>=RETROFLEX_RANGE[0] and o<=RETROFLEX_RANGE[1])
|
168 |
+
|
169 |
+
# def is_dental(c,lang):
|
170 |
+
# """
|
171 |
+
# Is the character a dental
|
172 |
+
# """
|
173 |
+
# o=get_offset(c,lang)
|
174 |
+
# return (o>=DENTAL_RANGE[0] and o<=DENTAL_RANGE[1])
|
175 |
+
|
176 |
+
# def is_labial(c,lang):
|
177 |
+
# """
|
178 |
+
# Is the character a labial
|
179 |
+
# """
|
180 |
+
# o=get_offset(c,lang)
|
181 |
+
# return (o>=LABIAL_RANGE[0] and o<=LABIAL_RANGE[1])
|
182 |
+
|
183 |
+
# def is_voiced(c,lang):
|
184 |
+
# """
|
185 |
+
# Is the character a voiced consonant
|
186 |
+
# """
|
187 |
+
# o=get_offset(c,lang)
|
188 |
+
# return o in VOICED_LIST
|
189 |
+
|
190 |
+
# def is_unvoiced(c,lang):
|
191 |
+
# """
|
192 |
+
# Is the character a unvoiced consonant
|
193 |
+
# """
|
194 |
+
# o=get_offset(c,lang)
|
195 |
+
# return o in UNVOICED_LIST
|
196 |
+
|
197 |
+
# def is_aspirated(c,lang):
|
198 |
+
# """
|
199 |
+
# Is the character a aspirated consonant
|
200 |
+
# """
|
201 |
+
# o=get_offset(c,lang)
|
202 |
+
# return o in ASPIRATED_LIST
|
203 |
+
|
204 |
+
# def is_unaspirated(c,lang):
|
205 |
+
# """
|
206 |
+
# Is the character a unaspirated consonant
|
207 |
+
# """
|
208 |
+
# o=get_offset(c,lang)
|
209 |
+
# return o in UNASPIRATED_LIST
|
210 |
+
|
211 |
+
# def is_nasal(c,lang):
|
212 |
+
# """
|
213 |
+
# Is the character a nasal consonant
|
214 |
+
# """
|
215 |
+
# o=get_offset(c,lang)
|
216 |
+
# return o in NASAL_LIST
|
217 |
+
|
218 |
+
# def is_fricative(c,lang):
|
219 |
+
# """
|
220 |
+
# Is the character a fricative consonant
|
221 |
+
# """
|
222 |
+
# o=get_offset(c,lang)
|
223 |
+
# return o in FRICATIVE_LIST
|
224 |
+
|
225 |
+
# def is_approximant(c,lang):
|
226 |
+
# """
|
227 |
+
# Is the character an approximant consonant
|
228 |
+
# """
|
229 |
+
# o=get_offset(c,lang)
|
230 |
+
# return o in APPROXIMANT_LIST
|
231 |
+
|
232 |
+
# def is_number(c,lang):
|
233 |
+
# """
|
234 |
+
# Is the character a number
|
235 |
+
# """
|
236 |
+
# o=get_offset(c,lang)
|
237 |
+
# return (o>=0x66 and o<=0x6f)
|
238 |
+
|
239 |
+
|
240 |
+
def is_vowel(c,lang):
|
241 |
+
"""
|
242 |
+
Is the character a vowel
|
243 |
+
"""
|
244 |
+
o=get_offset(c,lang)
|
245 |
+
return (o>=0x04 and o<=0x14)
|
246 |
+
|
247 |
+
def is_vowel_sign(c,lang):
|
248 |
+
"""
|
249 |
+
Is the character a vowel sign (maatraa)
|
250 |
+
"""
|
251 |
+
o=get_offset(c,lang)
|
252 |
+
return (o>=0x3e and o<=0x4c)
|
253 |
+
|
254 |
+
def is_halanta(c,lang):
|
255 |
+
"""
|
256 |
+
Is the character the halanta character
|
257 |
+
"""
|
258 |
+
o=get_offset(c,lang)
|
259 |
+
return (o==HALANTA_OFFSET)
|
260 |
+
|
261 |
+
def is_nukta(c,lang):
|
262 |
+
"""
|
263 |
+
Is the character the nukta character
|
264 |
+
"""
|
265 |
+
o=get_offset(c,lang)
|
266 |
+
return (o==NUKTA_OFFSET)
|
267 |
+
|
268 |
+
def is_aum(c,lang):
|
269 |
+
"""
|
270 |
+
Is the character the aum character
|
271 |
+
"""
|
272 |
+
o=get_offset(c,lang)
|
273 |
+
return (o==AUM_OFFSET)
|
274 |
+
|
275 |
+
def is_consonant(c,lang):
|
276 |
+
"""
|
277 |
+
Is the character a consonant
|
278 |
+
"""
|
279 |
+
o=get_offset(c,lang)
|
280 |
+
return (o>=0x15 and o<=0x39)
|
281 |
+
|
282 |
+
def is_velar(c,lang):
|
283 |
+
"""
|
284 |
+
Is the character a velar
|
285 |
+
"""
|
286 |
+
o=get_offset(c,lang)
|
287 |
+
return (o>=VELAR_RANGE[0] and o<=VELAR_RANGE[1])
|
288 |
+
|
289 |
+
def is_palatal(c,lang):
|
290 |
+
"""
|
291 |
+
Is the character a palatal
|
292 |
+
"""
|
293 |
+
o=get_offset(c,lang)
|
294 |
+
return (o>=PALATAL_RANGE[0] and o<=PALATAL_RANGE[1])
|
295 |
+
|
296 |
+
def is_retroflex(c,lang):
|
297 |
+
"""
|
298 |
+
Is the character a retroflex
|
299 |
+
"""
|
300 |
+
o=get_offset(c,lang)
|
301 |
+
return (o>=RETROFLEX_RANGE[0] and o<=RETROFLEX_RANGE[1])
|
302 |
+
|
303 |
+
def is_dental(c,lang):
|
304 |
+
"""
|
305 |
+
Is the character a dental
|
306 |
+
"""
|
307 |
+
o=get_offset(c,lang)
|
308 |
+
return (o>=DENTAL_RANGE[0] and o<=DENTAL_RANGE[1])
|
309 |
+
|
310 |
+
def is_labial(c,lang):
|
311 |
+
"""
|
312 |
+
Is the character a labial
|
313 |
+
"""
|
314 |
+
o=get_offset(c,lang)
|
315 |
+
return (o>=LABIAL_RANGE[0] and o<=LABIAL_RANGE[1])
|
316 |
+
|
317 |
+
def is_voiced(c,lang):
|
318 |
+
"""
|
319 |
+
Is the character a voiced consonant
|
320 |
+
"""
|
321 |
+
o=get_offset(c,lang)
|
322 |
+
return o in VOICED_LIST
|
323 |
+
|
324 |
+
def is_unvoiced(c,lang):
|
325 |
+
"""
|
326 |
+
Is the character an unvoiced consonant
|
327 |
+
"""
|
328 |
+
o=get_offset(c,lang)
|
329 |
+
return o in UNVOICED_LIST
|
330 |
+
|
331 |
+
def is_aspirated(c,lang):
|
332 |
+
"""
|
333 |
+
Is the character an aspirated consonant
|
334 |
+
"""
|
335 |
+
o=get_offset(c,lang)
|
336 |
+
return o in ASPIRATED_LIST
|
337 |
+
|
338 |
+
def is_unaspirated(c,lang):
|
339 |
+
"""
|
340 |
+
Is the character an unaspirated consonant
|
341 |
+
"""
|
342 |
+
o=get_offset(c,lang)
|
343 |
+
return o in UNASPIRATED_LIST
|
344 |
+
|
345 |
+
def is_nasal(c,lang):
|
346 |
+
"""
|
347 |
+
Is the character a nasal consonant
|
348 |
+
"""
|
349 |
+
o=get_offset(c,lang)
|
350 |
+
return o in NASAL_LIST
|
351 |
+
|
352 |
+
def is_fricative(c,lang):
|
353 |
+
"""
|
354 |
+
Is the character a fricative consonant
|
355 |
+
"""
|
356 |
+
o=get_offset(c,lang)
|
357 |
+
return o in FRICATIVE_LIST
|
358 |
+
|
359 |
+
def is_approximant(c,lang):
|
360 |
+
"""
|
361 |
+
Is the character an approximant consonant
|
362 |
+
"""
|
363 |
+
o=get_offset(c,lang)
|
364 |
+
return o in APPROXIMANT_LIST
|
365 |
+
|
366 |
+
def is_number(c,lang):
|
367 |
+
"""
|
368 |
+
Is the character a number
|
369 |
+
"""
|
370 |
+
o=get_offset(c,lang)
|
371 |
+
return (o>=0x66 and o<=0x6f)
|
372 |
+
|
373 |
+
|
374 |
+
##################################################
|
375 |
+
|
376 |
+
def is_vowel_offset(c_offset):
|
377 |
+
"""
|
378 |
+
Is the offset a vowel
|
379 |
+
"""
|
380 |
+
return (c_offset>=0x04 and c_offset<=0x14)
|
381 |
+
|
382 |
+
def is_vowel_sign_offset(c_offset):
|
383 |
+
"""
|
384 |
+
Is the offset a vowel sign (maatraa)
|
385 |
+
"""
|
386 |
+
return (c_offset>=0x3e and c_offset<=0x4c)
|
387 |
+
|
388 |
+
def is_halanta_offset(c_offset):
|
389 |
+
"""
|
390 |
+
Is the offset the halanta offset
|
391 |
+
"""
|
392 |
+
return (c_offset==HALANTA_OFFSET)
|
393 |
+
|
394 |
+
def is_nukta_offset(c_offset):
|
395 |
+
"""
|
396 |
+
Is the offset the nukta offset
|
397 |
+
"""
|
398 |
+
return (c_offset==NUKTA_OFFSET)
|
399 |
+
|
400 |
+
def is_aum_offset(c_offset):
|
401 |
+
"""
|
402 |
+
Is the offset the aum offset
|
403 |
+
"""
|
404 |
+
return (c_offset==AUM_OFFSET)
|
405 |
+
|
406 |
+
def is_consonant_offset(c_offset):
|
407 |
+
"""
|
408 |
+
Is the offset a consonant
|
409 |
+
"""
|
410 |
+
return (c_offset>=0x15 and c_offset<=0x39)
|
411 |
+
|
412 |
+
def is_velar_offset(c_offset):
|
413 |
+
"""
|
414 |
+
Is the offset a velar
|
415 |
+
"""
|
416 |
+
return (c_offset>=VELAR_RANGE[0] and c_offset<=VELAR_RANGE[1])
|
417 |
+
|
418 |
+
def is_palatal_offset(c_offset):
|
419 |
+
"""
|
420 |
+
Is the offset a palatal
|
421 |
+
"""
|
422 |
+
return (c_offset>=PALATAL_RANGE[0] and c_offset<=PALATAL_RANGE[1])
|
423 |
+
|
424 |
+
def is_retroflex_offset(c_offset):
|
425 |
+
"""
|
426 |
+
Is the offset a retroflex
|
427 |
+
"""
|
428 |
+
return (c_offset>=RETROFLEX_RANGE[0] and c_offset<=RETROFLEX_RANGE[1])
|
429 |
+
|
430 |
+
def is_dental_offset(c_offset):
|
431 |
+
"""
|
432 |
+
Is the offset a dental
|
433 |
+
"""
|
434 |
+
return (c_offset>=DENTAL_RANGE[0] and c_offset<=DENTAL_RANGE[1])
|
435 |
+
|
436 |
+
def is_labial_offset(c_offset):
|
437 |
+
"""
|
438 |
+
Is the offset a labial
|
439 |
+
"""
|
440 |
+
return (c_offset>=LABIAL_RANGE[0] and c_offset<=LABIAL_RANGE[1])
|
441 |
+
|
442 |
+
def is_voiced_offset(c_offset):
|
443 |
+
"""
|
444 |
+
Is the offset a voiced consonant
|
445 |
+
"""
|
446 |
+
return c_offset in VOICED_LIST
|
447 |
+
|
448 |
+
def is_unvoiced_offset(c_offset):
|
449 |
+
"""
|
450 |
+
Is the offset an unvoiced consonant
|
451 |
+
"""
|
452 |
+
return c_offset in UNVOICED_LIST
|
453 |
+
|
454 |
+
def is_aspirated_offset(c_offset):
|
455 |
+
"""
|
456 |
+
Is the offset an aspirated consonant
|
457 |
+
"""
|
458 |
+
return c_offset in ASPIRATED_LIST
|
459 |
+
|
460 |
+
def is_unaspirated_offset(c_offset):
|
461 |
+
"""
|
462 |
+
Is the offset an unaspirated consonant
|
463 |
+
"""
|
464 |
+
return c_offset in UNASPIRATED_LIST
|
465 |
+
|
466 |
+
def is_nasal_offset(c_offset):
|
467 |
+
"""
|
468 |
+
Is the offset a nasal consonant
|
469 |
+
"""
|
470 |
+
return c_offset in NASAL_LIST
|
471 |
+
|
472 |
+
def is_fricative_offset(c_offset):
|
473 |
+
"""
|
474 |
+
Is the offset a fricative consonant
|
475 |
+
"""
|
476 |
+
return c_offset in FRICATIVE_LIST
|
477 |
+
|
478 |
+
def is_approximant_offset(c_offset):
|
479 |
+
"""
|
480 |
+
Is the offset an approximant consonant
|
481 |
+
"""
|
482 |
+
return c_offset in APPROXIMANT_LIST
|
483 |
+
|
484 |
+
def is_number_offset(c_offset):
|
485 |
+
"""
|
486 |
+
Is the offset a number
|
487 |
+
"""
|
488 |
+
return (c_offset>=0x66 and c_offset<=0x6f)
|
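A minimal sketch of how the offset-based helpers above are meant to be used; this module needs no external resources.

from indicnlp import langinfo

ka = '\u0915'                                  # DEVANAGARI LETTER KA
print(langinfo.get_offset(ka, 'hi'))           # 21 (0x15)
print(langinfo.is_consonant(ka, 'hi'))         # True
print(langinfo.is_vowel(ka, 'hi'))             # False
print(langinfo.offset_to_char(0x15, 'ta'))     # the same offset mapped into the Tamil block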
indic_nlp_library/indicnlp/loader.py
ADDED
@@ -0,0 +1,35 @@
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
from indicnlp import common
|
10 |
+
from indicnlp.script import indic_scripts
|
11 |
+
from indicnlp.script import english_script
|
12 |
+
from indicnlp.transliterate import unicode_transliterate
|
13 |
+
|
14 |
+
def load():
|
15 |
+
"""
|
16 |
+
Initializes the Indic NLP library. Clients should call this method before using the library.
|
17 |
+
|
18 |
+
Any module requiring initialization should have an init() method, to which a call must be made from this method
|
19 |
+
"""
|
20 |
+
|
21 |
+
### Order of initialization may matter
|
22 |
+
|
23 |
+
# Common has to be loaded first to get access to resources
|
24 |
+
common.init()
|
25 |
+
|
26 |
+
## Initialization of Indic scripts module
|
27 |
+
indic_scripts.init()
|
28 |
+
|
29 |
+
## Initialization of English scripts module
|
30 |
+
english_script.init()
|
31 |
+
|
32 |
+
## Initialization of unicode_transliterate module
|
33 |
+
unicode_transliterate.init()
|
34 |
+
|
35 |
+
|
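A minimal sketch of the intended initialisation sequence using the INDIC_RESOURCES_PATH environment variable; the path is hypothetical.

import os
os.environ['INDIC_RESOURCES_PATH'] = '/path/to/indic_nlp_resources'  # hypothetical path

from indicnlp import loader
loader.load()  # runs common.init(), indic_scripts.init(), english_script.init(), unicode_transliterate.init()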
indic_nlp_library/indicnlp/morph/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/morph/unsupervised_morph.py
ADDED
@@ -0,0 +1,142 @@
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
import codecs, sys, itertools,re,os
|
10 |
+
import morfessor
|
11 |
+
|
12 |
+
from functools import lru_cache
|
13 |
+
|
14 |
+
from indicnlp import langinfo
|
15 |
+
from indicnlp import common
|
16 |
+
from indicnlp.tokenize import indic_tokenize
|
17 |
+
|
18 |
+
# Unsupervised Morphological Analyser for Indian languages.
|
19 |
+
#
|
20 |
+
# @author Anoop Kunchukuttan
|
21 |
+
#
|
22 |
+
|
23 |
+
class MorphAnalyzerI(object):
|
24 |
+
"""
|
25 |
+
Interface for Morph Analyzer
|
26 |
+
"""
|
27 |
+
|
28 |
+
def morph_analyze(word):
|
29 |
+
pass
|
30 |
+
|
31 |
+
def morph_analyze_document(tokens):
|
32 |
+
pass
|
33 |
+
|
34 |
+
class UnsupervisedMorphAnalyzer(MorphAnalyzerI):
|
35 |
+
"""
|
36 |
+
Unsupervised Morphological analyser built using Morfessor 2.0
|
37 |
+
"""
|
38 |
+
|
39 |
+
def __init__(self,lang,add_marker=False):
|
40 |
+
self.lang=lang
|
41 |
+
self.add_marker=add_marker
|
42 |
+
|
43 |
+
io = morfessor.MorfessorIO()
|
44 |
+
self._morfessor_model=io.read_any_model(os.path.join(common.INDIC_RESOURCES_PATH,'morph','morfessor','{}.model'.format(lang)))
|
45 |
+
|
46 |
+
self._script_range_pat=r'^[{}-{}]+$'.format(chr(langinfo.SCRIPT_RANGES[lang][0]),chr(langinfo.SCRIPT_RANGES[lang][1]))
|
47 |
+
self._script_check_re=re.compile(self._script_range_pat)
|
48 |
+
|
49 |
+
def _contains_number(self,text):
|
50 |
+
if self.lang in langinfo.SCRIPT_RANGES:
|
51 |
+
for c in text:
|
52 |
+
offset=ord(c)-langinfo.SCRIPT_RANGES[self.lang][0]
|
53 |
+
if offset >=langinfo.NUMERIC_OFFSET_START and offset <= langinfo.NUMERIC_OFFSET_END:
|
54 |
+
return True
|
55 |
+
return False
|
56 |
+
|
57 |
+
def _morphanalysis_needed(self,word):
|
58 |
+
return self._script_check_re.match(word) and not self._contains_number(word)
|
59 |
+
|
60 |
+
@lru_cache(maxsize=16384)
|
61 |
+
def morph_analyze(self,word):
|
62 |
+
"""
|
63 |
+
Morphanalyzes a single word and returns a list of component morphemes
|
64 |
+
|
65 |
+
@param word: string input word
|
66 |
+
"""
|
67 |
+
m_list=[]
|
68 |
+
if self._morphanalysis_needed(word):
|
69 |
+
val=self._morfessor_model.viterbi_segment(word)
|
70 |
+
m_list=val[0]
|
71 |
+
if self.add_marker:
|
72 |
+
m_list= [ '{}_S_'.format(m) if i>0 else '{}_R_'.format(m) for i,m in enumerate(m_list)]
|
73 |
+
else:
|
74 |
+
if self.add_marker:
|
75 |
+
word='{}_E_'.format(word)
|
76 |
+
m_list=[word]
|
77 |
+
return m_list
|
78 |
+
|
79 |
+
### Older implementation
|
80 |
+
#val=self._morfessor_model.viterbi_segment(word)
|
81 |
+
#m_list=val[0]
|
82 |
+
#if self.add_marker:
|
83 |
+
# m_list= [ u'{}_S_'.format(m) if i>0 else u'{}_R_'.format(m) for i,m in enumerate(m_list)]
|
84 |
+
#return m_list
|
85 |
+
|
86 |
+
|
87 |
+
def morph_analyze_document(self,tokens):
|
88 |
+
"""
|
89 |
+
Morphanalyzes a document, represented as a list of tokens
|
90 |
+
Each word is morphanalyzed and the result is a list of morphemes constituting the document
|
91 |
+
|
92 |
+
@param tokens: string sequence of words
|
93 |
+
|
94 |
+
@return list of segments in the document after morph analysis
|
95 |
+
"""
|
96 |
+
|
97 |
+
out_tokens=[]
|
98 |
+
for token in tokens:
|
99 |
+
morphs=self.morph_analyze(token)
|
100 |
+
out_tokens.extend(morphs)
|
101 |
+
return out_tokens
|
102 |
+
|
103 |
+
#### Older implementation
|
104 |
+
#out_tokens=[]
|
105 |
+
#for token in tokens:
|
106 |
+
# if self._morphanalysis_needed(token):
|
107 |
+
# morphs=self.morph_analyze(token)
|
108 |
+
# out_tokens.extend(morphs)
|
109 |
+
# else:
|
110 |
+
# if self.add_marker:
|
111 |
+
# token=u'{}_E_'.format(token)
|
112 |
+
# out_tokens.append(token)
|
113 |
+
#return out_tokens
|
114 |
+
|
115 |
+
|
116 |
+
if __name__ == '__main__':
|
117 |
+
|
118 |
+
if len(sys.argv)<5:
|
119 |
+
print("Usage: python unsupervised_morph.py <infile> <outfile> <language> <indic_resources_path> [<add_marker>]")
|
120 |
+
sys.exit(1)
|
121 |
+
|
122 |
+
language=sys.argv[3]
|
123 |
+
common.INDIC_RESOURCES_PATH=sys.argv[4]
|
124 |
+
|
125 |
+
add_marker=False
|
126 |
+
|
127 |
+
if len(sys.argv)==6:
|
128 |
+
add_marker= True if sys.argv[5] == 'True' else False
|
129 |
+
|
130 |
+
print('Loading morph analyser for ' + language)
|
131 |
+
analyzer=UnsupervisedMorphAnalyzer(language,add_marker)
|
132 |
+
print('Loaded morph analyser for ' + language)
|
133 |
+
|
134 |
+
with codecs.open(sys.argv[1],'r','utf-8') as ifile:
|
135 |
+
with codecs.open(sys.argv[2],'w','utf-8') as ofile:
|
136 |
+
for line in ifile.readlines():
|
137 |
+
line=line.strip()
|
138 |
+
tokens=indic_tokenize.trivial_tokenize(line)
|
139 |
+
morph_tokens=analyzer.morph_analyze_document(tokens)
|
140 |
+
ofile.write(' '.join(morph_tokens))
|
141 |
+
ofile.write('\n')
|
142 |
+
|
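A minimal sketch of using the analyzer above outside the __main__ block. It assumes the morfessor package is installed and that the Morfessor models shipped with indic_nlp_resources are available at the hypothetical path.

from indicnlp import common
common.set_resources_path('/path/to/indic_nlp_resources')  # hypothetical path

from indicnlp.morph import unsupervised_morph
analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('hi', add_marker=False)
tokens = 'घरों में पानी है'.split(' ')
print(analyzer.morph_analyze_document(tokens))  # list of morphemes for the whole token sequence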
indic_nlp_library/indicnlp/normalize/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/normalize/indic_normalize.py
ADDED
@@ -0,0 +1,984 @@
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
#
|
4 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
5 |
+
# All rights reserved.
|
6 |
+
#
|
7 |
+
# This source code is licensed under the MIT license found in the
|
8 |
+
# LICENSE file in the root directory of this source tree.
|
9 |
+
#
|
10 |
+
|
11 |
+
#Program for normalization of text written in Unicode. This is mainly geared towards Indic scripts
|
12 |
+
#
|
13 |
+
# @author Anoop Kunchukuttan
|
14 |
+
#
|
15 |
+
|
16 |
+
import sys, codecs, string, itertools, re
|
17 |
+
from indicnlp import langinfo
|
18 |
+
|
19 |
+
|
20 |
+
class NormalizerI(object):
|
21 |
+
"""
|
22 |
+
The normalizer classes do the following:
|
23 |
+
|
24 |
+
* Some characters have multiple Unicode codepoints. The normalizer chooses a single standard representation
|
25 |
+
* Some control characters are deleted
|
26 |
+
* While typing using the Latin keyboard, certain typical mistakes occur which are corrected by the module
|
27 |
+
|
28 |
+
Base class for normalizer. Performs some common normalization, which includes:
|
29 |
+
|
30 |
+
* Byte order mark, word joiner, etc. removal
|
31 |
+
* ZERO_WIDTH_NON_JOINER and ZERO_WIDTH_JOINER removal
|
32 |
+
* ZERO_WIDTH_SPACE and NO_BREAK_SPACE replaced by spaces
|
33 |
+
|
34 |
+
Script specific normalizers should derive from this class and override the normalize() method.
|
35 |
+
They can call the super class's normalize() method to avail of the common normalization
|
36 |
+
|
37 |
+
"""
|
38 |
+
|
39 |
+
BYTE_ORDER_MARK='\uFEFF'
|
40 |
+
BYTE_ORDER_MARK_2='\uFFFE'
|
41 |
+
WORD_JOINER='\u2060'
|
42 |
+
SOFT_HYPHEN='\u00AD'
|
43 |
+
|
44 |
+
ZERO_WIDTH_SPACE='\u200B'
|
45 |
+
NO_BREAK_SPACE='\u00A0'
|
46 |
+
|
47 |
+
ZERO_WIDTH_NON_JOINER='\u200C'
|
48 |
+
ZERO_WIDTH_JOINER='\u200D'
|
49 |
+
|
50 |
+
def _normalize_punctuations(self, text):
|
51 |
+
"""
|
52 |
+
Normalize punctuations.
|
53 |
+
Applies many of the punctuation normalizations that are part of MosesNormalizer
|
54 |
+
from sacremoses
|
55 |
+
"""
|
56 |
+
text=text.replace(NormalizerI.BYTE_ORDER_MARK,'')
|
57 |
+
text=text.replace('„', r'"')
|
58 |
+
text=text.replace('“', r'"')
|
59 |
+
text=text.replace('”', r'"')
|
60 |
+
text=text.replace('–', r'-')
|
61 |
+
text=text.replace('—', r' - ')
|
62 |
+
text=text.replace('´', r"'")
|
63 |
+
text=text.replace('‘', r"'")
|
64 |
+
text=text.replace('‚', r"'")
|
65 |
+
text=text.replace('’', r"'")
|
66 |
+
text=text.replace("''", r'"')
|
67 |
+
text=text.replace('´´', r'"')
|
68 |
+
text=text.replace('…', r'...')
|
69 |
+
|
70 |
+
return text
|
71 |
+
|
72 |
+
def normalize(self,text):
|
73 |
+
pass
|
74 |
+
|
75 |
+
|
76 |
+
class BaseNormalizer(NormalizerI):
|
77 |
+
|
78 |
+
def __init__(self,lang,
|
79 |
+
remove_nuktas=False,
|
80 |
+
nasals_mode='do_nothing',
|
81 |
+
do_normalize_chandras=False,
|
82 |
+
do_normalize_vowel_ending=False):
|
83 |
+
|
84 |
+
self.lang=lang
|
85 |
+
self.remove_nuktas=remove_nuktas
|
86 |
+
self.nasals_mode=nasals_mode
|
87 |
+
self.do_normalize_chandras=do_normalize_chandras
|
88 |
+
self.do_normalize_vowel_ending=do_normalize_vowel_ending
|
89 |
+
|
90 |
+
self._init_normalize_chandras()
|
91 |
+
self._init_normalize_nasals()
|
92 |
+
self._init_normalize_vowel_ending()
|
93 |
+
#self._init_visarga_correction()
|
94 |
+
|
95 |
+
def _init_normalize_vowel_ending(self):
|
96 |
+
|
97 |
+
if self.lang in langinfo.IE_LANGUAGES:
|
98 |
+
self.fn_vowel_ending=self._normalize_word_vowel_ending_ie
|
99 |
+
elif self.lang in langinfo.DRAVIDIAN_LANGUAGES:
|
100 |
+
self.fn_vowel_ending=self._normalize_word_vowel_ending_dravidian
|
101 |
+
else:
|
102 |
+
self.fn_vowel_ending=lambda x: x
|
103 |
+
|
104 |
+
def _init_normalize_chandras(self):
|
105 |
+
|
106 |
+
substitution_offsets =\
|
107 |
+
[
|
108 |
+
[0x0d , 0x0f], # chandra e, independent
|
109 |
+
[0x11 , 0x13], # chandra o, independent
|
110 |
+
[0x45 , 0x47], # chandra e, dependent
|
111 |
+
[0x49 , 0x4b], # chandra o, dependent
|
112 |
+
# [0x72 , 0x0f], # mr: chandra e, independent
|
113 |
+
|
114 |
+
[0x00 , 0x02], # chandrabindu
|
115 |
+
[0x01 , 0x02], # chandrabindu
|
116 |
+
]
|
117 |
+
|
118 |
+
self.chandra_substitutions = [
|
119 |
+
(langinfo.offset_to_char(x[0],self.lang), langinfo.offset_to_char(x[1],self.lang))
|
120 |
+
for x in substitution_offsets ]
|
121 |
+
|
122 |
+
def _normalize_chandras(self,text):
|
123 |
+
for match, repl in self.chandra_substitutions:
|
124 |
+
text=text.replace(match,repl)
|
125 |
+
return text
|
126 |
+
|
127 |
+
def _init_to_anusvaara_strict(self):
|
128 |
+
"""
|
129 |
+
`r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
|
130 |
+
"""
|
131 |
+
|
132 |
+
pat_signatures=\
|
133 |
+
[
|
134 |
+
[0x19,0x15,0x18],
|
135 |
+
[0x1e,0x1a,0x1d],
|
136 |
+
[0x23,0x1f,0x22],
|
137 |
+
[0x28,0x24,0x27],
|
138 |
+
[0x29,0x24,0x27],
|
139 |
+
[0x2e,0x2a,0x2d],
|
140 |
+
]
|
141 |
+
|
142 |
+
halant_offset=0x4d
|
143 |
+
anusvaara_offset=0x02
|
144 |
+
|
145 |
+
pats=[]
|
146 |
+
|
147 |
+
for pat_signature in pat_signatures:
|
148 |
+
pat=re.compile(r'{nasal}{halant}([{start_r}-{end_r}])'.format(
|
149 |
+
nasal=langinfo.offset_to_char(pat_signature[0],self.lang),
|
150 |
+
halant=langinfo.offset_to_char(halant_offset,self.lang),
|
151 |
+
start_r=langinfo.offset_to_char(pat_signature[1],self.lang),
|
152 |
+
end_r=langinfo.offset_to_char(pat_signature[2],self.lang),
|
153 |
+
))
|
154 |
+
pats.append(pat)
|
155 |
+
|
156 |
+
repl_string='{anusvaara}\\1'.format(anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang))
|
157 |
+
|
158 |
+
self.pats_repls=(pats,repl_string)
|
159 |
+
|
160 |
+
def _to_anusvaara_strict(self,text):
|
161 |
+
|
162 |
+
pats, repl_string = self.pats_repls
|
163 |
+
for pat in pats:
|
164 |
+
text=pat.sub(repl_string,text)
|
165 |
+
|
166 |
+
return text
|
167 |
+
|
168 |
+
def _init_to_anusvaara_relaxed(self):
|
169 |
+
"""
|
170 |
+
`r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
|
171 |
+
"""
|
172 |
+
|
173 |
+
nasals_list=[0x19,0x1e,0x23,0x28,0x29,0x2e]
|
174 |
+
nasals_list_str=','.join([langinfo.offset_to_char(x,self.lang) for x in nasals_list])
|
175 |
+
|
176 |
+
halant_offset=0x4d
|
177 |
+
anusvaara_offset=0x02
|
178 |
+
|
179 |
+
pat=re.compile(r'[{nasals_list_str}]{halant}'.format(
|
180 |
+
nasals_list_str=nasals_list_str,
|
181 |
+
halant=langinfo.offset_to_char(halant_offset,self.lang),
|
182 |
+
))
|
183 |
+
|
184 |
+
repl_string='{anusvaara}'.format(anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang))
|
185 |
+
|
186 |
+
self.pats_repls = (pat,repl_string)
|
187 |
+
|
188 |
+
def _to_anusvaara_relaxed(self,text):
|
189 |
+
pat, repl_string = self.pats_repls
|
190 |
+
return pat.sub(repl_string,text)
|
191 |
+
|
192 |
+
|
193 |
+
def _init_to_nasal_consonants(self):
|
194 |
+
"""
|
195 |
+
`r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
|
196 |
+
"""
|
197 |
+
|
198 |
+
pat_signatures=\
|
199 |
+
[
|
200 |
+
[0x19,0x15,0x18],
|
201 |
+
[0x1e,0x1a,0x1d],
|
202 |
+
[0x23,0x1f,0x22],
|
203 |
+
[0x28,0x24,0x27],
|
204 |
+
[0x29,0x24,0x27],
|
205 |
+
[0x2e,0x2a,0x2d],
|
206 |
+
]
|
207 |
+
|
208 |
+
halant_offset=0x4d
|
209 |
+
anusvaara_offset=0x02
|
210 |
+
|
211 |
+
pats=[]
|
212 |
+
repl_strings=[]
|
213 |
+
|
214 |
+
for pat_signature in pat_signatures:
|
215 |
+
pat=re.compile(r'{anusvaara}([{start_r}-{end_r}])'.format(
|
216 |
+
anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang),
|
217 |
+
start_r=langinfo.offset_to_char(pat_signature[1],self.lang),
|
218 |
+
end_r=langinfo.offset_to_char(pat_signature[2],self.lang),
|
219 |
+
))
|
220 |
+
pats.append(pat)
|
221 |
+
repl_string='{nasal}{halant}\\1'.format(
|
222 |
+
nasal=langinfo.offset_to_char(pat_signature[0],self.lang),
|
223 |
+
halant=langinfo.offset_to_char(halant_offset,self.lang),
|
224 |
+
)
|
225 |
+
repl_strings.append(repl_string)
|
226 |
+
|
227 |
+
self.pats_repls=list(zip(pats,repl_strings))
|
228 |
+
|
229 |
+
def _to_nasal_consonants(self,text):
|
230 |
+
|
231 |
+
for pat, repl in self.pats_repls:
|
232 |
+
text=pat.sub(repl,text)
|
233 |
+
|
234 |
+
return text
|
235 |
+
|
236 |
+
def _init_normalize_nasals(self):
|
237 |
+
|
238 |
+
if self.nasals_mode == 'to_anusvaara_strict':
|
239 |
+
self._init_to_anusvaara_strict()
|
240 |
+
elif self.nasals_mode == 'to_anusvaara_relaxed':
|
241 |
+
self._init_to_anusvaara_relaxed()
|
242 |
+
elif self.nasals_mode == 'to_nasal_consonants':
|
243 |
+
self._init_to_nasal_consonants()
|
244 |
+
|
245 |
+
def _normalize_nasals(self,text):
|
246 |
+
if self.nasals_mode == 'to_anusvaara_strict':
|
247 |
+
return self._to_anusvaara_strict(text)
|
248 |
+
elif self.nasals_mode == 'to_anusvaara_relaxed':
|
249 |
+
return self._to_anusvaara_relaxed(text)
|
250 |
+
elif self.nasals_mode == 'to_nasal_consonants':
|
251 |
+
return self._to_nasal_consonants(text)
|
252 |
+
else:
|
253 |
+
return text
|
254 |
+
|
255 |
+
|
256 |
+
def _normalize_word_vowel_ending_dravidian(self,word):
|
257 |
+
"""
|
258 |
+
for Dravidian
|
259 |
+
- consonant ending: add 'a' ki maatra
|
260 |
+
- halant ending: no change
|
261 |
+
- 'a' ki maatra: no change
|
262 |
+
"""
|
263 |
+
if len(word)>0 and langinfo.is_consonant(word[-1],self.lang):
|
264 |
+
return word+langinfo.offset_to_char(0x3e,self.lang)
|
265 |
+
else:
|
266 |
+
return word
|
267 |
+
|
268 |
+
def _normalize_word_vowel_ending_ie(self,word):
|
269 |
+
"""
|
270 |
+
for IE
|
271 |
+
- consonant ending: add halant
|
272 |
+
- halant ending: no change
|
273 |
+
- 'a' ki maatra: no change
|
274 |
+
"""
|
275 |
+
if len(word)>0 and langinfo.is_consonant(word[-1],self.lang):
|
276 |
+
return word+langinfo.offset_to_char(langinfo.HALANTA_OFFSET,self.lang)
|
277 |
+
else:
|
278 |
+
return word
|
279 |
+
|
280 |
+
def _normalize_vowel_ending(self,text):
|
281 |
+
return ' '.join([ self.fn_vowel_ending(w) for w in text.split(' ') ])
|
282 |
+
|
283 |
+
def normalize(self,text):
|
284 |
+
"""
|
285 |
+
Method to be implemented for normalization for each script
|
286 |
+
"""
|
287 |
+
text=text.replace(NormalizerI.BYTE_ORDER_MARK,'')
|
288 |
+
text=text.replace(NormalizerI.BYTE_ORDER_MARK_2,'')
|
289 |
+
text=text.replace(NormalizerI.WORD_JOINER,'')
|
290 |
+
text=text.replace(NormalizerI.SOFT_HYPHEN,'')
|
291 |
+
|
292 |
+
text=text.replace(NormalizerI.ZERO_WIDTH_SPACE,' ') # ??
|
293 |
+
text=text.replace(NormalizerI.NO_BREAK_SPACE,' ')
|
294 |
+
|
295 |
+
text=text.replace(NormalizerI.ZERO_WIDTH_NON_JOINER, '')
|
296 |
+
text=text.replace(NormalizerI.ZERO_WIDTH_JOINER,'')
|
297 |
+
|
298 |
+
text=self._normalize_punctuations(text)
|
299 |
+
|
300 |
+
if self.do_normalize_chandras:
|
301 |
+
text=self._normalize_chandras(text)
|
302 |
+
text=self._normalize_nasals(text)
|
303 |
+
if self.do_normalize_vowel_ending:
|
304 |
+
text=self._normalize_vowel_ending(text)
|
305 |
+
|
306 |
+
return text
|
307 |
+
|
308 |
+
|
309 |
+
def get_char_stats(self,text):
|
310 |
+
print(len(re.findall(NormalizerI.BYTE_ORDER_MARK,text)))
|
311 |
+
print(len(re.findall(NormalizerI.BYTE_ORDER_MARK_2,text)))
|
312 |
+
print(len(re.findall(NormalizerI.WORD_JOINER,text)))
|
313 |
+
print(len(re.findall(NormalizerI.SOFT_HYPHEN,text)))
|
314 |
+
|
315 |
+
print(len(re.findall(NormalizerI.ZERO_WIDTH_SPACE,text) ))
|
316 |
+
print(len(re.findall(NormalizerI.NO_BREAK_SPACE,text)))
|
317 |
+
|
318 |
+
print(len(re.findall(NormalizerI.ZERO_WIDTH_NON_JOINER,text)))
|
319 |
+
print(len(re.findall(NormalizerI.ZERO_WIDTH_JOINER,text)))
|
320 |
+
|
321 |
+
#for mobj in re.finditer(NormalizerI.ZERO_WIDTH_NON_JOINER,text):
|
322 |
+
# print text[mobj.start()-10:mobj.end()+10].replace('\n', ' ').replace(NormalizerI.ZERO_WIDTH_NON_JOINER,'').encode('utf-8')
|
323 |
+
#print hex(ord(text[mobj.end():mobj.end()+1]))
|
324 |
+
|
325 |
+
def correct_visarga(self,text,visarga_char,char_range):
|
326 |
+
return re.sub(r'([\u0900-\u097f]):','\\1\u0903',text)
|
327 |
+
|
328 |
+
|
329 |
+
|
330 |
+
class DevanagariNormalizer(BaseNormalizer):
|
331 |
+
"""
|
332 |
+
Normalizer for the Devanagari script. In addition to basic normalization by the super class,
|
333 |
+
|
334 |
+
* Replaces the composite characters containing nuktas by their decomposed form
|
335 |
+
* replace pipe character '|' by poorna virama character
|
336 |
+
* replace colon ':' by visarga if the colon follows a character in this script
|
337 |
+
|
338 |
+
"""
|
339 |
+
|
340 |
+
NUKTA='\u093C'
|
341 |
+
|
342 |
+
def __init__(self,lang='hi',remove_nuktas=False,nasals_mode='do_nothing',
|
343 |
+
do_normalize_chandras=False,do_normalize_vowel_ending=False):
|
344 |
+
super(DevanagariNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
|
345 |
+
|
346 |
+
def normalize(self,text):
|
347 |
+
|
348 |
+
# common normalization for Indic scripts
|
349 |
+
text=super(DevanagariNormalizer,self).normalize(text)
|
350 |
+
|
351 |
+
# chandra a replacement for Marathi
|
352 |
+
text=text.replace('\u0972','\u090f')
|
353 |
+
|
354 |
+
# decomposing Nukta based composite characters
|
355 |
+
text=text.replace('\u0929','\u0928'+DevanagariNormalizer.NUKTA)
|
356 |
+
text=text.replace('\u0931','\u0930'+DevanagariNormalizer.NUKTA)
|
357 |
+
text=text.replace('\u0934','\u0933'+DevanagariNormalizer.NUKTA)
|
358 |
+
text=text.replace('\u0958','\u0915'+DevanagariNormalizer.NUKTA)
|
359 |
+
text=text.replace('\u0959','\u0916'+DevanagariNormalizer.NUKTA)
|
360 |
+
text=text.replace('\u095A','\u0917'+DevanagariNormalizer.NUKTA)
|
361 |
+
text=text.replace('\u095B','\u091C'+DevanagariNormalizer.NUKTA)
|
362 |
+
text=text.replace('\u095C','\u0921'+DevanagariNormalizer.NUKTA)
|
363 |
+
text=text.replace('\u095D','\u0922'+DevanagariNormalizer.NUKTA)
|
364 |
+
text=text.replace('\u095E','\u092B'+DevanagariNormalizer.NUKTA)
|
365 |
+
text=text.replace('\u095F','\u092F'+DevanagariNormalizer.NUKTA)
|
366 |
+
|
367 |
+
if self.remove_nuktas:
|
368 |
+
text=text.replace(DevanagariNormalizer.NUKTA,'')
|
369 |
+
|
370 |
+
# replace pipe character for poorna virama
|
371 |
+
text=text.replace('\u007c','\u0964')
|
372 |
+
|
373 |
+
# correct visarga
|
374 |
+
text=re.sub(r'([\u0900-\u097f]):','\\1\u0903',text)
|
375 |
+
|
376 |
+
return text
|
377 |
+
|
378 |
+
def get_char_stats(self,text):
|
379 |
+
super(DevanagariNormalizer,self).get_char_stats(text)
|
380 |
+
|
381 |
+
print((len(re.findall('\u0929',text))))
|
382 |
+
print((len(re.findall('\u0931',text))))
|
383 |
+
print((len(re.findall('\u0934',text))))
|
384 |
+
print((len(re.findall('\u0958',text))))
|
385 |
+
print((len(re.findall('\u0959',text))))
|
386 |
+
print((len(re.findall('\u095A',text))))
|
387 |
+
print((len(re.findall('\u095B',text))))
|
388 |
+
print((len(re.findall('\u095C',text))))
|
389 |
+
print((len(re.findall('\u095D',text))))
|
390 |
+
print((len(re.findall('\u095E',text))))
|
391 |
+
print((len(re.findall('\u095F',text))))
|
392 |
+
|
393 |
+
#print(len(re.findall(u'\u0928'+DevanagariNormalizer.NUKTA,text)))
|
394 |
+
#print(len(re.findall(u'\u0930'+DevanagariNormalizer.NUKTA,text)))
|
395 |
+
#print(len(re.findall(u'\u0933'+DevanagariNormalizer.NUKTA,text)))
|
396 |
+
#print(len(re.findall(u'\u0915'+DevanagariNormalizer.NUKTA,text)))
|
397 |
+
#print(len(re.findall(u'\u0916'+DevanagariNormalizer.NUKTA,text)))
|
398 |
+
#print(len(re.findall(u'\u0917'+DevanagariNormalizer.NUKTA,text)))
|
399 |
+
#print(len(re.findall(u'\u091C'+DevanagariNormalizer.NUKTA,text)))
|
400 |
+
#print(len(re.findall(u'\u0921'+DevanagariNormalizer.NUKTA,text)))
|
401 |
+
#print(len(re.findall(u'\u0922'+DevanagariNormalizer.NUKTA,text)))
|
402 |
+
#print(len(re.findall(u'\u092B'+DevanagariNormalizer.NUKTA,text)))
|
403 |
+
#print(len(re.findall(u'\u092F'+DevanagariNormalizer.NUKTA,text)))
|
404 |
+
|
405 |
+
class GurmukhiNormalizer(BaseNormalizer):
|
406 |
+
"""
|
407 |
+
Normalizer for the Gurmukhi script. In addition to basic normalization by the super class,
|
408 |
+
|
409 |
+
* Replaces the composite characters containing nuktas by their decomposed form
|
410 |
+
* Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
|
411 |
+
* replace pipe character '|' by poorna virama character
|
412 |
+
* replace colon ':' by visarga if the colon follows a character in this script
|
413 |
+
"""
|
414 |
+
|
415 |
+
NUKTA='\u0A3C'
|
416 |
+
|
417 |
+
VOWEL_NORM_MAPS={
|
418 |
+
## http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
|
419 |
+
## Table 12-16
|
420 |
+
'\u0a05\u0a3e': '\u0a06',
|
421 |
+
'\u0a72\u0a3f': '\u0a07',
|
422 |
+
'\u0a72\u0a40': '\u0a08',
|
423 |
+
'\u0a73\u0a41': '\u0a09',
|
424 |
+
'\u0a73\u0a42': '\u0a0a',
|
425 |
+
'\u0a72\u0a47': '\u0a0f',
|
426 |
+
'\u0a05\u0a48': '\u0a10',
|
427 |
+
'\u0a73\u0a4b': '\u0a13',
|
428 |
+
'\u0a05\u0a4c': '\u0a14',
|
429 |
+
}
|
430 |
+
|
431 |
+
def __init__(self,lang='pa',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
|
432 |
+
do_normalize_vowel_ending=False,
|
433 |
+
do_canonicalize_addak=False,
|
434 |
+
do_canonicalize_tippi=False,
|
435 |
+
do_replace_vowel_bases=False):
|
436 |
+
super(GurmukhiNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
|
437 |
+
self.do_canonicalize_addak=do_canonicalize_addak
|
438 |
+
self.do_canonicalize_tippi=do_canonicalize_tippi
|
439 |
+
self.do_replace_vowel_bases=do_replace_vowel_bases
|
440 |
+
|
441 |
+
|
442 |
+
def _normalize_vowels(self,text):
|
443 |
+
"""
|
444 |
+
|
445 |
+
"""
|
446 |
+
|
447 |
+
## standard vowel replacements as per suggestions in
|
448 |
+
## http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
|
449 |
+
## Table 12-16
|
450 |
+
|
451 |
+
for k,v in GurmukhiNormalizer.VOWEL_NORM_MAPS.items():
|
452 |
+
text=text.replace(k,v)
|
453 |
+
|
454 |
+
## the above mappings should account for the majority of the variations,
|
455 |
+
## Rest are handled via this generic rule which looks at the diacritic
|
456 |
+
## following the 2 special characters
|
457 |
+
## TBD: don't see evidence for this in Wikipedia corpus
|
458 |
+
|
459 |
+
## If these special characters occur without any diacritic, replace them with the closest
|
460 |
+
## equivalent vowels
|
461 |
+
if self.do_replace_vowel_bases:
|
462 |
+
text=text.replace('\u0a72','\u0a07')
|
463 |
+
text=text.replace('\u0a73','\u0a09')
|
464 |
+
|
465 |
+
return text
|
466 |
+
|
467 |
+
|
468 |
+
def normalize(self,text):
|
469 |
+
|
470 |
+
# Addak
|
471 |
+
if self.do_canonicalize_addak:
|
472 |
+
## replace addak+consonant with consonant+halant+consonant
|
473 |
+
text=re.sub(r'\u0a71(.)','\\1\u0a4d\\1',text)
|
474 |
+
|
475 |
+
# Tippi
|
476 |
+
if self.do_canonicalize_tippi:
|
477 |
+
text=text.replace('\u0a70','\u0a02')
|
478 |
+
|
479 |
+
# Vowels: Gurmukhi has multiple ways of representing independent vowels due
|
480 |
+
# to the characters 'iri' and 'ura'.
|
481 |
+
text=self._normalize_vowels(text)
|
482 |
+
|
483 |
+
# common normalization for Indic scripts
|
484 |
+
text=super(GurmukhiNormalizer,self).normalize(text)
|
485 |
+
|
486 |
+
# decomposing Nukta based composite characters
|
487 |
+
text=text.replace('\u0a33','\u0a32'+GurmukhiNormalizer.NUKTA)
|
488 |
+
text=text.replace('\u0a36','\u0a38'+GurmukhiNormalizer.NUKTA)
|
489 |
+
text=text.replace('\u0a59','\u0a16'+GurmukhiNormalizer.NUKTA)
|
490 |
+
text=text.replace('\u0a5a','\u0a17'+GurmukhiNormalizer.NUKTA)
|
491 |
+
text=text.replace('\u0a5b','\u0a1c'+GurmukhiNormalizer.NUKTA)
|
492 |
+
text=text.replace('\u0a5e','\u0a2b'+GurmukhiNormalizer.NUKTA)
|
493 |
+
|
494 |
+
if self.remove_nuktas:
|
495 |
+
text=text.replace(GurmukhiNormalizer.NUKTA,'')
|
496 |
+
|
497 |
+
# replace the poorna virama codes specific to script
|
498 |
+
# with generic Indic script codes
|
499 |
+
text=text.replace('\u0a64','\u0964')
|
500 |
+
text=text.replace('\u0a65','\u0965')
|
501 |
+
|
502 |
+
## replace pipe character for poorna virama
|
503 |
+
text=text.replace('\u007c','\u0964')
|
504 |
+
|
505 |
+
# correct visarga
|
506 |
+
text=re.sub(r'([\u0a00-\u0a7f]):','\\1\u0a03',text)
|
507 |
+
|
508 |
+
return text
|
509 |
+
|
510 |
+
|
511 |
+
class GujaratiNormalizer(BaseNormalizer):
|
512 |
+
"""
|
513 |
+
Normalizer for the Gujarati script. In addition to basic normalization by the super class,
|
514 |
+
|
515 |
+
* Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
|
516 |
+
* replace colon ':' by visarga if the colon follows a character in this script
|
517 |
+
"""
|
518 |
+
|
519 |
+
NUKTA='\u0ABC'
|
520 |
+
|
521 |
+
def __init__(self,lang='gu',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
|
522 |
+
do_normalize_vowel_ending=False):
|
523 |
+
super(GujaratiNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
|
524 |
+
|
525 |
+
def normalize(self,text):
|
526 |
+
|
527 |
+
# common normalization for Indic scripts
|
528 |
+
text=super(GujaratiNormalizer,self).normalize(text)
|
529 |
+
|
530 |
+
# decomposing Nukta based composite characters
|
531 |
+
if self.remove_nuktas:
|
532 |
+
text=text.replace(GujaratiNormalizer.NUKTA,'')
|
533 |
+
|
534 |
+
|
535 |
+
# replace the poorna virama codes specific to script
|
536 |
+
# with generic Indic script codes
|
537 |
+
text=text.replace('\u0ae4','\u0964')
|
538 |
+
text=text.replace('\u0ae5','\u0965')
|
539 |
+
|
540 |
+
# correct visarga
|
541 |
+
text=re.sub(r'([\u0a80-\u0aff]):','\\1\u0a83',text)
|
542 |
+
|
543 |
+
return text
|
544 |
+
|
545 |
+
|
546 |
+
class OriyaNormalizer(BaseNormalizer):
|
547 |
+
"""
|
548 |
+
Normalizer for the Oriya script. In addition to basic normalization by the super class,
|
549 |
+
|
550 |
+
* Replaces the composite characters containing nuktas by their decomposed form
|
551 |
+
* Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
|
552 |
+
* Canonicalize two part dependent vowels
|
553 |
+
* Replace 'va' with 'ba'
|
554 |
+
* replace pipe character '|' by poorna virama character
|
555 |
+
* replace colon ':' by visarga if the colon follows a character in this script
|
556 |
+
"""
|
557 |
+
|
558 |
+
NUKTA='\u0B3C'
|
559 |
+
|
560 |
+
VOWEL_NORM_MAPS={
|
561 |
+
## See Table 12-22 in http://www.unicode.org/versions/Unicode12.1.0/ch12.pdf
|
562 |
+
'\u0b05\u0b3e': '\u0b06',
|
563 |
+
'\u0b0f\u0b57': '\u0b10',
|
564 |
+
'\u0b13\u0b57': '\u0b14',
|
565 |
+
}
|
566 |
+
|
567 |
+
|
568 |
+
def __init__(self,lang='or',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
|
569 |
+
do_normalize_vowel_ending=False,
|
570 |
+
do_remap_wa=False):
|
571 |
+
super(OriyaNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
|
572 |
+
self.do_remap_wa=do_remap_wa
|
573 |
+
|
574 |
+
def normalize(self,text):
|
575 |
+
|
576 |
+
# common normalization for Indic scripts
|
577 |
+
text=super(OriyaNormalizer,self).normalize(text)
|
578 |
+
|
579 |
+
## standard vowel replacements as per suggestions in Unicode documents
|
580 |
+
for k,v in OriyaNormalizer.VOWEL_NORM_MAPS.items():
|
581 |
+
text=text.replace(k,v)
|
582 |
+
|
583 |
+
# decomposing Nukta based composite characters
|
584 |
+
text=text.replace('\u0b5c','\u0b21'+OriyaNormalizer.NUKTA)
|
585 |
+
text=text.replace('\u0b5d','\u0b22'+OriyaNormalizer.NUKTA)
|
586 |
+
|
587 |
+
if self.remove_nuktas:
|
588 |
+
text=text.replace(OriyaNormalizer.NUKTA,'')
|
589 |
+
|
590 |
+
# replace the poorna virama codes specific to script
|
591 |
+
# with generic Indic script codes
|
592 |
+
text=text.replace('\u0b64','\u0964')
|
593 |
+
text=text.replace('\u0b65','\u0965')
|
594 |
+
|
595 |
+
# replace pipe character for poorna virama
|
596 |
+
text=text.replace('\u0b7c','\u0964')
|
597 |
+
|
598 |
+
# replace wa with ba
|
599 |
+
if self.do_remap_wa:
|
600 |
+
text=text.replace('\u0b71','\u0b2c')
|
601 |
+
|
602 |
+
# replace va with ba
|
603 |
+
# NOTE: documentation (chapter on Indic scripts) and codepoint chart seem contradictory
|
604 |
+
# (this also applies to the wa to ba rule above)
|
605 |
+
text=text.replace('\u0b35','\u0b2c')
|
606 |
+
|
607 |
+
# AI dependent vowel sign
|
608 |
+
text=text.replace('\u0b47\u0b56','\u0b58')
|
609 |
+
|
610 |
+
# two part dependent vowels
|
611 |
+
text=text.replace('\u0b47\u0b3e','\u0b4b')
|
612 |
+
text=text.replace('\u0b47\u0b57','\u0b4c')
|
613 |
+
|
614 |
+
|
615 |
+
# additional consonant - not clear how to handle this
|
616 |
+
# ignore
|
617 |
+
|
618 |
+
# correct visarga
|
619 |
+
text=re.sub(r'([\u0b00-\u0b7f]):','\\1\u0b03',text)
|
620 |
+
|
621 |
+
return text
|
622 |
+
|
623 |
+
|
624 |
+
class BengaliNormalizer(BaseNormalizer):
|
625 |
+
"""
|
626 |
+
Normalizer for the Bengali script. In addition to basic normalization by the super class,
|
627 |
+
|
628 |
+
* Replaces the composite characters containing nuktas by their decomposed form
|
629 |
+
* Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
|
630 |
+
* Canonicalize two part dependent vowels
|
631 |
+
* replace pipe character '|' by poorna virama character
|
632 |
+
* replace colon ':' by visarga if the colon follows a character in this script
|
633 |
+
|
634 |
+
"""
|
635 |
+
|
636 |
+
NUKTA='\u09BC'
|
637 |
+
|
638 |
+
def __init__(self,lang='bn',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
|
639 |
+
do_normalize_vowel_ending=False,
|
640 |
+
do_remap_assamese_chars=False):
|
641 |
+
super(BengaliNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
|
642 |
+
self.do_remap_assamese_chars=do_remap_assamese_chars
|
643 |
+
|
644 |
+
def normalize(self,text):
|
645 |
+
|
646 |
+
# common normalization for Indic scripts
|
647 |
+
text=super(BengaliNormalizer,self).normalize(text)
|
648 |
+
|
649 |
+
# decomposing Nukta based composite characters
|
650 |
+
text=text.replace('\u09dc','\u09a1'+BengaliNormalizer.NUKTA)
|
651 |
+
text=text.replace('\u09dd','\u09a2'+BengaliNormalizer.NUKTA)
|
652 |
+
text=text.replace('\u09df','\u09af'+BengaliNormalizer.NUKTA)
|
653 |
+
|
654 |
+
if self.remove_nuktas:
|
655 |
+
text=text.replace(BengaliNormalizer.NUKTA,'')
|
656 |
+
|
657 |
+
if self.do_remap_assamese_chars and self.lang=='as':
|
658 |
+
text=text.replace('\u09f0','\u09b0') # 'ra' character
|
659 |
+
text=text.replace('\u09f1','\u09ac') # 'va' character
|
660 |
+
|
661 |
+
# replace the poorna virama codes specific to script
|
662 |
+
# with generic Indic script codes
|
663 |
+
text=text.replace('\u09e4','\u0964')
|
664 |
+
text=text.replace('\u09e5','\u0965')
|
665 |
+
|
666 |
+
# replace pipe character for poorna virama
|
667 |
+
text=text.replace('\u007c','\u0964')
|
668 |
+
# replace bengali currency numerator four for poorna virama (it looks similar and is used as a substitute)
|
669 |
+
text=text.replace('\u09f7','\u0964')
|
670 |
+
|
671 |
+
# two part dependent vowels
|
672 |
+
text=text.replace('\u09c7\u09be','\u09cb')
|
673 |
+
text=text.replace('\u09c7\u09d7','\u09cc')
|
674 |
+
|
675 |
+
# correct visarga
|
676 |
+
text=re.sub(r'([\u0980-\u09ff]):','\\1\u0983',text)
|
677 |
+
|
678 |
+
return text
|
679 |
+
|
680 |
+
|
681 |
+
class TamilNormalizer(BaseNormalizer):
|
682 |
+
"""
|
683 |
+
Normalizer for the Tamil script. In addition to basic normalization by the super class,
|
684 |
+
|
685 |
+
* Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
|
686 |
+
* canonicalize two-part dependent vowel signs
|
687 |
+
* replace colon ':' by visarga if the colon follows a character in this script
|
688 |
+
"""
|
689 |
+
|
690 |
+
def __init__(self,lang='ta',remove_nuktas=False,nasals_mode='do_nothing',
|
691 |
+
do_normalize_chandras=False,do_normalize_vowel_ending=False):
|
692 |
+
super(TamilNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
|
693 |
+
|
694 |
+
def normalize(self,text):
|
695 |
+
|
696 |
+
# common normalization for Indic scripts
|
697 |
+
text=super(TamilNormalizer,self).normalize(text)
|
698 |
+
|
699 |
+
# replace the poorna virama codes specific to script
|
700 |
+
# with generic Indic script codes
|
701 |
+
text=text.replace('\u0be4','\u0964')
|
702 |
+
text=text.replace('\u0be5','\u0965')
|
703 |
+
|
704 |
+
# two part dependent vowels
|
705 |
+
text=text.replace('\u0b92\u0bd7','\u0b94')
|
706 |
+
text=text.replace('\u0bc6\u0bbe','\u0bca')
|
707 |
+
text=text.replace('\u0bc7\u0bbe','\u0bcb')
|
708 |
+
text=text.replace('\u0bc6\u0bd7','\u0bcc')
|
709 |
+
|
710 |
+
# correct visarga
|
711 |
+
text=re.sub(r'([\u0b80-\u0bff]):','\\1\u0b83',text)
|
712 |
+
|
713 |
+
return text
|
714 |
+
|
715 |
+
|
716 |
+
class TeluguNormalizer(BaseNormalizer):
|
717 |
+
"""
|
718 |
+
Normalizer for the Telugu script. In addition to basic normalization by the super class,
|
719 |
+
|
720 |
+
* Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
|
721 |
+
* canonicalize two-part dependent vowel signs
|
722 |
+
* replace colon ':' by visarga if the colon follows a character in this script
|
723 |
+
"""
|
724 |
+
|
725 |
+
def __init__(self,lang='te',remove_nuktas=False,nasals_mode='do_nothing',
|
726 |
+
do_normalize_chandras=False,do_normalize_vowel_ending=False):
|
727 |
+
super(TeluguNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
|
728 |
+
|
729 |
+
def normalize(self,text):
|
730 |
+
|
731 |
+
# common normalization for Indic scripts
|
732 |
+
text=super(TeluguNormalizer,self).normalize(text)
|
733 |
+
|
734 |
+
# replace the poorna virama codes specific to script
|
735 |
+
# with generic Indic script codes
|
736 |
+
text=text.replace('\u0c64','\u0964')
|
737 |
+
text=text.replace('\u0c65','\u0965')
|
738 |
+
|
739 |
+
# dependent vowels
|
740 |
+
text=text.replace('\u0c46\u0c56','\u0c48')
|
741 |
+
|
742 |
+
# correct visarga
|
743 |
+
text=re.sub(r'([\u0c00-\u0c7f]):','\\1\u0c03',text)
|
744 |
+
|
745 |
+
return text
|
746 |
+
|
747 |
+
def get_char_stats(self,text):
|
748 |
+
pass
|
749 |
+
|
750 |
+
class KannadaNormalizer(BaseNormalizer):
|
751 |
+
"""
|
752 |
+
Normalizer for the Kannada script. In addition to basic normalization by the super class,
|
753 |
+
|
754 |
+
* Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
|
755 |
+
* canonicalize two-part dependent vowel signs
|
756 |
+
* replace colon ':' by visarga if the colon follows a character in this script
|
757 |
+
"""
|
758 |
+
|
759 |
+
def __init__(self,lang='kn',remove_nuktas=False,nasals_mode='do_nothing',
|
760 |
+
do_normalize_chandras=False,do_normalize_vowel_ending=False):
|
761 |
+
super(KannadaNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
|
762 |
+
|
763 |
+
|
764 |
+
def normalize(self,text):
|
765 |
+
|
766 |
+
# common normalization for Indic scripts
|
767 |
+
text=super(KannadaNormalizer,self).normalize(text)
|
768 |
+
|
769 |
+
# replace the poorna virama codes specific to script
|
770 |
+
# with generic Indic script codes
|
771 |
+
text=text.replace('\u0ce4','\u0964')
|
772 |
+
text=text.replace('\u0ce5','\u0965')
|
773 |
+
|
774 |
+
# dependent vowels
|
775 |
+
text=text.replace('\u0cbf\u0cd5','\u0cc0')
|
776 |
+
text=text.replace('\u0cc6\u0cd5','\u0cc7')
|
777 |
+
text=text.replace('\u0cc6\u0cd6','\u0cc8')
|
778 |
+
text=text.replace('\u0cc6\u0cc2','\u0cca')
|
779 |
+
text=text.replace('\u0cca\u0cd5','\u0ccb')
|
780 |
+
|
781 |
+
# correct visarga
|
782 |
+
text=re.sub(r'([\u0c80-\u0cff]):','\\1\u0c83',text)
|
783 |
+
|
784 |
+
return text
|
785 |
+
|
786 |
+
|
787 |
+
class MalayalamNormalizer(BaseNormalizer):
|
788 |
+
"""
|
789 |
+
Normalizer for the Malayalam script. In addition to basic normalization by the super class,
|
790 |
+
|
791 |
+
* Replace the reserved character for poorna virama (if used) with the recommended generic Indic scripts poorna virama
|
792 |
+
* canonicalize two-part dependent vowel signs
|
793 |
+
* Change from old encoding of chillus (till Unicode 5.0) to new encoding
|
794 |
+
* replace colon ':' by visarga if the colon follows a character in this script
|
795 |
+
"""
|
796 |
+
|
797 |
+
CHILLU_CHAR_MAP= {
|
798 |
+
'\u0d7a': '\u0d23',
|
799 |
+
'\u0d7b': '\u0d28',
|
800 |
+
'\u0d7c': '\u0d30',
|
801 |
+
'\u0d7d': '\u0d32',
|
802 |
+
'\u0d7e': '\u0d33',
|
803 |
+
'\u0d7f': '\u0d15',
|
804 |
+
}
|
805 |
+
|
806 |
+
def _canonicalize_chillus(self,text):
|
807 |
+
for chillu, char in MalayalamNormalizer.CHILLU_CHAR_MAP.items():
|
808 |
+
text=text.replace(chillu,'{}\u0d4d'.format(char))
|
809 |
+
return text
|
810 |
+
|
811 |
+
def _correct_geminated_T(self,text):
|
812 |
+
return text.replace('\u0d31\u0d4d\u0d31','\u0d1f\u0d4d\u0d1f')
|
813 |
+
|
814 |
+
def __init__(self,lang='ml',remove_nuktas=False,nasals_mode='do_nothing',do_normalize_chandras=False,
|
815 |
+
do_normalize_vowel_ending=False,
|
816 |
+
do_canonicalize_chillus=False, do_correct_geminated_T=False):
|
817 |
+
super(MalayalamNormalizer,self).__init__(lang,remove_nuktas,nasals_mode,do_normalize_chandras,do_normalize_vowel_ending)
|
818 |
+
self.do_canonicalize_chillus=do_canonicalize_chillus
|
819 |
+
self.do_correct_geminated_T=do_correct_geminated_T
|
820 |
+
|
821 |
+
def normalize(self,text):
|
822 |
+
|
823 |
+
# Change from old encoding of chillus (till Unicode 5.0) to new encoding
|
824 |
+
text=text.replace('\u0d23\u0d4d\u200d','\u0d7a')
|
825 |
+
text=text.replace('\u0d28\u0d4d\u200d','\u0d7b')
|
826 |
+
text=text.replace('\u0d30\u0d4d\u200d','\u0d7c')
|
827 |
+
text=text.replace('\u0d32\u0d4d\u200d','\u0d7d')
|
828 |
+
text=text.replace('\u0d33\u0d4d\u200d','\u0d7e')
|
829 |
+
text=text.replace('\u0d15\u0d4d\u200d','\u0d7f')
|
830 |
+
|
831 |
+
# Normalize chillus
|
832 |
+
if self.do_canonicalize_chillus:
|
833 |
+
text=self._canonicalize_chillus(text)
|
834 |
+
|
835 |
+
# common normalization for Indic scripts
|
836 |
+
text=super(MalayalamNormalizer,self).normalize(text)
|
837 |
+
|
838 |
+
# replace the poorna virama codes specific to script
|
839 |
+
# with generic Indic script codes
|
840 |
+
text=text.replace('\u0d64','\u0964')
|
841 |
+
text=text.replace('\u0d65','\u0965')
|
842 |
+
|
843 |
+
# dependent vowels
|
844 |
+
text=text.replace('\u0d46\u0d3e','\u0d4a')
|
845 |
+
text=text.replace('\u0d47\u0d3e','\u0d4b')
|
846 |
+
|
847 |
+
# au forms
|
848 |
+
text=text.replace('\u0d46\u0d57','\u0d4c')
|
849 |
+
text=text.replace('\u0d57','\u0d4c')
|
850 |
+
|
851 |
+
# correct geminated T
|
852 |
+
if self.do_correct_geminated_T:
|
853 |
+
text=self._correct_geminated_T(text)
|
854 |
+
|
855 |
+
# correct visarga
|
856 |
+
text=re.sub(r'([\u0d00-\u0d7f]):','\\1\u0d03',text)
|
857 |
+
|
858 |
+
return text
|
859 |
+
|
860 |
+
class UrduNormalizer(NormalizerI):
|
861 |
+
'''Uses UrduHack library.
|
862 |
+
https://docs.urduhack.com/en/stable/_modules/urduhack/normalization/character.html#normalize
|
863 |
+
'''
|
864 |
+
|
865 |
+
def __init__(self, lang, remove_nuktas=True):
|
866 |
+
self.lang = lang
|
867 |
+
self.remove_nuktas = remove_nuktas
|
868 |
+
|
869 |
+
from urduhack.normalization import (
|
870 |
+
remove_diacritics,
|
871 |
+
normalize_characters,
|
872 |
+
normalize_combine_characters
|
873 |
+
) # TODO: Use only required normalizers
|
874 |
+
from urduhack.preprocessing import (
|
875 |
+
normalize_whitespace,
|
876 |
+
digits_space,
|
877 |
+
all_punctuations_space,
|
878 |
+
english_characters_space
|
879 |
+
)

# keep references to the imported urduhack functions on the instance so that
# normalize() can call them later
self.normalize_whitespace = normalize_whitespace
self.digits_space = digits_space
self.all_punctuations_space = all_punctuations_space
self.english_characters_space = english_characters_space
self.remove_diacritics = remove_diacritics
self.normalize_characters = normalize_characters
self.normalize_combine_characters = normalize_combine_characters

def normalize(self, text):
    text = self._normalize_punctuations(text)
    text = self.normalize_whitespace(text)
    if self.remove_nuktas:
        text = self.remove_diacritics(text)
    text = self.normalize_characters(text)
    text = self.normalize_combine_characters(text)
    text = self.digits_space(text)
    text = self.all_punctuations_space(text)
    text = self.english_characters_space(text)
    return text
|
892 |
+
|
893 |
+
|
894 |
+
class IndicNormalizerFactory(object):
|
895 |
+
"""
|
896 |
+
Factory class to create language specific normalizers.
|
897 |
+
|
898 |
+
"""
|
899 |
+
|
900 |
+
def get_normalizer(self,language,**kwargs):
|
901 |
+
"""
|
902 |
+
Call the get_normalizer function to get the language specific normalizer
|
903 |
+
|
904 |
+
Parameters:
|
905 |
+
|language: language code
|
906 |
+
|remove_nuktas: boolean, should the normalizer remove nukta characters
|
907 |
+
"""
|
908 |
+
normalizer=None
|
909 |
+
if language in ['hi','mr','sa','kK','ne','sd']:
|
910 |
+
normalizer=DevanagariNormalizer(lang=language, **kwargs)
|
911 |
+
elif language in ['ur']:
|
912 |
+
normalizer = UrduNormalizer(lang=language, **kwargs)
|
913 |
+
elif language in ['pa']:
|
914 |
+
normalizer=GurmukhiNormalizer(lang=language, **kwargs)
|
915 |
+
elif language in ['gu']:
|
916 |
+
normalizer=GujaratiNormalizer(lang=language, **kwargs)
|
917 |
+
elif language in ['bn']:
|
918 |
+
normalizer=BengaliNormalizer(lang=language, **kwargs)
|
919 |
+
elif language in ['as']:
|
920 |
+
normalizer=BengaliNormalizer(lang=language, **kwargs)
|
921 |
+
elif language in ['or']:
|
922 |
+
normalizer=OriyaNormalizer(lang=language, **kwargs)
|
923 |
+
elif language in ['ml']:
|
924 |
+
normalizer=MalayalamNormalizer(lang=language, **kwargs)
|
925 |
+
elif language in ['kn']:
|
926 |
+
normalizer=KannadaNormalizer(lang=language, **kwargs)
|
927 |
+
elif language in ['ta']:
|
928 |
+
normalizer=TamilNormalizer(lang=language, **kwargs)
|
929 |
+
elif language in ['te']:
|
930 |
+
normalizer=TeluguNormalizer(lang=language, **kwargs)
|
931 |
+
else:
|
932 |
+
normalizer=BaseNormalizer(lang=language, **kwargs)
|
933 |
+
|
934 |
+
return normalizer
|
935 |
+
|
936 |
+
def is_language_supported(self,language):
|
937 |
+
"""
|
938 |
+
Is the language supported?
|
939 |
+
"""
|
940 |
+
if language in ['hi','mr','sa','kK','ne','sd',
|
941 |
+
'ur',
|
942 |
+
'pa',
|
943 |
+
'gu',
|
944 |
+
'bn','as',
|
945 |
+
'or',
|
946 |
+
'ml',
|
947 |
+
'kn',
|
948 |
+
'ta',
|
949 |
+
'te']:
|
950 |
+
return True
|
951 |
+
else:
|
952 |
+
return False
|
953 |
+
|
954 |
+
|
955 |
+
if __name__ == '__main__':
|
956 |
+
|
957 |
+
if len(sys.argv)<4:
|
958 |
+
print("Usage: python normalize.py <infile> <outfile> <language> [<replace_nukta(True,False)>] [<normalize_nasals(do_nothing|to_anusvaara_strict|to_anusvaara_relaxed|to_nasal_consonants)>]")
|
959 |
+
sys.exit(1)
|
960 |
+
|
961 |
+
language=sys.argv[3]
|
962 |
+
remove_nuktas=False
|
963 |
+
normalize_nasals='do_nothing'
|
964 |
+
if len(sys.argv)>=5:
|
965 |
+
remove_nuktas= (sys.argv[4].lower()=='true')
|
966 |
+
if len(sys.argv)>=6:
|
967 |
+
normalize_nasals=sys.argv[5]
|
968 |
+
|
969 |
+
# create normalizer
|
970 |
+
factory=IndicNormalizerFactory()
|
971 |
+
normalizer=factory.get_normalizer(language,remove_nuktas=remove_nuktas,nasals_mode=normalize_nasals)
|
972 |
+
|
973 |
+
# DO normalization
|
974 |
+
with codecs.open(sys.argv[1],'r','utf-8') as ifile:
|
975 |
+
with codecs.open(sys.argv[2],'w','utf-8') as ofile:
|
976 |
+
for line in ifile.readlines():
|
977 |
+
normalized_line=normalizer.normalize(line)
|
978 |
+
ofile.write(normalized_line)
|
979 |
+
|
980 |
+
## gather status about normalization
|
981 |
+
#with codecs.open(sys.argv[1],'r','utf-8') as ifile:
|
982 |
+
# normalizer=DevanagariNormalizer()
|
983 |
+
# text=string.join(ifile.readlines(),sep='')
|
984 |
+
# normalizer.get_char_stats(text)
|
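For reference, a minimal usage sketch of the normalizer factory added in this file (illustrative only; the sample Hindi string is not from the committed files, and the snippet assumes the package is importable as indicnlp):

    from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer('hi', remove_nuktas=False, nasals_mode='do_nothing')

    # pipe -> poorna virama, ':' after a Devanagari character -> visarga, nukta decomposition, etc.
    print(normalizer.normalize('क़िताब | यह उसकी है:'))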
indic_nlp_library/indicnlp/script/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/script/english_script.py
ADDED
@@ -0,0 +1,154 @@
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
+
|
12 |
+
from indicnlp import common
|
13 |
+
from indicnlp.common import IndicNlpException
|
14 |
+
|
15 |
+
|
16 |
+
#### Maps from ARPABET to Internal Id
|
17 |
+
ARPABET_ID_MAP={}
|
18 |
+
ID_ARPABET_MAP={}
|
19 |
+
|
20 |
+
|
21 |
+
###
|
22 |
+
# Phonetic Information about script characters
|
23 |
+
###
|
24 |
+
|
25 |
+
""" Phonetic data for English """
|
26 |
+
ENGLISH_PHONETIC_DATA=None
|
27 |
+
|
28 |
+
""" Phonetic vector for English"""
|
29 |
+
ENGLISH_PHONETIC_VECTORS=None
|
30 |
+
|
31 |
+
""" Length of phonetic vector """
|
32 |
+
PHONETIC_VECTOR_LENGTH=38
|
33 |
+
|
34 |
+
""" Start offset for the phonetic feature vector in the phonetic data vector """
|
35 |
+
PHONETIC_VECTOR_START_OFFSET=6
|
36 |
+
|
37 |
+
## PHONETIC PROPERTIES in order in which they occur in the vector
|
38 |
+
## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
|
39 |
+
PV_PROP=['basic_type',
|
40 |
+
'vowel_length',
|
41 |
+
'vowel_strength',
|
42 |
+
'vowel_status',
|
43 |
+
'consonant_type',
|
44 |
+
'articulation_place',
|
45 |
+
'aspiration',
|
46 |
+
'voicing',
|
47 |
+
'nasalization',
|
48 |
+
'vowel_horizontal',
|
49 |
+
'vowel_vertical',
|
50 |
+
'vowel_roundness',
|
51 |
+
]
|
52 |
+
|
53 |
+
###
|
54 |
+
# Bit vector ranges for various properties
|
55 |
+
###
|
56 |
+
|
57 |
+
PV_PROP_RANGES={
|
58 |
+
'basic_type': [0,6],
|
59 |
+
'vowel_length': [6,8],
|
60 |
+
'vowel_strength': [8,11],
|
61 |
+
'vowel_status': [11,13],
|
62 |
+
'consonant_type': [13,18],
|
63 |
+
'articulation_place': [18,23],
|
64 |
+
'aspiration': [23,25],
|
65 |
+
'voicing': [25,27],
|
66 |
+
'nasalization': [27,29],
|
67 |
+
'vowel_horizontal': [29,32],
|
68 |
+
'vowel_vertical': [32,36],
|
69 |
+
'vowel_roundness': [36,38],
|
70 |
+
}
|
71 |
+
|
72 |
+
|
73 |
+
####
|
74 |
+
# Indexes into the Phonetic Vector
|
75 |
+
####
|
76 |
+
PVIDX_BT_VOWEL=0
|
77 |
+
PVIDX_BT_CONSONANT=1
|
78 |
+
PVIDX_BT_NUKTA=2
|
79 |
+
PVIDX_BT_HALANT=3
|
80 |
+
PVIDX_BT_ANUSVAAR=4
|
81 |
+
PVIDX_BT_MISC=5
|
82 |
+
PVIDX_BT_S=PVIDX_BT_VOWEL
|
83 |
+
PVIDX_BT_E=PVIDX_BT_MISC+1
|
84 |
+
|
85 |
+
PVIDX_VSTAT_DEP=12
|
86 |
+
|
87 |
+
####
|
88 |
+
SCRIPT_RANGE_START=0x0D00
|
89 |
+
## TBD
|
90 |
+
SCRIPT_RANGE_END=0x0D2E
|
91 |
+
|
92 |
+
|
93 |
+
def init():
|
94 |
+
"""
|
95 |
+
To be called by library loader, do not call it in your program
|
96 |
+
"""
|
97 |
+
|
98 |
+
global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET
|
99 |
+
|
100 |
+
ENGLISH_PHONETIC_DATA=pd.read_csv(common.get_resources_path()+'/script/english_script_phonetic_data.csv',encoding='utf-8')
|
101 |
+
|
102 |
+
ENGLISH_PHONETIC_VECTORS=ENGLISH_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values
|
103 |
+
|
104 |
+
PHONETIC_VECTOR_LENGTH=ENGLISH_PHONETIC_VECTORS.shape[1]
|
105 |
+
|
106 |
+
### Load mapping from ARPABET representation of phoneme to internal ID
|
107 |
+
global ARPABET_ID_MAP, ID_ARPABET_MAP
|
108 |
+
|
109 |
+
with open(common.get_resources_path()+'/script/english_arpabet_list.csv','r',encoding='utf-8') as infile:
|
110 |
+
for ph_id, name in enumerate(iter(infile)):
|
111 |
+
name=name.strip()
|
112 |
+
ARPABET_ID_MAP[name]=ph_id
|
113 |
+
ID_ARPABET_MAP[ph_id]=name
|
114 |
+
|
115 |
+
|
116 |
+
def phoneme_to_offset(ph):
|
117 |
+
return ARPABET_ID_MAP[ph]
|
118 |
+
|
119 |
+
def offset_to_phoneme(ph_id):
|
120 |
+
return ID_ARPABET_MAP[ph_id]
|
121 |
+
|
122 |
+
def phoneme_to_enc(ph):
|
123 |
+
return chr(SCRIPT_RANGE_START+phoneme_to_offset(ph))
|
124 |
+
|
125 |
+
def enc_to_phoneme(ph):
|
126 |
+
return offset_to_phoneme(enc_to_offset(ph))
|
127 |
+
|
128 |
+
def enc_to_offset(c):
|
129 |
+
return ord(c)-SCRIPT_RANGE_START
|
130 |
+
|
131 |
+
def in_range(offset):
|
132 |
+
return offset>=0 and offset<(SCRIPT_RANGE_END-SCRIPT_RANGE_START)  # offset is relative to SCRIPT_RANGE_START
|
133 |
+
|
134 |
+
def get_phonetic_info(lang):
|
135 |
+
return (ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS)
|
136 |
+
|
137 |
+
def invalid_vector():
|
138 |
+
## TODO: check if np datatype is correct?
|
139 |
+
return np.array([0]*PHONETIC_VECTOR_LENGTH)
|
140 |
+
|
141 |
+
def get_phonetic_feature_vector(p,lang):
|
142 |
+
|
143 |
+
offset=enc_to_offset(p)
|
144 |
+
|
145 |
+
if not in_range(offset):
|
146 |
+
return invalid_vector()
|
147 |
+
|
148 |
+
phonetic_data, phonetic_vectors= get_phonetic_info(lang)
|
149 |
+
|
150 |
+
if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
|
151 |
+
return invalid_vector()
|
152 |
+
|
153 |
+
return phonetic_vectors[offset]
|
154 |
+
|
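A small illustrative sketch of the ARPABET helpers defined above (not part of the commit; the resource path is hypothetical and 'AA' is assumed to appear in english_arpabet_list.csv):

    from indicnlp import common
    from indicnlp.script import english_script

    common.set_resources_path('/path/to/indic_nlp_resources')  # hypothetical path
    english_script.init()  # loads the ARPABET list and phonetic data CSVs

    c = english_script.phoneme_to_enc('AA')   # encode an ARPABET phoneme as a character
    print(english_script.enc_to_phoneme(c))   # round-trips back to 'AA'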
indic_nlp_library/indicnlp/script/indic_scripts.py
ADDED
@@ -0,0 +1,301 @@
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
+
import os
|
12 |
+
|
13 |
+
from indicnlp import common
|
14 |
+
from indicnlp.common import IndicNlpException
|
15 |
+
from indicnlp import langinfo as li
|
16 |
+
|
17 |
+
###
|
18 |
+
# Phonetic Information about script characters
|
19 |
+
###
|
20 |
+
|
21 |
+
""" Phonetic data about all languages except Tamil """
|
22 |
+
ALL_PHONETIC_DATA=None
|
23 |
+
|
24 |
+
""" Phonetic data for Tamil """
|
25 |
+
TAMIL_PHONETIC_DATA=None
|
26 |
+
|
27 |
+
""" Phonetic vector for all languages except Tamil """
|
28 |
+
ALL_PHONETIC_VECTORS=None
|
29 |
+
|
30 |
+
""" Phonetic vector for Tamil """
|
31 |
+
TAMIL_PHONETIC_VECTORS=None
|
32 |
+
|
33 |
+
""" Length of phonetic vector """
|
34 |
+
PHONETIC_VECTOR_LENGTH=38
|
35 |
+
|
36 |
+
""" Start offset for the phonetic feature vector in the phonetic data vector """
|
37 |
+
PHONETIC_VECTOR_START_OFFSET=6
|
38 |
+
|
39 |
+
## PHONETIC PROPERTIES in order in which they occur in the vector
|
40 |
+
## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
|
41 |
+
PV_PROP=['basic_type',
|
42 |
+
'vowel_length',
|
43 |
+
'vowel_strength',
|
44 |
+
'vowel_status',
|
45 |
+
'consonant_type',
|
46 |
+
'articulation_place',
|
47 |
+
'aspiration',
|
48 |
+
'voicing',
|
49 |
+
'nasalization',
|
50 |
+
'vowel_horizontal',
|
51 |
+
'vowel_vertical',
|
52 |
+
'vowel_roundness',
|
53 |
+
]
|
54 |
+
|
55 |
+
###
|
56 |
+
# Bit vector ranges for various properties
|
57 |
+
###
|
58 |
+
|
59 |
+
PV_PROP_RANGES={
|
60 |
+
'basic_type': [0,6],
|
61 |
+
'vowel_length': [6,8],
|
62 |
+
'vowel_strength': [8,11],
|
63 |
+
'vowel_status': [11,13],
|
64 |
+
'consonant_type': [13,18],
|
65 |
+
'articulation_place': [18,23],
|
66 |
+
'aspiration': [23,25],
|
67 |
+
'voicing': [25,27],
|
68 |
+
'nasalization': [27,29],
|
69 |
+
'vowel_horizontal': [29,32],
|
70 |
+
'vowel_vertical': [32,36],
|
71 |
+
'vowel_roundness': [36,38],
|
72 |
+
}
|
73 |
+
|
74 |
+
|
75 |
+
####
|
76 |
+
# Indexes into the Phonetic Vector
|
77 |
+
####
|
78 |
+
PVIDX_BT_VOWEL=0
|
79 |
+
PVIDX_BT_CONSONANT=1
|
80 |
+
PVIDX_BT_NUKTA=2
|
81 |
+
PVIDX_BT_HALANT=3
|
82 |
+
PVIDX_BT_ANUSVAAR=4
|
83 |
+
PVIDX_BT_MISC=5
|
84 |
+
PVIDX_BT_S=PVIDX_BT_VOWEL
|
85 |
+
PVIDX_BT_E=PVIDX_BT_MISC+1
|
86 |
+
|
87 |
+
PVIDX_VSTAT_DEP=12
|
88 |
+
|
89 |
+
#####
|
90 |
+
# Unicode information about characters
|
91 |
+
#####
|
92 |
+
|
93 |
+
SCRIPT_OFFSET_START=0
|
94 |
+
SCRIPT_OFFSET_RANGE=0x80
|
95 |
+
|
96 |
+
def init():
|
97 |
+
"""
|
98 |
+
To be called by library loader, do not call it in your program
|
99 |
+
"""
|
100 |
+
|
101 |
+
global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET
|
102 |
+
|
103 |
+
ALL_PHONETIC_DATA=pd.read_csv(os.path.join(common.get_resources_path(),'script','all_script_phonetic_data.csv'),encoding='utf-8')
|
104 |
+
TAMIL_PHONETIC_DATA=pd.read_csv(os.path.join(common.get_resources_path(),'script','tamil_script_phonetic_data.csv'),encoding='utf-8')
|
105 |
+
|
106 |
+
ALL_PHONETIC_VECTORS= ALL_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values
|
107 |
+
TAMIL_PHONETIC_VECTORS=TAMIL_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values
|
108 |
+
|
109 |
+
PHONETIC_VECTOR_LENGTH=ALL_PHONETIC_VECTORS.shape[1]
|
110 |
+
|
111 |
+
def is_supported_language(lang):
|
112 |
+
return lang in list(li.SCRIPT_RANGES.keys())
|
113 |
+
|
114 |
+
def get_offset(c,lang):
|
115 |
+
if not is_supported_language(lang):
|
116 |
+
raise IndicNlpException('Language {} not supported'.format(lang))
|
117 |
+
return ord(c)-li.SCRIPT_RANGES[lang][0]
|
118 |
+
|
119 |
+
def offset_to_char(off,lang):
|
120 |
+
"""
|
121 |
+
Applicable to Brahmi derived Indic scripts
|
122 |
+
"""
|
123 |
+
if not is_supported_language(lang):
|
124 |
+
raise IndicNlpException('Language {} not supported'.format(lang))
|
125 |
+
return chr(off+li.SCRIPT_RANGES[lang][0])
|
126 |
+
|
127 |
+
def is_indiclang_char(c,lang):
|
128 |
+
"""
|
129 |
+
Applicable to Brahmi derived Indic scripts
|
130 |
+
Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all Indic scripts
|
131 |
+
"""
|
132 |
+
if not is_supported_language(lang):
|
133 |
+
raise IndicNlpException('Language {} not supported'.format(lang))
|
134 |
+
o=get_offset(c,lang)
|
135 |
+
return (o>=SCRIPT_OFFSET_START and o<SCRIPT_OFFSET_RANGE) \
|
136 |
+
or ord(c)==li.DANDA or ord(c)==li.DOUBLE_DANDA
|
137 |
+
|
138 |
+
def in_coordinated_range_offset(c_offset):
|
139 |
+
"""
|
140 |
+
Applicable to Brahmi derived Indic scripts
|
141 |
+
"""
|
142 |
+
return (c_offset>=li.COORDINATED_RANGE_START_INCLUSIVE and c_offset<=li.COORDINATED_RANGE_END_INCLUSIVE)
|
143 |
+
|
144 |
+
def in_coordinated_range(c,lang):
|
145 |
+
if not is_supported_language(lang):
|
146 |
+
raise IndicNlpException('Language {} not supported'.format(lang))
|
147 |
+
return in_coordinated_range_offset(get_offset(c,lang))
|
148 |
+
|
149 |
+
def get_phonetic_info(lang):
|
150 |
+
if not is_supported_language(lang):
|
151 |
+
raise IndicNlpException('Language {} not supported'.format(lang))
|
152 |
+
phonetic_data= ALL_PHONETIC_DATA if lang!=li.LC_TA else TAMIL_PHONETIC_DATA
|
153 |
+
phonetic_vectors= ALL_PHONETIC_VECTORS if lang!=li.LC_TA else TAMIL_PHONETIC_VECTORS
|
154 |
+
|
155 |
+
return (phonetic_data, phonetic_vectors)
|
156 |
+
|
157 |
+
def invalid_vector():
|
158 |
+
## TODO: check if np datatype is correct?
|
159 |
+
return np.array([0]*PHONETIC_VECTOR_LENGTH)
|
160 |
+
|
161 |
+
def get_phonetic_feature_vector(c,lang):
|
162 |
+
|
163 |
+
offset=get_offset(c,lang)
|
164 |
+
|
165 |
+
if not in_coordinated_range_offset(offset):
|
166 |
+
return invalid_vector()
|
167 |
+
|
168 |
+
phonetic_data, phonetic_vectors= get_phonetic_info(lang)
|
169 |
+
|
170 |
+
if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
|
171 |
+
return invalid_vector()
|
172 |
+
|
173 |
+
return phonetic_vectors[offset]
|
174 |
+
|
175 |
+
def get_phonetic_feature_vector_offset(offset,lang):
|
176 |
+
|
177 |
+
if not in_coordinated_range_offset(offset):
|
178 |
+
return invalid_vector()
|
179 |
+
|
180 |
+
phonetic_data, phonetic_vectors= get_phonetic_info(lang)
|
181 |
+
|
182 |
+
if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
|
183 |
+
return invalid_vector()
|
184 |
+
|
185 |
+
return phonetic_vectors[offset]
|
186 |
+
|
187 |
+
### Unary operations on vectors
|
188 |
+
def is_valid(v):
|
189 |
+
return np.sum(v)>0
|
190 |
+
|
191 |
+
def is_vowel(v):
|
192 |
+
return v[PVIDX_BT_VOWEL]==1
|
193 |
+
|
194 |
+
def is_consonant(v):
|
195 |
+
return v[PVIDX_BT_CONSONANT]==1
|
196 |
+
|
197 |
+
def is_halant(v):
|
198 |
+
return v[PVIDX_BT_HALANT]==1
|
199 |
+
|
200 |
+
def is_nukta(v):
|
201 |
+
return v[PVIDX_BT_NUKTA]==1
|
202 |
+
|
203 |
+
def is_anusvaar(v):
|
204 |
+
return v[PVIDX_BT_ANUSVAAR]==1
|
205 |
+
|
206 |
+
def is_misc(v):
|
207 |
+
return v[PVIDX_BT_MISC]==1
|
208 |
+
|
209 |
+
def is_dependent_vowel(v):
|
210 |
+
return is_vowel(v) and v[PVIDX_VSTAT_DEP]==1
|
211 |
+
|
212 |
+
def is_plosive(v):
|
213 |
+
return is_consonant(v) and get_property_vector(v,'consonant_type')[0]==1
|
214 |
+
|
215 |
+
### Binary operations on phonetic vectors
|
216 |
+
|
217 |
+
def or_vectors(v1,v2):
|
218 |
+
return np.array([ 1 if (b1+b2)>=1 else 0 for b1,b2 in zip(v1,v2) ])
|
219 |
+
|
220 |
+
def xor_vectors(v1,v2):
|
221 |
+
return np.array([ 1 if b1!=b2 else 0 for b1,b2 in zip(v1,v2) ])
|
222 |
+
|
223 |
+
### Getting properties from phonetic vectors
|
224 |
+
|
225 |
+
def get_property_vector(v,prop_name):
|
226 |
+
return v[PV_PROP_RANGES[prop_name][0]:PV_PROP_RANGES[prop_name][1]]
|
227 |
+
|
228 |
+
def get_property_value(v,prop_name):
|
229 |
+
factor_bits=get_property_vector(v,prop_name).tolist()
|
230 |
+
|
231 |
+
v=0
|
232 |
+
c=1
|
233 |
+
for b in factor_bits[::-1]:
|
234 |
+
v+=(c*b)
|
235 |
+
c=c*2.0
|
236 |
+
|
237 |
+
return int(v)
|
238 |
+
|
239 |
+
def lcsr_indic(srcw,tgtw,slang,tlang):
|
240 |
+
"""
|
241 |
+
compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.
|
242 |
+
This works for Indic scripts by mapping both languages to a common script
|
243 |
+
|
244 |
+
srcw: source language string
|
245 |
+
tgtw: target language string
|
246 |
+
slang: source language
|
247 |
+
tlang: target language
|
248 |
+
"""
|
249 |
+
score_mat=np.zeros((len(srcw)+1,len(tgtw)+1))
|
250 |
+
|
251 |
+
for si,sc in enumerate(srcw,1):
|
252 |
+
for ti,tc in enumerate(tgtw,1):
|
253 |
+
so=get_offset(sc,slang)
|
254 |
+
to=get_offset(tc,tlang)
|
255 |
+
|
256 |
+
if in_coordinated_range_offset(so) and in_coordinated_range_offset(to) and so==to:
|
257 |
+
score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
|
258 |
+
elif not (in_coordinated_range_offset(so) or in_coordinated_range_offset(to)) and sc==tc:
|
259 |
+
score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
|
260 |
+
else:
|
261 |
+
score_mat[si,ti]= max(
|
262 |
+
score_mat[si,ti-1],
|
263 |
+
score_mat[si-1,ti])
|
264 |
+
|
265 |
+
return (score_mat[-1,-1]/float(max(len(srcw),len(tgtw))),float(len(srcw)),float(len(tgtw)))
|
266 |
+
|
267 |
+
def lcsr_any(srcw,tgtw):
|
268 |
+
"""
|
269 |
+
LCSR computation if both languages have the same script
|
270 |
+
"""
|
271 |
+
score_mat=np.zeros((len(srcw)+1,len(tgtw)+1))
|
272 |
+
|
273 |
+
for si,sc in enumerate(srcw,1):
|
274 |
+
for ti,tc in enumerate(tgtw,1):
|
275 |
+
|
276 |
+
if sc==tc:
|
277 |
+
score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
|
278 |
+
else:
|
279 |
+
score_mat[si,ti]= max(
|
280 |
+
score_mat[si,ti-1],
|
281 |
+
score_mat[si-1,ti])
|
282 |
+
|
283 |
+
return (score_mat[-1,-1]/float(max(len(srcw),len(tgtw))),float(len(srcw)),float(len(tgtw)))
|
284 |
+
|
285 |
+
def lcsr(srcw,tgtw,slang,tlang):
|
286 |
+
"""
|
287 |
+
compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.
|
288 |
+
|
289 |
+
srcw: source language string
|
290 |
+
tgtw: target language string
|
291 |
+
slang: source language
|
292 |
+
tlang: target language
|
293 |
+
"""
|
294 |
+
|
295 |
+
if slang==tlang or not is_supported_language(slang) or not is_supported_language(tlang):
|
296 |
+
return lcsr_any(srcw,tgtw)
|
297 |
+
else:
|
298 |
+
return lcsr_indic(srcw,tgtw,slang,tlang)
|
299 |
+
|
300 |
+
|
301 |
+
|
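To make the phonetic-vector and LCSR utilities above concrete, an illustrative sketch (the example words and resource path are not from the committed files; it assumes the indic_nlp resources have been downloaded):

    from indicnlp import common
    from indicnlp.script import indic_scripts as isc

    common.set_resources_path('/path/to/indic_nlp_resources')  # hypothetical path
    isc.init()  # load the phonetic data tables

    v = isc.get_phonetic_feature_vector('क', 'hi')
    print(isc.is_consonant(v), isc.get_property_value(v, 'articulation_place'))

    # script-aware LCSR between cognates written in two different scripts
    print(isc.lcsr_indic('दिल्ली', 'দিল্লি', 'hi', 'bn'))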
indic_nlp_library/indicnlp/script/phonetic_sim.py
ADDED
@@ -0,0 +1,59 @@
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

from indicnlp import loader
from indicnlp import langinfo
from indicnlp.script.indic_scripts import *
import numpy as np
import gzip
import pandas as pd
import sys

def equal(v1,v2):
    return 0.0 if np.sum( xor_vectors(v1, v2)) > 0 else 1.0

def dice(v1,v2):
    dotprod=2*float(np.dot( v1, v2.T ))
    return dotprod/float(len(v1)+len(v2))

def jaccard(v1,v2):
    dotprod=float(np.dot( v1, v2.T ))
    return dotprod/float(len(v1)+len(v2)-dotprod)

def cosine(v1,v2):
    dotprod=float(np.dot( v1, v2.T ))
    norm1=float(np.dot( v1, v1.T ))
    norm2=float(np.dot( v2, v2.T ))
    return ((dotprod)/(np.sqrt(norm1*norm2)+0.00001))

def dotprod(v1,v2):
    return float(np.dot( v1, v2.T ))

def sim1(v1,v2,base=5.0):
    return np.power(base,dotprod(v1,v2))

def softmax(v1,v2):
    return sim1(v1,v2,np.e)

def create_similarity_matrix(sim_func,slang,tlang,normalize=True):

    dim=langinfo.COORDINATED_RANGE_END_INCLUSIVE-langinfo.COORDINATED_RANGE_START_INCLUSIVE+1
    sim_mat=np.zeros((dim,dim))

    for offset1 in range(langinfo.COORDINATED_RANGE_START_INCLUSIVE, langinfo.COORDINATED_RANGE_END_INCLUSIVE+1):
        v1=get_phonetic_feature_vector(offset_to_char(offset1,slang),slang)
        for offset2 in range(langinfo.COORDINATED_RANGE_START_INCLUSIVE, langinfo.COORDINATED_RANGE_END_INCLUSIVE+1):
            v2=get_phonetic_feature_vector(offset_to_char(offset2,tlang),tlang)
            sim_mat[offset1,offset2]=sim_func(v1,v2)

    if normalize:
        sums=np.sum(sim_mat, axis=1)
        sim_mat=(sim_mat.transpose()/sums).transpose()

    return sim_mat
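An illustrative driver for the similarity-matrix helper above (not part of the commit; the resource path is hypothetical):

    from indicnlp import common
    from indicnlp.script import indic_scripts, phonetic_sim

    common.set_resources_path('/path/to/indic_nlp_resources')  # hypothetical path
    indic_scripts.init()  # the phonetic vectors come from this module's tables

    # character-level phonetic similarity between Hindi and Kannada offsets,
    # row-normalised so each source character's similarities sum to 1
    sim = phonetic_sim.create_similarity_matrix(phonetic_sim.cosine, 'hi', 'kn', normalize=True)
    print(sim.shape)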
indic_nlp_library/indicnlp/syllable/__init__.py
ADDED
File without changes
|
indic_nlp_library/indicnlp/syllable/syllabifier.py
ADDED
@@ -0,0 +1,302 @@
1 |
+
#
|
2 |
+
# Copyright (c) 2013-present, Anoop Kunchukuttan
|
3 |
+
# All rights reserved.
|
4 |
+
#
|
5 |
+
# This source code is licensed under the MIT license found in the
|
6 |
+
# LICENSE file in the root directory of this source tree.
|
7 |
+
#
|
8 |
+
|
9 |
+
import codecs, sys
|
10 |
+
from indicnlp.script import indic_scripts as si
|
11 |
+
import re
|
12 |
+
|
13 |
+
chillu_char_map= {
|
14 |
+
'\u0d7a': '\u0d23',
|
15 |
+
'\u0d7b': '\u0d28',
|
16 |
+
'\u0d7c': '\u0d30',
|
17 |
+
'\u0d7d': '\u0d32',
|
18 |
+
'\u0d7e': '\u0d33',
|
19 |
+
'\u0d7f': '\u0d15',
|
20 |
+
}
|
21 |
+
|
22 |
+
char_chillu_map= {}
|
23 |
+
for k,v in chillu_char_map.items():
|
24 |
+
char_chillu_map[v]=k
|
25 |
+
|
26 |
+
def normalize_malayalam(word):
|
27 |
+
|
28 |
+
word_mask=re.sub(r'[0-9]','0',word)
|
29 |
+
|
30 |
+
# instead of chillu characters, use consonant+halant
|
31 |
+
for chillu,char in chillu_char_map.items():
|
32 |
+
word=word.replace(chillu,'{}\u0d4d'.format(char))
|
33 |
+
word_mask=word_mask.replace(chillu,'41')
|
34 |
+
|
35 |
+
word_mask=re.sub(r'[^0-9]','0',word_mask)
|
36 |
+
|
37 |
+
return word, word_mask
|
38 |
+
|
39 |
+
def denormalize_malayalam(word, word_mask):
|
40 |
+
|
41 |
+
word=list(word)
|
42 |
+
word_mask=list(word_mask)
|
43 |
+
|
44 |
+
## pattern 4
|
45 |
+
idx=0
|
46 |
+
while idx>=0:
|
47 |
+
try:
|
48 |
+
idx=word_mask.index('4',idx)
|
49 |
+
word[idx:idx+2]=char_chillu_map[word[idx]]
|
50 |
+
word_mask[idx:idx+2]='0'
|
51 |
+
start=idx
|
52 |
+
except ValueError as e:
|
53 |
+
break
|
54 |
+
|
55 |
+
return ''.join(word)
|
56 |
+
|
57 |
+
def normalize_punjabi(word):
|
58 |
+
word_mask=re.sub(r'[0-9]','0',word)
|
59 |
+
|
60 |
+
## replace tippi with anusvaar
|
61 |
+
word=word.replace('\u0a70','\u0a02')
|
62 |
+
word_mask=word_mask.replace('\u0a70','2')
|
63 |
+
|
64 |
+
## replace addak+consonant with consonant+halant+consonant
|
65 |
+
word=re.sub(r'\u0a71(.)','\\1\u0a4d\\1',word)
|
66 |
+
word_mask=re.sub(r'\u0a71(.)','311',word_mask)
|
67 |
+
|
68 |
+
word_mask=re.sub(r'[^0-9]','0',word_mask)
|
69 |
+
|
70 |
+
return word, word_mask
|
71 |
+
|
72 |
+
def denormalize_punjabi(word, word_mask):
|
73 |
+
|
74 |
+
word=list(word)
|
75 |
+
word_mask=list(word_mask)
|
76 |
+
|
77 |
+
## pattern 2
|
78 |
+
idx=0
|
79 |
+
while idx>=0:
|
80 |
+
try:
|
81 |
+
idx=word_mask.index('2',idx)
|
82 |
+
word[idx]='\u0a70'
|
83 |
+
word_mask[idx]='0'
|
84 |
+
start=idx
|
85 |
+
except ValueError as e:
|
86 |
+
break
|
87 |
+
|
88 |
+
## pattern 3
|
89 |
+
idx=0
|
90 |
+
while idx>=0:
|
91 |
+
try:
|
92 |
+
idx=word_mask.index('3',idx)
|
93 |
+
word[idx:idx+3]='\u0a71{}'.format(word[idx])
|
94 |
+
word_mask[idx:idx+3]='00'
|
95 |
+
start=idx
|
96 |
+
except ValueError as e:
|
97 |
+
break
|
98 |
+
|
99 |
+
return ''.join(word)
|
100 |
+
|
101 |
+
def char_backoff(syllables_list,vocab):
|
102 |
+
syllables_final=[]
|
103 |
+
|
104 |
+
if vocab is None:
|
105 |
+
syllables_final=syllables_list
|
106 |
+
else:
|
107 |
+
for s in syllables_list:
|
108 |
+
if s in vocab:
|
109 |
+
syllables_final.append(s)
|
110 |
+
else:
|
111 |
+
for x in s:
|
112 |
+
syllables_final.append(x)
|
113 |
+
|
114 |
+
return syllables_final
|
115 |
+
|
116 |
+
|
117 |
+
def orthographic_syllabify_improved(word,lang,vocab=None):
|
118 |
+
|
119 |
+
word_mask=['0']*len(word)
|
120 |
+
|
121 |
+
if lang=='ml':
|
122 |
+
word, word_mask = normalize_malayalam(word)
|
123 |
+
word=word
|
124 |
+
elif lang=='pa':
|
125 |
+
word, word_mask = normalize_punjabi(word)
|
126 |
+
|
127 |
+
p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word]
|
128 |
+
|
129 |
+
syllables=[]
|
130 |
+
syllables_mask=[]
|
131 |
+
|
132 |
+
for i in range(len(word)):
|
133 |
+
v=p_vectors[i]
|
134 |
+
|
135 |
+
syllables.append(word[i])
|
136 |
+
syllables_mask.append(word_mask[i])
|
137 |
+
|
138 |
+
### simplified syllabification
|
139 |
+
#if i+1<len(word) and \
|
140 |
+
# (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
|
141 |
+
# syllables.append(u' ')
|
142 |
+
# syllables_mask.append(u'0')
|
143 |
+
|
144 |
+
#elif not si.is_valid(v) or si.is_misc(v) or si.is_vowel(v):
|
145 |
+
# syllables.append(u' ')
|
146 |
+
# syllables_mask.append(u'0')
|
147 |
+
|
148 |
+
#elif i+1<len(word) and \
|
149 |
+
# (si.is_consonant(v) or si.is_nukta(v)) and \
|
150 |
+
# (si.is_consonant(p_vectors[i+1]) or si.is_anusvaar(p_vectors[i+1])):
|
151 |
+
# syllables.append(u' ')
|
152 |
+
# syllables_mask.append(u'0')
|
153 |
+
|
154 |
+
#### better syllabification
|
155 |
+
if i+1<len(word) and (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
|
156 |
+
syllables.append(' ')
|
157 |
+
syllables_mask.append('0')
|
158 |
+
|
159 |
+
elif not si.is_valid(v) or si.is_misc(v) :
|
160 |
+
syllables.append(' ')
|
161 |
+
syllables_mask.append('0')
|
162 |
+
|
163 |
+
elif si.is_vowel(v):
|
164 |
+
|
165 |
+
anu_nonplos= ( i+2<len(word) and \
|
166 |
+
si.is_anusvaar(p_vectors[i+1]) and \
|
167 |
+
not si.is_plosive(p_vectors[i+2])\
|
168 |
+
)
|
169 |
+
|
170 |
+
anu_eow= ( i+2==len(word) and \
|
171 |
+
si.is_anusvaar(p_vectors[i+1]) )
|
172 |
+
|
173 |
+
if not(anu_nonplos or anu_eow):
|
174 |
+
syllables.append(' ')
|
175 |
+
syllables_mask.append('0')
|
176 |
+
|
177 |
+
elif i+1<len(word) and \
|
178 |
+
(si.is_consonant(v) or si.is_nukta(v)):
|
179 |
+
if si.is_consonant(p_vectors[i+1]):
|
180 |
+
syllables.append(' ')
|
181 |
+
syllables_mask.append('0')
|
182 |
+
elif si.is_vowel(p_vectors[i+1]) and \
|
183 |
+
not si.is_dependent_vowel(p_vectors[i+1]):
|
184 |
+
syllables.append(' ')
|
185 |
+
syllables_mask.append('0')
|
186 |
+
elif si.is_anusvaar(p_vectors[i+1]):
|
187 |
+
anu_nonplos= ( i+2<len(word) and \
|
188 |
+
not si.is_plosive(p_vectors[i+2])\
|
189 |
+
)
|
190 |
+
|
191 |
+
anu_eow= i+2==len(word)
|
192 |
+
|
193 |
+
if not(anu_nonplos or anu_eow):
|
194 |
+
syllables.append(' ')
|
195 |
+
syllables_mask.append('0')
|
196 |
+
|
197 |
+
syllables_mask=''.join(syllables_mask)
|
198 |
+
syllables=''.join(syllables)
|
199 |
+
|
200 |
+
#assert len(syllables_mask) == len(syllables)
|
201 |
+
#assert syllables_mask.find('01') == -1
|
202 |
+
if syllables_mask.find('01') >= 0:
|
203 |
+
print('Warning')
|
204 |
+
|
205 |
+
if lang=='ml':
|
206 |
+
syllables = denormalize_malayalam(syllables,syllables_mask)
|
207 |
+
elif lang=='pa':
|
208 |
+
syllables = denormalize_punjabi(syllables,syllables_mask)
|
209 |
+
|
210 |
+
syllables_list = syllables.strip().split(' ')
|
211 |
+
return(char_backoff(syllables_list,vocab))
|
212 |
+
|
213 |
+
def orthographic_syllabify(word,lang,vocab=None):
|
214 |
+
|
215 |
+
p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word]
|
216 |
+
|
217 |
+
syllables=[]
|
218 |
+
|
219 |
+
for i in range(len(word)):
|
220 |
+
v=p_vectors[i]
|
221 |
+
|
222 |
+
syllables.append(word[i])
|
223 |
+
|
224 |
+
### simplified syllabification
|
225 |
+
#if i+1<len(word) and \
|
226 |
+
# (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
|
227 |
+
# syllables.append(u' ')
|
228 |
+
|
229 |
+
#elif not si.is_valid(v) or si.is_misc(v) or si.is_vowel(v):
|
230 |
+
# syllables.append(u' ')
|
231 |
+
|
232 |
+
#elif i+1<len(word) and \
|
233 |
+
# (si.is_consonant(v) or si.is_nukta(v)) and \
|
234 |
+
# (si.is_consonant(p_vectors[i+1]) or si.is_anusvaar(p_vectors[i+1])):
|
235 |
+
# syllables.append(u' ')
|
236 |
+
|
237 |
+
#### better syllabification
|
238 |
+
if i+1<len(word) and (not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
|
239 |
+
syllables.append(' ')
|
240 |
+
|
241 |
+
elif not si.is_valid(v) or si.is_misc(v) :
|
242 |
+
syllables.append(' ')
|
243 |
+
|
244 |
+
elif si.is_vowel(v):
|
245 |
+
|
246 |
+
anu_nonplos= ( i+2<len(word) and \
|
247 |
+
si.is_anusvaar(p_vectors[i+1]) and \
|
248 |
+
not si.is_plosive(p_vectors[i+2])\
|
249 |
+
)
|
250 |
+
|
251 |
+
anu_eow= ( i+2==len(word) and \
|
252 |
+
si.is_anusvaar(p_vectors[i+1]) )
|
253 |
+
|
254 |
+
if not(anu_nonplos or anu_eow):
|
255 |
+
syllables.append(' ')
|
256 |
+
|
257 |
+
elif i+1<len(word) and \
|
258 |
+
(si.is_consonant(v) or si.is_nukta(v)):
|
259 |
+
if si.is_consonant(p_vectors[i+1]):
|
260 |
+
syllables.append(' ')
|
261 |
+
elif si.is_vowel(p_vectors[i+1]) and \
|
262 |
+
not si.is_dependent_vowel(p_vectors[i+1]):
|
263 |
+
syllables.append(' ')
|
264 |
+
elif si.is_anusvaar(p_vectors[i+1]):
|
265 |
+
anu_nonplos= ( i+2<len(word) and \
|
266 |
+
not si.is_plosive(p_vectors[i+2])\
|
267 |
+
)
|
268 |
+
|
269 |
+
anu_eow= i+2==len(word)
|
270 |
+
|
271 |
+
if not(anu_nonplos or anu_eow):
|
272 |
+
syllables.append(' ')
|
273 |
+
|
274 |
+
syllables_list = ''.join(syllables).strip().split(' ')
|
275 |
+
return(char_backoff(syllables_list,vocab))
|
276 |
+
|
277 |
+
def orthographic_simple_syllabify(word,lang,vocab=None):
|
278 |
+
|
279 |
+
p_vectors=[si.get_phonetic_feature_vector(c,lang) for c in word]
|
280 |
+
|
281 |
+
syllables=[]
|
282 |
+
|
283 |
+
for i in range(len(word)):
|
284 |
+
v=p_vectors[i]
|
285 |
+
|
286 |
+
syllables.append(word[i])
|
287 |
+
|
288 |
+
## simplified syllabification
|
289 |
+
if i+1<len(word) and \
|
290 |
+
(not si.is_valid(p_vectors[i+1]) or si.is_misc(p_vectors[i+1])):
|
291 |
+
syllables.append(' ')
|
292 |
+
|
293 |
+
elif not si.is_valid(v) or si.is_misc(v) or si.is_vowel(v):
|
294 |
+
syllables.append(' ')
|
295 |
+
|
296 |
+
elif i+1<len(word) and \
|
297 |
+
(si.is_consonant(v) or si.is_nukta(v)) and \
|
298 |
+
(si.is_consonant(p_vectors[i+1]) or si.is_anusvaar(p_vectors[i+1])):
|
299 |
+
syllables.append(' ')
|
300 |
+
|
301 |
+
syllables_list = ''.join(syllables).strip().split(' ')
|
302 |
+
return(char_backoff(syllables_list,vocab))
|
indic_nlp_library/indicnlp/test/__init__.py
ADDED
File without changes
|