Spaces:
Sleeping
Sleeping
Upload 86 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .dockerignore +7 -0
- .gitattributes +6 -0
- DockerFile +0 -0
- app.py +40 -0
- libs/IndicTransToolkit/.gitignore +4 -0
- libs/IndicTransToolkit/CHANGELOG.md +16 -0
- libs/IndicTransToolkit/IndicTransToolkit.egg-info/PKG-INFO +131 -0
- libs/IndicTransToolkit/IndicTransToolkit.egg-info/SOURCES.txt +15 -0
- libs/IndicTransToolkit/IndicTransToolkit.egg-info/dependency_links.txt +1 -0
- libs/IndicTransToolkit/IndicTransToolkit.egg-info/not-zip-safe +1 -0
- libs/IndicTransToolkit/IndicTransToolkit.egg-info/requires.txt +8 -0
- libs/IndicTransToolkit/IndicTransToolkit.egg-info/top_level.txt +1 -0
- libs/IndicTransToolkit/IndicTransToolkit/__init__.py +9 -0
- libs/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-310.pyc +0 -0
- libs/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-313.pyc +0 -0
- libs/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-310.pyc +0 -0
- libs/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-313.pyc +0 -0
- libs/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-310.pyc +0 -0
- libs/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-313.pyc +0 -0
- libs/IndicTransToolkit/IndicTransToolkit/__pycache__/processor.cpython-310.pyc +0 -0
- libs/IndicTransToolkit/IndicTransToolkit/collator.py +74 -0
- libs/IndicTransToolkit/IndicTransToolkit/evaluator.py +151 -0
- libs/IndicTransToolkit/IndicTransToolkit/processor.c +0 -0
- libs/IndicTransToolkit/IndicTransToolkit/processor.cp313-win_amd64.pyd +3 -0
- libs/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so +3 -0
- libs/IndicTransToolkit/IndicTransToolkit/processor.pyx +503 -0
- libs/IndicTransToolkit/IndicTransToolkit/version.py +1 -0
- libs/IndicTransToolkit/IndicTransToolkit/version.txt +1 -0
- libs/IndicTransToolkit/LICENSE +21 -0
- libs/IndicTransToolkit/README.md +97 -0
- libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so +3 -0
- libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so +3 -0
- libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o +3 -0
- libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o +3 -0
- libs/IndicTransToolkit/pyproject.toml +25 -0
- libs/IndicTransToolkit/requirements.txt +8 -0
- libs/IndicTransToolkit/setup.py +61 -0
- libs/indic_nlp_library/.gitignore +6 -0
- libs/indic_nlp_library/LICENSE +9 -0
- libs/indic_nlp_library/README.md +22 -0
- libs/indic_nlp_library/RESOURCES/script/all_script_phonetic_data.csv +113 -0
- libs/indic_nlp_library/RESOURCES/script/english_arpabet_list.csv +46 -0
- libs/indic_nlp_library/RESOURCES/script/english_script_phonetic_data.csv +47 -0
- libs/indic_nlp_library/RESOURCES/script/tamil_script_phonetic_data.csv +113 -0
- libs/indic_nlp_library/RESOURCES/transliterate/offset_itrans_map.csv +129 -0
- libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/PKG-INFO +52 -0
- libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/SOURCES.txt +40 -0
- libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/dependency_links.txt +1 -0
- libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/requires.txt +5 -0
- libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/top_level.txt +1 -0
.dockerignore
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
*.pyc
|
3 |
+
*.pyo
|
4 |
+
*.pyd
|
5 |
+
.env
|
6 |
+
.venv/
|
7 |
+
.git/
|
.gitattributes
CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
37 |
+
libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
38 |
+
libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o filter=lfs diff=lfs merge=lfs -text
|
39 |
+
libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o filter=lfs diff=lfs merge=lfs -text
|
40 |
+
libs/IndicTransToolkit/IndicTransToolkit/processor.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
|
41 |
+
libs/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
DockerFile
ADDED
File without changes
|
app.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from fastapi import FastAPI, HTTPException
|
2 |
+
from pydantic import BaseModel
|
3 |
+
import torch
|
4 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
5 |
+
import os
|
6 |
+
import sys
|
7 |
+
|
8 |
+
sys.path.append(os.path.abspath("libs/IndicTransToolkit"))
|
9 |
+
from IndicTransToolkit.processor import IndicProcessor
|
10 |
+
|
11 |
+
app = FastAPI(title="IndicTrans Translator API")
|
12 |
+
|
13 |
+
ip = IndicProcessor(inference=True)
|
14 |
+
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
|
15 |
+
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
|
16 |
+
|
17 |
+
class TranslationRequest(BaseModel):
|
18 |
+
text: str
|
19 |
+
target_lang: str
|
20 |
+
|
21 |
+
@app.post("/translate")
|
22 |
+
def translate_text(req: TranslationRequest):
|
23 |
+
if not req.text.strip():
|
24 |
+
raise HTTPException(status_code=400, detail="Input text is empty.")
|
25 |
+
|
26 |
+
try:
|
27 |
+
batch = ip.preprocess_batch([req.text], src_lang="eng_Latn", tgt_lang=req.target_lang)
|
28 |
+
batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")
|
29 |
+
|
30 |
+
with torch.inference_mode():
|
31 |
+
outputs = model.generate(**batch, num_beams=5, max_length=256)
|
32 |
+
|
33 |
+
with tokenizer.as_target_tokenizer():
|
34 |
+
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
35 |
+
|
36 |
+
translated = ip.postprocess_batch(decoded, lang=req.target_lang)[0]
|
37 |
+
return {"translation": translated}
|
38 |
+
|
39 |
+
except Exception as e:
|
40 |
+
raise HTTPException(status_code=500, detail=str(e))
|
libs/IndicTransToolkit/.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dist/
|
2 |
+
build/
|
3 |
+
*.egg-info/
|
4 |
+
*/*/__pycache__/
|
libs/IndicTransToolkit/CHANGELOG.md
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Changelog
|
2 |
+
|
3 |
+
# 📢 Release v1.0.3
|
4 |
+
- 🚨 The `IndicProcessor` class has been re-written in [Cython](https://github.com/cython/cython) for faster implementation. This gives us atleast `+10 lines/s`.
|
5 |
+
- A new `visualize` argument as been added to `preprocess_batch` to track the processing with a `tqdm` bar.
|
6 |
+
|
7 |
+
# 📢 Release v1.0.2
|
8 |
+
- The repository has been renamed to `IndicTransToolkit`.
|
9 |
+
- 🚨 The custom tokenizer is now **removed** from the repository. Please revert to a previous commit ([v1.0.1](https://github.com/VarunGumma/IndicTransToolkit/tree/0e68fb5872f4d821578a5252f90ad43c9649370f)) to use it **(strongly discouraged)**. The official _(and only tokenizer)_ is available on HF along with the models.
|
10 |
+
|
11 |
+
# 📢 Release v1.0.0
|
12 |
+
- The [PreTrainedTokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer) for IndicTrans2 is now available on HF 🎉🎉 Note that, you still need the `IndicProcessor` to pre-process the sentences before tokenization.
|
13 |
+
- 🚨 **In favor of the standard PreTrainedTokenizer, we deprecated the custom tokenizer. However, this custom tokenizer will still be available here for backward compatibility, but no further updates/bug-fixes will be provided.**
|
14 |
+
- The `indic_evaluate` function is now consolidated into a concrete `IndicEvaluator` class.
|
15 |
+
- The data collation function for training is consolidated into a concrete `IndicDataCollator` class.
|
16 |
+
- A simple batching method is now available in the `IndicProcessor`.
|
libs/IndicTransToolkit/IndicTransToolkit.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Metadata-Version: 2.4
|
2 |
+
Name: IndicTransToolkit
|
3 |
+
Version: 1.0.3
|
4 |
+
Summary: A simple, consistent, and extendable module for IndicTrans2 tokenizer compatible with HuggingFace models
|
5 |
+
Home-page: https://github.com/VarunGumma/IndicTransToolkit
|
6 |
+
Author: Varun Gumma
|
7 |
+
Author-email: [email protected]
|
8 |
+
License: MIT
|
9 |
+
Classifier: Programming Language :: Python :: 3
|
10 |
+
Classifier: License :: OSI Approved :: MIT License
|
11 |
+
Classifier: Operating System :: OS Independent
|
12 |
+
Requires-Python: >=3.8
|
13 |
+
Description-Content-Type: text/markdown
|
14 |
+
License-File: LICENSE
|
15 |
+
Requires-Dist: setuptools>=68.2.2
|
16 |
+
Requires-Dist: torch
|
17 |
+
Requires-Dist: cython
|
18 |
+
Requires-Dist: sacremoses
|
19 |
+
Requires-Dist: sentencepiece
|
20 |
+
Requires-Dist: transformers
|
21 |
+
Requires-Dist: sacrebleu
|
22 |
+
Requires-Dist: indic-nlp-library-IT2@ git+https://github.com/VarunGumma/indic_nlp_library.git
|
23 |
+
Dynamic: author
|
24 |
+
Dynamic: author-email
|
25 |
+
Dynamic: classifier
|
26 |
+
Dynamic: description
|
27 |
+
Dynamic: description-content-type
|
28 |
+
Dynamic: home-page
|
29 |
+
Dynamic: license
|
30 |
+
Dynamic: license-file
|
31 |
+
Dynamic: requires-dist
|
32 |
+
Dynamic: requires-python
|
33 |
+
Dynamic: summary
|
34 |
+
|
35 |
+
# IndicTransToolkit
|
36 |
+
|
37 |
+
## About
|
38 |
+
The goal of this repository is to provide a simple, modular, and extendable toolkit for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and be compatible with the HuggingFace models released. Please refer to the `CHANGELOG.md` for latest developments.
|
39 |
+
|
40 |
+
## Pre-requisites
|
41 |
+
- `Python 3.8+`
|
42 |
+
- [Indic NLP Library](https://github.com/VarunGumma/indic_nlp_library)
|
43 |
+
- Other requirements as listed in `requirements.txt`
|
44 |
+
|
45 |
+
## Configuration
|
46 |
+
- Editable installation (Note, this may take a while):
|
47 |
+
```bash
|
48 |
+
git clone https://github.com/VarunGumma/IndicTransToolkit
|
49 |
+
cd IndicTransToolkit
|
50 |
+
|
51 |
+
pip install --editable . --use-pep517 # required for pip >= 25.0
|
52 |
+
|
53 |
+
# in case it fails, try:
|
54 |
+
# pip install --editable . --use-pep517 --config-settings editable_mode=compat
|
55 |
+
```
|
56 |
+
|
57 |
+
## Examples
|
58 |
+
For the training usecase, please refer [here](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface).
|
59 |
+
|
60 |
+
### PreTainedTokenizer
|
61 |
+
```python
|
62 |
+
import torch
|
63 |
+
from IndicTransToolkit.processor import IndicProcessor # NOW IMPLEMENTED IN CYTHON !!
|
64 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
65 |
+
|
66 |
+
ip = IndicProcessor(inference=True)
|
67 |
+
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
|
68 |
+
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
|
69 |
+
|
70 |
+
sentences = [
|
71 |
+
"This is a test sentence.",
|
72 |
+
"This is another longer different test sentence.",
|
73 |
+
"Please send an SMS to 9876543210 and an email on [email protected] by 15th October, 2023.",
|
74 |
+
]
|
75 |
+
|
76 |
+
batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="hin_Deva", visualize=False) # set it to visualize=True to print a progress bar
|
77 |
+
batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")
|
78 |
+
|
79 |
+
with torch.inference_mode():
|
80 |
+
outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)
|
81 |
+
|
82 |
+
with tokenizer.as_target_tokenizer():
|
83 |
+
# This scoping is absolutely necessary, as it will instruct the tokenizer to tokenize using the target vocabulary.
|
84 |
+
# Failure to use this scoping will result in gibberish/unexpected predictions as the output will be de-tokenized with the source vocabulary instead.
|
85 |
+
outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
86 |
+
|
87 |
+
outputs = ip.postprocess_batch(outputs, lang="hin_Deva")
|
88 |
+
print(outputs)
|
89 |
+
|
90 |
+
>>> ['यह एक परीक्षण वाक्य है।', 'यह एक और लंबा अलग परीक्षण वाक्य है।', 'कृपया 9876543210 पर एक एस. एम. एस. भेजें और 15 अक्टूबर, 2023 तक [email protected] पर एक ईमेल भेजें।']
|
91 |
+
```
|
92 |
+
|
93 |
+
### Evaluation
|
94 |
+
- `IndicEvaluator` is a python implementation of [compute_metrics.sh](https://github.com/AI4Bharat/IndicTrans2/blob/main/compute_metrics.sh).
|
95 |
+
- We have found that this python implementation gives slightly lower scores than the original `compute_metrics.sh`. So, please use this function cautiously, and feel free to raise a PR if you have found the bug/fix.
|
96 |
+
```python
|
97 |
+
from IndicTransToolkit import IndicEvaluator
|
98 |
+
|
99 |
+
# this method returns a dictionary with BLEU and ChrF2++ scores with appropriate signatures
|
100 |
+
evaluator = IndicEvaluator()
|
101 |
+
scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=pred_file, refs=ref_file)
|
102 |
+
|
103 |
+
# alternatively, you can pass the list of predictions and references instead of files
|
104 |
+
# scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=preds, refs=refs)
|
105 |
+
```
|
106 |
+
|
107 |
+
## Authors
|
108 |
+
- Varun Gumma ([email protected])
|
109 |
+
- Jay Gala ([email protected])
|
110 |
+
- Pranjal Agadh Chitale ([email protected])
|
111 |
+
- Raj Dabre ([email protected])
|
112 |
+
|
113 |
+
|
114 |
+
## Bugs and Contribution
|
115 |
+
Since this a bleeding-edge module, you may encounter broken stuff and import issues once in a while. In case you encounter any bugs or want additional functionalities, please feel free to raise `Issues`/`Pull Requests` or contact the authors.
|
116 |
+
|
117 |
+
|
118 |
+
## Citation
|
119 |
+
If you use our codebase, or models, please do cite the following paper:
|
120 |
+
```bibtex
|
121 |
+
@article{
|
122 |
+
gala2023indictrans,
|
123 |
+
title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
|
124 |
+
author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan},
|
125 |
+
journal={Transactions on Machine Learning Research},
|
126 |
+
issn={2835-8856},
|
127 |
+
year={2023},
|
128 |
+
url={https://openreview.net/forum?id=vfT4YuzAYA},
|
129 |
+
note={}
|
130 |
+
}
|
131 |
+
```
|
libs/IndicTransToolkit/IndicTransToolkit.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
LICENSE
|
2 |
+
README.md
|
3 |
+
pyproject.toml
|
4 |
+
setup.py
|
5 |
+
IndicTransToolkit/__init__.py
|
6 |
+
IndicTransToolkit/collator.py
|
7 |
+
IndicTransToolkit/evaluator.py
|
8 |
+
IndicTransToolkit/processor.c
|
9 |
+
IndicTransToolkit/version.py
|
10 |
+
IndicTransToolkit.egg-info/PKG-INFO
|
11 |
+
IndicTransToolkit.egg-info/SOURCES.txt
|
12 |
+
IndicTransToolkit.egg-info/dependency_links.txt
|
13 |
+
IndicTransToolkit.egg-info/not-zip-safe
|
14 |
+
IndicTransToolkit.egg-info/requires.txt
|
15 |
+
IndicTransToolkit.egg-info/top_level.txt
|
libs/IndicTransToolkit/IndicTransToolkit.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
libs/IndicTransToolkit/IndicTransToolkit.egg-info/not-zip-safe
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
libs/IndicTransToolkit/IndicTransToolkit.egg-info/requires.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
setuptools>=68.2.2
|
2 |
+
torch
|
3 |
+
cython
|
4 |
+
sacremoses
|
5 |
+
sentencepiece
|
6 |
+
transformers
|
7 |
+
sacrebleu
|
8 |
+
indic-nlp-library-IT2@ git+https://github.com/VarunGumma/indic_nlp_library.git
|
libs/IndicTransToolkit/IndicTransToolkit.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
IndicTransToolkit
|
libs/IndicTransToolkit/IndicTransToolkit/__init__.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .evaluator import IndicEvaluator
|
2 |
+
from .collator import IndicDataCollator
|
3 |
+
from .processor import IndicProcessor
|
4 |
+
|
5 |
+
__all__ = [
|
6 |
+
"IndicEvaluator",
|
7 |
+
"IndicDataCollator",
|
8 |
+
"IndicProcessor",
|
9 |
+
]
|
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (337 Bytes). View file
|
|
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (378 Bytes). View file
|
|
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-310.pyc
ADDED
Binary file (2.14 kB). View file
|
|
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-313.pyc
ADDED
Binary file (3.2 kB). View file
|
|
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-310.pyc
ADDED
Binary file (4.15 kB). View file
|
|
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-313.pyc
ADDED
Binary file (6.37 kB). View file
|
|
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/processor.cpython-310.pyc
ADDED
Binary file (11.7 kB). View file
|
|
libs/IndicTransToolkit/IndicTransToolkit/collator.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from dataclasses import dataclass
|
3 |
+
from typing import Any, Optional, Union
|
4 |
+
|
5 |
+
from transformers.utils import PaddingStrategy
|
6 |
+
from transformers.tokenization_utils import PreTrainedTokenizerBase
|
7 |
+
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
|
8 |
+
|
9 |
+
|
10 |
+
@dataclass
|
11 |
+
class IndicDataCollator:
|
12 |
+
tokenizer: PreTrainedTokenizerBase
|
13 |
+
model: Optional[Any] = None
|
14 |
+
padding: Union[bool, str, PaddingStrategy] = True
|
15 |
+
max_length: Optional[int] = None
|
16 |
+
pad_to_multiple_of: Optional[int] = None
|
17 |
+
label_pad_token_id: int = -100
|
18 |
+
return_tensors: str = "pt"
|
19 |
+
|
20 |
+
def __call__(self, features, return_tensors=None):
|
21 |
+
|
22 |
+
if return_tensors is None:
|
23 |
+
return_tensors = self.return_tensors
|
24 |
+
|
25 |
+
labels = (
|
26 |
+
[feature["labels"] for feature in features]
|
27 |
+
if "labels" in features[0].keys()
|
28 |
+
else None
|
29 |
+
)
|
30 |
+
# We have to pad the labels before calling `tokenizer.pad` as
|
31 |
+
# this method won't pad them and needs them of the same length to return tensors.
|
32 |
+
if labels is not None:
|
33 |
+
max_label_length = max(len(l) for l in labels)
|
34 |
+
if self.pad_to_multiple_of is not None:
|
35 |
+
max_label_length = (
|
36 |
+
(max_label_length + self.pad_to_multiple_of - 1)
|
37 |
+
// self.pad_to_multiple_of
|
38 |
+
* self.pad_to_multiple_of
|
39 |
+
)
|
40 |
+
|
41 |
+
# fairseq by defaults right pad the labels for seq2seq tasks
|
42 |
+
for feature in features:
|
43 |
+
remainder = [self.label_pad_token_id] * (
|
44 |
+
max_label_length - len(feature["labels"])
|
45 |
+
)
|
46 |
+
if isinstance(feature["labels"], list):
|
47 |
+
feature["labels"] = feature["labels"] + remainder
|
48 |
+
else:
|
49 |
+
feature["labels"] = np.concatenate(
|
50 |
+
[feature["labels"], remainder]
|
51 |
+
).astype(np.int64)
|
52 |
+
|
53 |
+
self.tokenizer.padding_side = "left"
|
54 |
+
features = pad_without_fast_tokenizer_warning(
|
55 |
+
self.tokenizer,
|
56 |
+
features,
|
57 |
+
padding=self.padding,
|
58 |
+
max_length=self.max_length,
|
59 |
+
return_tensors=return_tensors,
|
60 |
+
pad_to_multiple_of=self.pad_to_multiple_of,
|
61 |
+
)
|
62 |
+
|
63 |
+
# prepare decoder_input_ids
|
64 |
+
if (
|
65 |
+
labels is not None
|
66 |
+
and self.model is not None
|
67 |
+
and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
|
68 |
+
):
|
69 |
+
decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(
|
70 |
+
labels=features["labels"]
|
71 |
+
)
|
72 |
+
features["decoder_input_ids"] = decoder_input_ids
|
73 |
+
|
74 |
+
return features
|
libs/IndicTransToolkit/IndicTransToolkit/evaluator.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Union
|
2 |
+
from sacrebleu.metrics import CHRF, BLEU
|
3 |
+
|
4 |
+
from indicnlp.tokenize import indic_tokenize
|
5 |
+
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
|
6 |
+
|
7 |
+
|
8 |
+
class IndicEvaluator:
|
9 |
+
def __init__(self):
|
10 |
+
# === Metrics ===
|
11 |
+
self._chrf2_metric = CHRF(word_order=2)
|
12 |
+
self._bleu_metric_13a = BLEU(tokenize="13a")
|
13 |
+
self._bleu_metric_none = BLEU(tokenize="none")
|
14 |
+
|
15 |
+
# === Normalizer factory and cache ===
|
16 |
+
self._indic_norm_factory = IndicNormalizerFactory()
|
17 |
+
self._normalizer_cache = {} # Cache normalizers by iso_lang
|
18 |
+
|
19 |
+
# === FLORES -> ISO codes ===
|
20 |
+
self._flores_codes = {
|
21 |
+
"asm_Beng": "as",
|
22 |
+
"awa_Deva": "hi",
|
23 |
+
"ben_Beng": "bn",
|
24 |
+
"bho_Deva": "hi",
|
25 |
+
"brx_Deva": "hi",
|
26 |
+
"doi_Deva": "hi",
|
27 |
+
"eng_Latn": "en",
|
28 |
+
"gom_Deva": "kK",
|
29 |
+
"gon_Deva": "hi",
|
30 |
+
"guj_Gujr": "gu",
|
31 |
+
"hin_Deva": "hi",
|
32 |
+
"hne_Deva": "hi",
|
33 |
+
"kan_Knda": "kn",
|
34 |
+
"kas_Arab": "ur",
|
35 |
+
"kas_Deva": "hi",
|
36 |
+
"kha_Latn": "en",
|
37 |
+
"lus_Latn": "en",
|
38 |
+
"mag_Deva": "hi",
|
39 |
+
"mai_Deva": "hi",
|
40 |
+
"mal_Mlym": "ml",
|
41 |
+
"mar_Deva": "mr",
|
42 |
+
"mni_Beng": "bn",
|
43 |
+
"mni_Mtei": "hi",
|
44 |
+
"npi_Deva": "ne",
|
45 |
+
"ory_Orya": "or",
|
46 |
+
"pan_Guru": "pa",
|
47 |
+
"san_Deva": "hi",
|
48 |
+
"sat_Olck": "or",
|
49 |
+
"snd_Arab": "ur",
|
50 |
+
"snd_Deva": "hi",
|
51 |
+
"tam_Taml": "ta",
|
52 |
+
"tel_Telu": "te",
|
53 |
+
"urd_Arab": "ur",
|
54 |
+
"unr_Deva": "hi",
|
55 |
+
}
|
56 |
+
|
57 |
+
def _get_normalizer(self, iso_lang: str):
|
58 |
+
"""
|
59 |
+
Return a cached normalizer for a given iso_lang.
|
60 |
+
"""
|
61 |
+
if iso_lang not in self._normalizer_cache:
|
62 |
+
self._normalizer_cache[iso_lang] = self._indic_norm_factory.get_normalizer(iso_lang)
|
63 |
+
return self._normalizer_cache[iso_lang]
|
64 |
+
|
65 |
+
def _preprocess(self, sentences: List[str], lang: str) -> List[str]:
|
66 |
+
"""
|
67 |
+
Preprocess the sentences using IndicNLP:
|
68 |
+
1) Normalization (using a cached normalizer),
|
69 |
+
2) Trivial tokenization.
|
70 |
+
"""
|
71 |
+
iso_lang = self._flores_codes.get(lang, "hi")
|
72 |
+
# Fetch from cache to avoid reconstructing the normalizer
|
73 |
+
normalizer = self._get_normalizer(iso_lang)
|
74 |
+
|
75 |
+
# Local references for speed
|
76 |
+
trivial_tokenize = indic_tokenize.trivial_tokenize
|
77 |
+
normalize_fn = normalizer.normalize
|
78 |
+
|
79 |
+
processed_sentences = []
|
80 |
+
for line in sentences:
|
81 |
+
# single .strip() before normalizing
|
82 |
+
line = line.strip()
|
83 |
+
norm_line = normalize_fn(line)
|
84 |
+
tokens = trivial_tokenize(norm_line, iso_lang)
|
85 |
+
processed_sentences.append(" ".join(tokens))
|
86 |
+
|
87 |
+
return processed_sentences
|
88 |
+
|
89 |
+
def evaluate(
|
90 |
+
self,
|
91 |
+
tgt_lang: str,
|
92 |
+
preds: Union[List[str], str],
|
93 |
+
refs: Union[List[str], str],
|
94 |
+
):
|
95 |
+
"""
|
96 |
+
Evaluate BLEU and chrF2++ scores for the given predictions and references.
|
97 |
+
- If preds/refs are strings (filenames), read them from disk.
|
98 |
+
- If they are lists, evaluate them directly.
|
99 |
+
- For non-English languages, applies Indic NLP preprocessing before scoring.
|
100 |
+
"""
|
101 |
+
assert preds is not None and refs is not None, "Predictions and References cannot be None"
|
102 |
+
|
103 |
+
# Convert file paths to lists if needed
|
104 |
+
if isinstance(preds, str):
|
105 |
+
with open(preds, "r", encoding="utf-8") as fp:
|
106 |
+
preds = [line.strip() for line in fp]
|
107 |
+
if isinstance(refs, str):
|
108 |
+
with open(refs, "r", encoding="utf-8") as fr:
|
109 |
+
refs = [line.strip() for line in fr]
|
110 |
+
|
111 |
+
assert len(preds) == len(refs), "Number of predictions and references do not match"
|
112 |
+
|
113 |
+
# Local references to metrics for speed
|
114 |
+
bleu_none = self._bleu_metric_none
|
115 |
+
bleu_13a = self._bleu_metric_13a
|
116 |
+
chrf2 = self._chrf2_metric
|
117 |
+
|
118 |
+
scores = {}
|
119 |
+
|
120 |
+
# For English (eng_Latn), skip Indic NLP normalization
|
121 |
+
if tgt_lang != "eng_Latn":
|
122 |
+
preds_ = self._preprocess(preds, tgt_lang)
|
123 |
+
refs_ = self._preprocess(refs, tgt_lang)
|
124 |
+
|
125 |
+
bleu_score = bleu_none.corpus_score(preds_, [refs_])
|
126 |
+
chrf_score = chrf2.corpus_score(preds_, [refs_])
|
127 |
+
|
128 |
+
scores["bleu"] = {
|
129 |
+
"score": round(bleu_score.score, 1),
|
130 |
+
"signature": bleu_none.get_signature().format(),
|
131 |
+
}
|
132 |
+
scores["chrF2++"] = {
|
133 |
+
"score": round(chrf_score.score, 1),
|
134 |
+
"signature": chrf2.get_signature().format(),
|
135 |
+
}
|
136 |
+
|
137 |
+
else:
|
138 |
+
# For English, 13a tokenization is standard
|
139 |
+
bleu_score = bleu_13a.corpus_score(preds, [refs])
|
140 |
+
chrf_score = chrf2.corpus_score(preds, [refs])
|
141 |
+
|
142 |
+
scores["bleu"] = {
|
143 |
+
"score": round(bleu_score.score, 1),
|
144 |
+
"signature": bleu_13a.get_signature().format(),
|
145 |
+
}
|
146 |
+
scores["chrF2++"] = {
|
147 |
+
"score": round(chrf_score.score, 1),
|
148 |
+
"signature": chrf2.get_signature().format(),
|
149 |
+
}
|
150 |
+
|
151 |
+
return scores
|
libs/IndicTransToolkit/IndicTransToolkit/processor.c
ADDED
The diff for this file is too large to render.
See raw diff
|
|
libs/IndicTransToolkit/IndicTransToolkit/processor.cp313-win_amd64.pyd
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6171c90f22a4602e67c36a791a618ccbf0d3703f17ea0c214186cb5fe3030487
|
3 |
+
size 139776
|
libs/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c1a101ecb27adaf367f00c90b3f8e96e7fbda3bf0560d48c368fec3750a040a4
|
3 |
+
size 229200
|
libs/IndicTransToolkit/IndicTransToolkit/processor.pyx
ADDED
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# cython: language_level=3, boundscheck=False, cdivision=True, wraparound=False
|
2 |
+
"""
|
3 |
+
Cython version of the IndicProcessor class with optimizations for performance.
|
4 |
+
Only preprocess_batch and postprocess_batch are exposed as cpdef methods.
|
5 |
+
All other methods are internal (cdef) for optimized Cython usage.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import regex as re
|
9 |
+
from tqdm import tqdm
|
10 |
+
from queue import Queue
|
11 |
+
from typing import List, Dict, Union
|
12 |
+
|
13 |
+
# Importing Python objects since these libraries don't offer C-extensions
|
14 |
+
from indicnlp.tokenize import indic_tokenize, indic_detokenize
|
15 |
+
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
|
16 |
+
from sacremoses import MosesPunctNormalizer, MosesTokenizer, MosesDetokenizer
|
17 |
+
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
|
18 |
+
|
19 |
+
|
20 |
+
cdef class IndicProcessor:
    """
    Cython implementation of the IndicProcessor for IndicTrans2.

    Only ``preprocess_batch`` and ``postprocess_batch`` are exposed as
    ``cpdef`` entry points; everything else is an internal ``cdef`` helper.
    """

    cdef public bint inference

    # Precompiled regex patterns used by punctuation normalization.
    cdef object _MULTISPACE_REGEX
    cdef object _DIGIT_SPACE_PERCENT
    cdef object _DOUBLE_QUOT_PUNC
    cdef object _DIGIT_NBSP_DIGIT
    cdef object _END_BRACKET_SPACE_PUNC_REGEX

    # Entity patterns wrapped with placeholders during inference.
    cdef object _URL_PATTERN
    cdef object _NUMERAL_PATTERN
    cdef object _EMAIL_PATTERN
    cdef object _OTHER_PATTERN

    cdef list _PUNC_REPLACEMENTS
    cdef list _INDIC_FAILURE_CASES

    cdef dict _flores_codes
    cdef dict _digits_translation_table

    # FIFO of placeholder maps (a Python Queue), one entry per sentence
    # wrapped in _wrap_with_placeholders; drained again in _postprocess.
    cdef object _placeholder_entity_maps

    # Moses tools and the transliterator are plain Python objects.
    cdef object _en_tok
    cdef object _en_normalizer
    cdef object _en_detok
    cdef object _xliterator

    def __cinit__(self, bint inference=True):
        """
        Initialize language tables, digit translation table, tools,
        and all precompiled regex patterns.
        """
        self.inference = inference

        ##############################
        # FLORES -> ISO CODES
        ##############################
        self._flores_codes = {
            "asm_Beng": "as",
            "awa_Deva": "hi",
            "ben_Beng": "bn",
            "bho_Deva": "hi",
            "brx_Deva": "hi",
            "doi_Deva": "hi",
            "eng_Latn": "en",
            "gom_Deva": "kK",
            "gon_Deva": "hi",
            "guj_Gujr": "gu",
            "hin_Deva": "hi",
            "hne_Deva": "hi",
            "kan_Knda": "kn",
            "kas_Arab": "ur",
            "kas_Deva": "hi",
            "kha_Latn": "en",
            "lus_Latn": "en",
            "mag_Deva": "hi",
            "mai_Deva": "hi",
            "mal_Mlym": "ml",
            "mar_Deva": "mr",
            "mni_Beng": "bn",
            "mni_Mtei": "hi",
            "npi_Deva": "ne",
            "ory_Orya": "or",
            "pan_Guru": "pa",
            "san_Deva": "hi",
            "sat_Olck": "or",
            "snd_Arab": "ur",
            "snd_Deva": "hi",
            "tam_Taml": "ta",
            "tel_Telu": "te",
            "urd_Arab": "ur",
            "unr_Deva": "hi",
        }

        ##############################
        # INDIC DIGIT TRANSLATION (str.translate)
        ##############################
        self._digits_translation_table = {}
        cdef dict native_digits = {
            "\u09e6": "0", "\u0ae6": "0", "\u0ce6": "0", "\u0966": "0",
            "\u0660": "0", "\uabf0": "0", "\u0b66": "0", "\u0a66": "0",
            "\u1c50": "0", "\u06f0": "0",

            "\u09e7": "1", "\u0ae7": "1", "\u0967": "1", "\u0ce7": "1",
            "\u06f1": "1", "\uabf1": "1", "\u0b67": "1", "\u0a67": "1",
            "\u1c51": "1", "\u0c67": "1",

            "\u09e8": "2", "\u0ae8": "2", "\u0968": "2", "\u0ce8": "2",
            "\u06f2": "2", "\uabf2": "2", "\u0b68": "2", "\u0a68": "2",
            "\u1c52": "2", "\u0c68": "2",

            "\u09e9": "3", "\u0ae9": "3", "\u0969": "3", "\u0ce9": "3",
            "\u06f3": "3", "\uabf3": "3", "\u0b69": "3", "\u0a69": "3",
            "\u1c53": "3", "\u0c69": "3",

            "\u09ea": "4", "\u0aea": "4", "\u096a": "4", "\u0cea": "4",
            "\u06f4": "4", "\uabf4": "4", "\u0b6a": "4", "\u0a6a": "4",
            "\u1c54": "4", "\u0c6a": "4",

            "\u09eb": "5", "\u0aeb": "5", "\u096b": "5", "\u0ceb": "5",
            "\u06f5": "5", "\uabf5": "5", "\u0b6b": "5", "\u0a6b": "5",
            "\u1c55": "5", "\u0c6b": "5",

            "\u09ec": "6", "\u0aec": "6", "\u096c": "6", "\u0cec": "6",
            "\u06f6": "6", "\uabf6": "6", "\u0b6c": "6", "\u0a6c": "6",
            "\u1c56": "6", "\u0c6c": "6",

            "\u09ed": "7", "\u0aed": "7", "\u096d": "7", "\u0ced": "7",
            "\u06f7": "7", "\uabf7": "7", "\u0b6d": "7", "\u0a6d": "7",
            "\u1c57": "7", "\u0c6d": "7",

            "\u09ee": "8", "\u0aee": "8", "\u096e": "8", "\u0cee": "8",
            "\u06f8": "8", "\uabf8": "8", "\u0b6e": "8", "\u0a6e": "8",
            "\u1c58": "8", "\u0c6e": "8",

            "\u09ef": "9", "\u0aef": "9", "\u096f": "9", "\u0cef": "9",
            "\u06f9": "9", "\uabf9": "9", "\u0b6f": "9", "\u0a6f": "9",
            "\u1c59": "9", "\u0c6f": "9",
        }
        for native_char, ascii_digit in native_digits.items():
            self._digits_translation_table[ord(native_char)] = ascii_digit

        # ASCII digits map to themselves so str.translate is a no-op on them.
        for code_point in range(ord('0'), ord('9') + 1):
            self._digits_translation_table[code_point] = chr(code_point)

        ##############################
        # PLACEHOLDER MAP QUEUE
        ##############################
        self._placeholder_entity_maps = Queue()

        ##############################
        # MOSES (as Python objects)
        ##############################
        self._en_tok = MosesTokenizer(lang="en")
        self._en_normalizer = MosesPunctNormalizer()
        self._en_detok = MosesDetokenizer(lang="en")

        ##############################
        # TRANSLITERATOR (Python object)
        ##############################
        self._xliterator = UnicodeIndicTransliterator()

        ##############################
        # Precompiled Patterns
        ##############################
        self._MULTISPACE_REGEX = re.compile(r"[ ]{2,}")
        self._DIGIT_SPACE_PERCENT = re.compile(r"(\d) %")
        self._DOUBLE_QUOT_PUNC = re.compile(r"\"([,\.]+)")
        self._DIGIT_NBSP_DIGIT = re.compile(r"(\d) (\d)")
        self._END_BRACKET_SPACE_PUNC_REGEX = re.compile(r"\) ([\.!:?;,])")

        self._URL_PATTERN = re.compile(
            r"\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b"
        )
        self._NUMERAL_PATTERN = re.compile(
            r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"
        )
        self._EMAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}")
        self._OTHER_PATTERN = re.compile(r"[A-Za-z0-9]*[#|@]\w+")

        # Ordered (pattern, replacement) pairs applied first in _punc_norm.
        self._PUNC_REPLACEMENTS = [
            (re.compile(r"\r"), ""),
            (re.compile(r"\(\s*"), "("),
            (re.compile(r"\s*\)"), ")"),
            (re.compile(r"\s:\s?"), ":"),
            (re.compile(r"\s;\s?"), ";"),
            (re.compile(r"[`´‘‚’]"), "'"),
            (re.compile(r"[„“”«»]"), '"'),
            (re.compile(r"[–—]"), "-"),
            (re.compile(r"\.\.\."), "..."),
            (re.compile(r" %"), "%"),
            (re.compile(r"nº "), "nº "),
            (re.compile(r" ºC"), " ºC"),
            (re.compile(r" [?!;]"), lambda m: m.group(0).strip()),
            (re.compile(r", "), ", "),
        ]

        # Known garbled renderings of "<ID .>" tags emitted by the model in
        # various Indic scripts; each gets placeholder-map entries too.
        self._INDIC_FAILURE_CASES = [
            "آی ڈی ",
            "ꯑꯥꯏꯗꯤ",
            "आईडी",
            "आई . डी . ",
            "आई . डी .",
            "आई. डी. ",
            "आई. डी.",
            "आय. डी. ",
            "आय. डी.",
            "आय . डी . ",
            "आय . डी .",
            "ऐटि",
            "آئی ڈی ",
            "ᱟᱭᱰᱤ ᱾",
            "आयडी",
            "ऐडि",
            "आइडि",
            "ᱟᱭᱰᱤ",
        ]

    # Internal Method: Apply punctuation replacements
    cdef str _apply_punc_replacements(self, str text, list replacements) except *:
        """
        Run every (pattern, replacement) pair over *text*, in order.
        """
        for compiled_pattern, replacement in replacements:
            text = compiled_pattern.sub(replacement, text)
        return text

    # Internal Method: Punctuation Normalization
    cdef str _punc_norm(self, str text) except *:
        """
        Normalize punctuation in a small number of regex passes and strip
        surrounding whitespace.
        """
        # 1) Ordered replacement table
        text = self._apply_punc_replacements(text, self._PUNC_REPLACEMENTS)

        # 2) Remaining targeted patterns
        text = self._MULTISPACE_REGEX.sub(" ", text)
        text = self._END_BRACKET_SPACE_PUNC_REGEX.sub(r")\1", text)
        text = self._DIGIT_SPACE_PERCENT.sub(r"\1%", text)
        text = self._DOUBLE_QUOT_PUNC.sub(r'\1"', text)
        text = self._DIGIT_NBSP_DIGIT.sub(r"\1.\2", text)
        return text.strip()

    # Internal Method: Wrap Text with Placeholders
    cdef str _wrap_with_placeholders(self, str text) except *:
        """
        Replace emails/URLs/numerals/handles in *text* with <IDn> placeholders
        and push the placeholder->original map onto the queue so that
        _postprocess can restore the original spans later.
        """
        cdef int serial_no = 1
        cdef dict placeholder_entity_map = {}
        cdef list patterns = [
            self._EMAIL_PATTERN,
            self._URL_PATTERN,
            self._NUMERAL_PATTERN,
            self._OTHER_PATTERN,
        ]
        cdef object pattern
        cdef str match
        cdef str base_placeholder
        cdef str variant
        cdef str indic_case

        for pattern in patterns:
            # Deduplicate matches so each distinct span gets one serial number.
            for match in set(pattern.findall(text)):
                # Skip very short URLs / numerals (mostly false positives).
                if pattern is self._URL_PATTERN:
                    if len(match.replace(".", "")) < 4:
                        continue
                if pattern is self._NUMERAL_PATTERN:
                    if len(match.replace(" ", "").replace(".", "").replace(":", "")) < 4:
                        continue

                base_placeholder = f"<ID{serial_no}>"
                # Register every bracket/space variant the model might emit.
                for variant in (
                    f"<ID{serial_no}>",
                    f"< ID{serial_no} >",
                    f"[ID{serial_no}]",
                    f"[ ID{serial_no} ]",
                    f"[ID {serial_no}]",
                    f"<ID{serial_no}]",
                    f"< ID{serial_no}]",
                    f"<ID{serial_no} ]",
                ):
                    placeholder_entity_map[variant] = match

                # Register the known garbled Indic-script renderings of "ID".
                for indic_case in self._INDIC_FAILURE_CASES:
                    for variant in (
                        f"<{indic_case}{serial_no}>",
                        f"< {indic_case}{serial_no} >",
                        f"< {indic_case} {serial_no} >",
                        f"<{indic_case} {serial_no}]",
                        f"< {indic_case} {serial_no} ]",
                        f"[{indic_case}{serial_no}]",
                        f"[{indic_case} {serial_no}]",
                        f"[ {indic_case}{serial_no} ]",
                        f"[ {indic_case} {serial_no} ]",
                        f"{indic_case} {serial_no}",
                        f"{indic_case}{serial_no}",
                    ):
                        placeholder_entity_map[variant] = match

                # Substitute the entity with its canonical placeholder.
                text = text.replace(match, base_placeholder)
                serial_no += 1

        # Collapse whitespace and strip stray "/" after closing brackets.
        text = re.sub(r"\s+", " ", text).replace(">/", ">").replace("]/", "]")
        self._placeholder_entity_maps.put(placeholder_entity_map)
        return text

    # Internal Method: Normalize Text
    cdef str _normalize(self, str text) except *:
        """
        Translate native-script digits to ASCII in a single pass; during
        inference also wrap entities with placeholders.
        """
        text = text.translate(self._digits_translation_table)

        if self.inference:
            text = self._wrap_with_placeholders(text)
        return text

    # Internal Method: Indic Tokenize and Transliterate
    cdef str _do_indic_tokenize_and_transliterate(
        self,
        str sentence,
        object normalizer,
        str iso_lang,
        bint transliterate
    ) except *:
        """
        Normalize and tokenize an Indic sentence; when *transliterate* is set,
        also convert the script from *iso_lang* to Devanagari ('hi').
        """
        cdef str normalized = normalizer.normalize(sentence.strip())
        cdef str token_string = " ".join(
            indic_tokenize.trivial_tokenize(normalized, iso_lang)
        )
        if not transliterate:
            return token_string
        cdef str transliterated = self._xliterator.transliterate(
            token_string, iso_lang, "hi"
        )
        # Re-attach viramas split off by the tokenizer.
        return transliterated.replace(" ् ", "्")

    # Internal Method: Preprocess a Single Sentence
    cdef str _preprocess(
        self,
        str sent,
        str src_lang,
        str tgt_lang,
        object normalizer,
        bint is_target
    ) except *:
        """
        Preprocess one sentence: punctuation normalization, digit/placeholder
        normalization, tokenization (+ optional transliteration), and source
        sentences get the "<src> <tgt>" language-tag prefix.
        """
        cdef str iso_lang = self._flores_codes.get(src_lang, "hi")
        cdef str script_part = src_lang.split("_")[1]
        # Scripts that must NOT be transliterated to Devanagari.
        cdef bint do_transliterate = script_part not in ("Arab", "Aran", "Olck", "Mtei", "Latn")
        cdef str processed_sent

        # 1) Punctuation normalization
        sent = self._punc_norm(sent)

        # 2) Numerals & placeholders
        sent = self._normalize(sent)

        if iso_lang == "en":
            # English path: Moses normalize + tokenize
            processed_sent = " ".join(
                self._en_tok.tokenize(
                    self._en_normalizer.normalize(sent.strip()), escape=False
                )
            )
        else:
            # Indic path
            processed_sent = self._do_indic_tokenize_and_transliterate(
                sent, normalizer, iso_lang, do_transliterate
            )

        processed_sent = processed_sent.strip()
        if is_target:
            return processed_sent
        return f"{src_lang} {tgt_lang} {processed_sent}"

    # Internal Method: Postprocess a Single Sentence
    cdef str _postprocess(self, object sent, str lang) except *:
        """
        Postprocess one sentence:
          1) Pull the matching placeholder map from the queue
          2) Fix Perso-Arabic / Oriya script artifacts
          3) Restore placeholder entities
          4) Detokenize (back-transliterating for Indic targets)
        """
        cdef dict placeholder_entity_map
        cdef str lang_code
        cdef str script_code
        cdef str iso_lang
        cdef str placeholder
        cdef str original_span

        # Generation APIs may hand back a (text, score) tuple or list.
        if isinstance(sent, (tuple, list)):
            sent = sent[0]

        placeholder_entity_map = self._placeholder_entity_maps.get()
        lang_code, script_code = lang.split("_", 1)
        iso_lang = self._flores_codes.get(lang, "hi")

        # Fix for Perso-Arabic scripts
        if script_code in ["Arab", "Aran"]:
            sent = (
                sent.replace(" ؟", "؟")
                .replace(" ۔", "۔")
                .replace(" ،", "،")
                .replace("ٮ۪", "ؠ")
            )

        # Oriya fix
        if lang_code == "ory":
            sent = sent.replace("ଯ଼", "ୟ")

        # Restore placeholders
        for placeholder, original_span in placeholder_entity_map.items():
            sent = sent.replace(placeholder, original_span)

        # Detokenize
        if lang == "eng_Latn":
            return self._en_detok.detokenize(sent.split(" "))
        return indic_detokenize.trivial_detokenize(
            self._xliterator.transliterate(sent, "hi", iso_lang), iso_lang
        )

    # Exposed Method: Preprocess a Batch of Sentences
    cpdef list preprocess_batch(
        self,
        List[str] batch,
        str src_lang,
        str tgt_lang=None,
        bint is_target=False,
        bint visualize=False
    ):
        """
        Preprocess a batch of sentences (normalize, tokenize, transliterate).

        Exposed for external use. Set visualize=True for a tqdm progress bar.
        """
        cdef object normalizer = None
        cdef str iso_code = self._flores_codes.get(src_lang, "hi")
        cdef object iterator

        if src_lang != "eng_Latn":
            normalizer = IndicNormalizerFactory().get_normalizer(iso_code)

        if visualize:
            iterator = tqdm(batch, total=len(batch), desc=f" | > Pre-processing {src_lang}", unit="line")
        else:
            iterator = batch

        return [self._preprocess(s, src_lang, tgt_lang, normalizer, is_target) for s in iterator]

    # Exposed Method: Postprocess a Batch of Sentences
    cpdef list postprocess_batch(
        self,
        List[str] sents,
        str lang="hin_Deva",
        bint visualize=False
    ):
        """
        Postprocess a batch of sentences: restore placeholders, fix script
        issues, and detokenize. Exposed for external use.

        Note: drains any leftover placeholder maps afterwards, so call it
        exactly once per matching preprocess_batch call.
        """
        cdef object iterator
        cdef list results

        if visualize:
            iterator = tqdm(sents, total=len(sents), desc=f" | > Post-processing {lang}", unit="line")
        else:
            iterator = sents

        results = [self._postprocess(s, lang) for s in iterator]
        self._placeholder_entity_maps.queue.clear()

        return results
|
libs/IndicTransToolkit/IndicTransToolkit/version.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Package version string; regenerated from version.txt by setup.py.
__version__ = "1.0.3"
|
libs/IndicTransToolkit/IndicTransToolkit/version.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
1.0.3
|
libs/IndicTransToolkit/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) Varun Gumma.
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE
|
libs/IndicTransToolkit/README.md
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# IndicTransToolkit
|
2 |
+
|
3 |
+
## About
|
4 |
+
The goal of this repository is to provide a simple, modular, and extendable toolkit for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and be compatible with the HuggingFace models released. Please refer to the `CHANGELOG.md` for latest developments.
|
5 |
+
|
6 |
+
## Pre-requisites
|
7 |
+
- `Python 3.8+`
|
8 |
+
- [Indic NLP Library](https://github.com/VarunGumma/indic_nlp_library)
|
9 |
+
- Other requirements as listed in `requirements.txt`
|
10 |
+
|
11 |
+
## Configuration
|
12 |
+
- Editable installation (Note, this may take a while):
|
13 |
+
```bash
|
14 |
+
git clone https://github.com/VarunGumma/IndicTransToolkit
|
15 |
+
cd IndicTransToolkit
|
16 |
+
|
17 |
+
pip install --editable . --use-pep517 # required for pip >= 25.0
|
18 |
+
|
19 |
+
# in case it fails, try:
|
20 |
+
# pip install --editable . --use-pep517 --config-settings editable_mode=compat
|
21 |
+
```
|
22 |
+
|
23 |
+
## Examples
|
24 |
+
For the training use case, please refer [here](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface).
|
25 |
+
|
26 |
+
### PreTrainedTokenizer
|
27 |
+
```python
|
28 |
+
import torch
|
29 |
+
from IndicTransToolkit.processor import IndicProcessor # NOW IMPLEMENTED IN CYTHON !!
|
30 |
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
31 |
+
|
32 |
+
ip = IndicProcessor(inference=True)
|
33 |
+
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
|
34 |
+
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
|
35 |
+
|
36 |
+
sentences = [
|
37 |
+
"This is a test sentence.",
|
38 |
+
"This is another longer different test sentence.",
|
39 |
+
"Please send an SMS to 9876543210 and an email on [email protected] by 15th October, 2023.",
|
40 |
+
]
|
41 |
+
|
42 |
+
batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="hin_Deva", visualize=False) # set it to visualize=True to print a progress bar
|
43 |
+
batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")
|
44 |
+
|
45 |
+
with torch.inference_mode():
|
46 |
+
outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)
|
47 |
+
|
48 |
+
with tokenizer.as_target_tokenizer():
|
49 |
+
# This scoping is absolutely necessary, as it will instruct the tokenizer to tokenize using the target vocabulary.
|
50 |
+
# Failure to use this scoping will result in gibberish/unexpected predictions as the output will be de-tokenized with the source vocabulary instead.
|
51 |
+
outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
52 |
+
|
53 |
+
outputs = ip.postprocess_batch(outputs, lang="hin_Deva")
|
54 |
+
print(outputs)
|
55 |
+
|
56 |
+
>>> ['यह एक परीक्षण वाक्य है।', 'यह एक और लंबा अलग परीक्षण वाक्य है।', 'कृपया 9876543210 पर एक एस. एम. एस. भेजें और 15 अक्टूबर, 2023 तक [email protected] पर एक ईमेल भेजें।']
|
57 |
+
```
|
58 |
+
|
59 |
+
### Evaluation
|
60 |
+
- `IndicEvaluator` is a python implementation of [compute_metrics.sh](https://github.com/AI4Bharat/IndicTrans2/blob/main/compute_metrics.sh).
|
61 |
+
- We have found that this python implementation gives slightly lower scores than the original `compute_metrics.sh`. So, please use this function cautiously, and feel free to raise a PR if you have found the bug/fix.
|
62 |
+
```python
|
63 |
+
from IndicTransToolkit import IndicEvaluator
|
64 |
+
|
65 |
+
# this method returns a dictionary with BLEU and ChrF2++ scores with appropriate signatures
|
66 |
+
evaluator = IndicEvaluator()
|
67 |
+
scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=pred_file, refs=ref_file)
|
68 |
+
|
69 |
+
# alternatively, you can pass the list of predictions and references instead of files
|
70 |
+
# scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=preds, refs=refs)
|
71 |
+
```
|
72 |
+
|
73 |
+
## Authors
|
74 |
+
- Varun Gumma ([email protected])
|
75 |
+
- Jay Gala ([email protected])
|
76 |
+
- Pranjal Agadh Chitale ([email protected])
|
77 |
+
- Raj Dabre ([email protected])
|
78 |
+
|
79 |
+
|
80 |
+
## Bugs and Contribution
|
81 |
+
Since this is a bleeding-edge module, you may encounter broken stuff and import issues once in a while. In case you encounter any bugs or want additional functionalities, please feel free to raise `Issues`/`Pull Requests` or contact the authors.
|
82 |
+
|
83 |
+
|
84 |
+
## Citation
|
85 |
+
If you use our codebase, or models, please do cite the following paper:
|
86 |
+
```bibtex
|
87 |
+
@article{
|
88 |
+
gala2023indictrans,
|
89 |
+
title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
|
90 |
+
author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan},
|
91 |
+
journal={Transactions on Machine Learning Research},
|
92 |
+
issn={2835-8856},
|
93 |
+
year={2023},
|
94 |
+
url={https://openreview.net/forum?id=vfT4YuzAYA},
|
95 |
+
note={}
|
96 |
+
}
|
97 |
+
```
|
libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8d57d4239b3638a272e4b70292f10494ee4a0fee201a9d74c62fc35a3d263a45
|
3 |
+
size 260304
|
libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c1a101ecb27adaf367f00c90b3f8e96e7fbda3bf0560d48c368fec3750a040a4
|
3 |
+
size 229200
|
libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f9e82df38b208dc0a9b468ff669c9da159c7deaabcb389fcfacd43e038504fec
|
3 |
+
size 347184
|
libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d27c2cc00c97a89f97f7c28bc9175c5c403a0e2a372a0b39f1c5fe8609adda09
|
3 |
+
size 303696
|
libs/IndicTransToolkit/pyproject.toml
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[build-system]
|
2 |
+
requires = [
|
3 |
+
"setuptools>=68.2.2",
|
4 |
+
"wheel",
|
5 |
+
"Cython",
|
6 |
+
]
|
7 |
+
build-backend = "setuptools.build_meta"
|
8 |
+
|
9 |
+
[tool.black]
|
10 |
+
# Black configuration for code formatting
|
11 |
+
line-length = 88
|
12 |
+
target-version = ['py38']
|
13 |
+
exclude = '''
|
14 |
+
/(
|
15 |
+
\.git
|
16 |
+
| \.hg
|
17 |
+
| \.mypy_cache
|
18 |
+
| \.tox
|
19 |
+
| \.venv
|
20 |
+
| _build
|
21 |
+
| buck-out
|
22 |
+
| build
|
23 |
+
| dist
|
24 |
+
)/
|
25 |
+
'''
|
libs/IndicTransToolkit/requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
setuptools>=68.2.2
|
2 |
+
torch
|
3 |
+
cython
|
4 |
+
sacremoses
|
5 |
+
sentencepiece
|
6 |
+
transformers
|
7 |
+
sacrebleu
|
8 |
+
indic-nlp-library-IT2 @ git+https://github.com/VarunGumma/indic_nlp_library.git
|
libs/IndicTransToolkit/setup.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pathlib
|
3 |
+
from sys import version_info, exit
|
4 |
+
from setuptools import setup, find_packages
|
5 |
+
from Cython.Build import cythonize
|
6 |
+
from pkg_resources import parse_requirements
|
7 |
+
|
8 |
+
def write_version_py():
|
9 |
+
version_txt_path = os.path.join("IndicTransToolkit", "version.txt")
|
10 |
+
with open(version_txt_path, "r", encoding="utf-8") as f:
|
11 |
+
version = f.read().strip()
|
12 |
+
|
13 |
+
version_py_path = os.path.join("IndicTransToolkit", "version.py")
|
14 |
+
with open(version_py_path, "w", encoding="utf-8") as f:
|
15 |
+
f.write(f'__version__ = "{version}"\n')
|
16 |
+
return version
|
17 |
+
|
18 |
+
# Enforce Python >= 3.8
|
19 |
+
if version_info < (3, 8):
|
20 |
+
exit("Sorry, Python >= 3.8 is required for IndicTransToolkit.")
|
21 |
+
|
22 |
+
# Read long description from README
|
23 |
+
with open("README.md", "r", errors="ignore", encoding="utf-8") as fh:
|
24 |
+
long_description = fh.read().strip()
|
25 |
+
|
26 |
+
# Write version.py from version.txt
|
27 |
+
version = write_version_py()
|
28 |
+
|
29 |
+
# Parse requirements.txt
|
30 |
+
req_file = pathlib.Path("requirements.txt")
|
31 |
+
requirements = [str(req) for req in parse_requirements(req_file.open())]
|
32 |
+
|
33 |
+
# Cython files to compile (adjust if your .pyx name differs)
|
34 |
+
cython_extensions = cythonize(
|
35 |
+
[
|
36 |
+
"IndicTransToolkit/processor.pyx",
|
37 |
+
],
|
38 |
+
compiler_directives={"language_level": "3", "boundscheck": False},
|
39 |
+
)
|
40 |
+
|
41 |
+
setup(
|
42 |
+
name="IndicTransToolkit",
|
43 |
+
version=version,
|
44 |
+
author="Varun Gumma",
|
45 |
+
author_email="[email protected]",
|
46 |
+
description="A simple, consistent, and extendable module for IndicTrans2 tokenizer compatible with HuggingFace models",
|
47 |
+
long_description=long_description,
|
48 |
+
long_description_content_type="text/markdown",
|
49 |
+
url="https://github.com/VarunGumma/IndicTransToolkit",
|
50 |
+
packages=find_packages(), # Auto-detect packages
|
51 |
+
license="MIT",
|
52 |
+
classifiers=[
|
53 |
+
"Programming Language :: Python :: 3",
|
54 |
+
"License :: OSI Approved :: MIT License",
|
55 |
+
"Operating System :: OS Independent",
|
56 |
+
],
|
57 |
+
python_requires=">=3.8",
|
58 |
+
install_requires=requirements,
|
59 |
+
ext_modules=cython_extensions,
|
60 |
+
zip_safe=False,
|
61 |
+
)
|
libs/indic_nlp_library/.gitignore
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
**/__pycache__/
|
2 |
+
*.egg-info/
|
3 |
+
dist/
|
4 |
+
build/
|
5 |
+
contrib/
|
6 |
+
docs/
|
libs/indic_nlp_library/LICENSE
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2013-present Anoop Kunchukuttan
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
6 |
+
|
7 |
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
8 |
+
|
9 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
libs/indic_nlp_library/README.md
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Indic NLP Library
|
2 |
+
This repository is a _de-bloated_ fork of the original [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library) and integrates [UrduHack](https://github.com/urduhack/urduhack) submodule and [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) directly. This allows to work with Urdu normalization and tokenization without needing to install [urduhack](https://pypi.org/project/urduhack/) and `indic_nlp_resources` separately, which can be an issue sometimes as it is `TensorFlow` based. This repository is mainly created and mainted for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and [IndicTransTokenizer](https://github.com/VarunGumma/IndicTransTokenizer)
|
3 |
+
|
4 |
+
For any queries, please get in touch with the original authors/maintainers of the respective libraries:
|
5 |
+
|
6 |
+
- `Indic NLP Library`: [anoopkunchukuttan](https://github.com/anoopkunchukuttan)
|
7 |
+
- `Indic NLP Resources`: [anoopkunchukuttan](https://github.com/anoopkunchukuttan)
|
8 |
+
- `UrduHack`: [UrduHack](https://github.com/urduhack)
|
9 |
+
|
10 |
+
## Usage:
|
11 |
+
```
|
12 |
+
git clone https://github.com/VarunGumma/indic_nlp_library.git
|
13 |
+
|
14 |
+
cd indic_nlp_library
|
15 |
+
pip install --editable ./
|
16 |
+
```
|
17 |
+
|
18 |
+
## Updates:
|
19 |
+
- Integrated `urduhack` directly into the repository.
|
20 |
+
- Renamed `master` branch as `main`.
|
21 |
+
- Integrated `indic_nlp_resources` directly into the repository.
|
22 |
+
- _De-bloated_ the repository.
|
libs/indic_nlp_library/RESOURCES/script/all_script_phonetic_data.csv
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Unicode,Relative Offset,Devanagari,ITRANS,Notes,Valid Vector Representation,is_vowel,is_consonant,nukta,halanta,anusvara,misc,short_vowel,long_vowel,weak,medium,strong,independent_vowel,dependent_vowel,plosive,fricative,Central-approximant,Lateral-approximant,flap,velar,palatal,retroflex,dental,labial,aspirated,not_aspirated,voiced,unvoiced,nasal,not_nasal,front,central,back,close,close-mid,open-mid,open,rounded,not_rounded
|
2 |
+
900,0,ऀ,ऀ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
3 |
+
901,1,ँ,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
4 |
+
902,2,ं,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
5 |
+
903,3,ः,H,Should represent as pure aspiration and not as a vowel,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
6 |
+
904,4,ऄ,ऄ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
7 |
+
905,5,अ,a,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
|
8 |
+
906,6,आ,A,,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
|
9 |
+
907,7,इ,i,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
|
10 |
+
908,8,ई,I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
|
11 |
+
909,9,उ,u,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
|
12 |
+
90a,10,ऊ,uu,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
|
13 |
+
90b,11,ऋ,R^i,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1
|
14 |
+
90c,12,ऌ,LLi,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0
|
15 |
+
90d,13,ऍ,ऍ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1
|
16 |
+
90e,14,ऎ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
|
17 |
+
90f,15,ए,e,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
|
18 |
+
910,16,ऐ,ai,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1
|
19 |
+
911,17,ऑ,ऑ,Nasalized o,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
|
20 |
+
912,18,ऒ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
|
21 |
+
913,19,ओ,o,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
|
22 |
+
914,20,औ,au,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1
|
23 |
+
915,21,क,ka,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
|
24 |
+
916,22,ख,kha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
25 |
+
917,23,ग,ga,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
26 |
+
918,24,घ,gha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
|
27 |
+
919,25,ङ,~Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
28 |
+
91a,26,च,ca,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
|
29 |
+
91b,27,छ,Cha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
30 |
+
91c,28,ज,ja,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
31 |
+
91d,29,झ,jha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
|
32 |
+
91e,30,ञ,JNa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
33 |
+
91f,31,ट,Ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
|
34 |
+
920,32,ठ,Tha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
35 |
+
921,33,ड,Da,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
36 |
+
922,34,ढ,Dha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
|
37 |
+
923,35,ण,Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
38 |
+
924,36,त,ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
|
39 |
+
925,37,थ,tha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
40 |
+
926,38,द,da,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
41 |
+
927,39,ध,dha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
|
42 |
+
928,40,न,na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
43 |
+
929,41,ऩ,ऩ,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
44 |
+
92a,42,प,pa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
|
45 |
+
92b,43,फ,pha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
46 |
+
92c,44,ब,ba,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
47 |
+
92d,45,भ,bha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
|
48 |
+
92e,46,म,ma,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
49 |
+
92f,47,य,ya,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
50 |
+
930,48,र,ra,alveolar or dental- approximated by dental/ can also be considered a rhotic consonant (flap ie tap),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
51 |
+
931,49,ऱ,Ra,retroflex (trill),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
52 |
+
932,50,ल,la,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
53 |
+
933,51,ळ,La,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
54 |
+
934,52,ऴ,zha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
55 |
+
935,53,व,va,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
56 |
+
936,54,श,sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
57 |
+
937,55,ष,Sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
58 |
+
938,56,स,sa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
59 |
+
939,57,ह,ha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
|
60 |
+
93a,58,ऺ,ऺ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
61 |
+
93b,59,ऻ,ऻ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
62 |
+
93c,60,़,़,,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
63 |
+
93d,61,ऽ,.a,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
64 |
+
93e,62,ा,A,,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
|
65 |
+
93f,63,ि,i,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
|
66 |
+
940,64,ी,I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
|
67 |
+
941,65,ु,u,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
|
68 |
+
942,66,ू,uu,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
|
69 |
+
943,67,ृ,R^i,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1
|
70 |
+
944,68,ॄ,R^I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1
|
71 |
+
945,69,ॅ,ॅ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1
|
72 |
+
946,70,ॆ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
|
73 |
+
947,71,े,e,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
|
74 |
+
948,72,ै,ai,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1
|
75 |
+
949,73,ॉ,ॉ,Nasalized o,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
|
76 |
+
94a,74,ॊ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
|
77 |
+
94b,75,ो,o,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
|
78 |
+
94c,76,ौ,au,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1
|
79 |
+
94d,77,्,,,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
80 |
+
94e,78,ॎ,ॎ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
81 |
+
94f,79,ॏ,ॏ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
82 |
+
950,80,ॐ,AUM,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
83 |
+
951,81,॑,॑,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
84 |
+
952,82,॒,॒,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
85 |
+
953,83,॓,॓,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
86 |
+
954,84,॔,॔,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
87 |
+
955,85,ॕ,ॕ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
88 |
+
956,86,ॖ,ॖ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
89 |
+
957,87,ॗ,ॗ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
90 |
+
958,88,क़,क़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
91 |
+
959,89,ख़,ख़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
92 |
+
95a,90,ग़,ग़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
93 |
+
95b,91,ज़,ज़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
94 |
+
95c,92,ड़,ड़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
95 |
+
95d,93,ढ़,ढ़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
96 |
+
95e,94,फ़,फ़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
97 |
+
95f,95,य़,य़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
98 |
+
960,96,ॠ,R^I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1
|
99 |
+
961,97,ॡ,L^I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0
|
100 |
+
962,98,ॢ,LLi,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0
|
101 |
+
963,99,ॣ,L^I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0
|
102 |
+
964,100,।,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
103 |
+
965,101,॥,..,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
104 |
+
966,102,०,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
105 |
+
967,103,१,1,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
106 |
+
968,104,२,2,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
107 |
+
969,105,३,3,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
108 |
+
96a,106,४,4,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
109 |
+
96b,107,५,5,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
110 |
+
96c,108,६,6,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
111 |
+
96d,109,७,7,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
112 |
+
96e,110,८,8,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
113 |
+
96f,111,९,9,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
libs/indic_nlp_library/RESOURCES/script/english_arpabet_list.csv
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AO
|
2 |
+
AA
|
3 |
+
IY
|
4 |
+
UW
|
5 |
+
EH
|
6 |
+
IH
|
7 |
+
UH
|
8 |
+
AH
|
9 |
+
AX
|
10 |
+
AE
|
11 |
+
EY
|
12 |
+
AY
|
13 |
+
OW
|
14 |
+
AW
|
15 |
+
OY
|
16 |
+
P
|
17 |
+
B
|
18 |
+
T
|
19 |
+
D
|
20 |
+
K
|
21 |
+
G
|
22 |
+
CH
|
23 |
+
JH
|
24 |
+
F
|
25 |
+
V
|
26 |
+
TH
|
27 |
+
DH
|
28 |
+
S
|
29 |
+
Z
|
30 |
+
SH
|
31 |
+
ZH
|
32 |
+
HH
|
33 |
+
M
|
34 |
+
EM
|
35 |
+
N
|
36 |
+
EN
|
37 |
+
NG
|
38 |
+
ENG
|
39 |
+
L
|
40 |
+
EL
|
41 |
+
R
|
42 |
+
DX
|
43 |
+
NX
|
44 |
+
Y
|
45 |
+
W
|
46 |
+
Q
|
libs/indic_nlp_library/RESOURCES/script/english_script_phonetic_data.csv
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Unicode,Relative Offset,Devanagari,ITRANS,Notes,Valid Vector Representation,is_vowel,is_consonant,nukta,halanta,anusvara,misc,short_vowel,long_vowel,weak,medium,strong,independent_vowel,dependent_vowel,plosive,fricative,Central-approximant,Lateral-approximant,flap,velar,palatal,retroflex,dental,labial,aspirated,not_aspirated,voiced,unvoiced,nasal,not_nasal,front,central,back,close,close-mid,open-mid,open,rounded,not_rounded
|
2 |
+
900,0,,AO,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
|
3 |
+
901,1,,AA,,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
|
4 |
+
902,2,,IY,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
|
5 |
+
903,3,,UW,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
|
6 |
+
904,4,ए,EH,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
|
7 |
+
905,5,इ,IH,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
|
8 |
+
906,6,उ,UH,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
|
9 |
+
907,7,अ,AH,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
|
10 |
+
908,8,अ,AX,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
|
11 |
+
909,9,ऍ,AE,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1
|
12 |
+
90a,10,,EY,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
|
13 |
+
90b,11,ऐ,AY,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1
|
14 |
+
90c,12,ओ,OW,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
|
15 |
+
90d,13,औ,AW,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1
|
16 |
+
90e,14,,OY,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
|
17 |
+
90f,15,,P,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
|
18 |
+
910,16,,B,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
19 |
+
911,17,,T,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
|
20 |
+
912,18,,D,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
21 |
+
913,19,,K,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
|
22 |
+
914,20,,G,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
23 |
+
915,21,,CH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
|
24 |
+
916,22,,JH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
25 |
+
917,23,,F,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
|
26 |
+
918,24,,V,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
27 |
+
919,25,,TH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
|
28 |
+
91a,26,,DH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
29 |
+
91b,27,,S,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
30 |
+
91c,28,,Z,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
31 |
+
91d,29,,SH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
32 |
+
91e,30,,ZH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
|
33 |
+
91f,31,,HH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
34 |
+
920,32,,M,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
35 |
+
921,33,,EM,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
36 |
+
922,34,,N,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
37 |
+
923,35,,EN,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
38 |
+
924,36,,NG,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
39 |
+
925,37,,ENG,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
40 |
+
926,38,,L,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
41 |
+
927,39,,EL,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
42 |
+
928,40,,R,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
43 |
+
929,41,,DX,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
44 |
+
92a,42,,NX,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
45 |
+
92b,43,,Y,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
46 |
+
92c,44,,W,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
47 |
+
92d,45,,Q,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
libs/indic_nlp_library/RESOURCES/script/tamil_script_phonetic_data.csv
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Unicode,Relative Offset,Devanagari,ITRANS,Notes,Valid Vector Representation,is_vowel,is_consonant,nukta,halanta,anusvara,misc,short_vowel,long_vowel,weak,medium,strong,independent_vowel,dependent_vowel,plosive,fricative,Central-approximant,Lateral-approximant,flap,velar,palatal,retroflex,dental,labial,aspirated,not_aspirated,voiced,unvoiced,nasal,not_nasal,front,central,back,close,close-mid,open-mid,open,rounded,not_rounded
|
2 |
+
900,0,ऀ,ऀ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
3 |
+
901,1,ँ,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
4 |
+
902,2,ं,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
5 |
+
903,3,ः,H,Should represent as pure aspiration and not as a vowel,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
6 |
+
904,4,ऄ,ऄ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
7 |
+
905,5,अ,a,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
|
8 |
+
906,6,आ,A,,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
|
9 |
+
907,7,इ,i,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
|
10 |
+
908,8,ई,I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
|
11 |
+
909,9,उ,u,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
|
12 |
+
90a,10,ऊ,uu,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
|
13 |
+
90b,11,ऋ,R^i,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
14 |
+
90c,12,ऌ,LLi,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
15 |
+
90d,13,ऍ,ऍ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1
|
16 |
+
90e,14,ऎ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
|
17 |
+
90f,15,ए,e,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
|
18 |
+
910,16,ऐ,ai,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1
|
19 |
+
911,17,ऑ,ऑ,Nasalized o,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
|
20 |
+
912,18,ऒ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
|
21 |
+
913,19,ओ,o,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
|
22 |
+
914,20,औ,au,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1
|
23 |
+
915,21,क,ka,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0
|
24 |
+
916,22,ख,kha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
25 |
+
917,23,ग,ga,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
26 |
+
918,24,घ,gha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
27 |
+
919,25,ङ,~Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
28 |
+
91a,26,च,ca,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0
|
29 |
+
91b,27,छ,Cha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
30 |
+
91c,28,ज,ja,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
31 |
+
91d,29,झ,jha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
32 |
+
91e,30,ञ,JNa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
33 |
+
91f,31,ट,Ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0
|
34 |
+
920,32,ठ,Tha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
35 |
+
921,33,ड,Da,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
36 |
+
922,34,ढ,Dha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
37 |
+
923,35,ण,Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
38 |
+
924,36,त,ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0
|
39 |
+
925,37,थ,tha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
40 |
+
926,38,द,da,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
41 |
+
927,39,ध,dha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
42 |
+
928,40,न,na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
43 |
+
929,41,ऩ,ऩ,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
44 |
+
92a,42,प,pa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0
|
45 |
+
92b,43,फ,pha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
46 |
+
92c,44,ब,ba,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
47 |
+
92d,45,भ,bha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
48 |
+
92e,46,म,ma,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
|
49 |
+
92f,47,य,ya,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
50 |
+
930,48,र,ra,alveolar or dental- approximated by dental/ can also be considered a rhotic consonant (flap ie tap),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
51 |
+
931,49,ऱ,Ra,retroflex (trill),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
52 |
+
932,50,ल,la,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
53 |
+
933,51,ळ,La,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
54 |
+
934,52,ऴ,zha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
55 |
+
935,53,व,va,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
|
56 |
+
936,54,श,sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
57 |
+
937,55,ष,Sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
58 |
+
938,56,स,sa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
|
59 |
+
939,57,ह,ha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
|
60 |
+
93a,58,ऺ,ऺ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
61 |
+
93b,59,ऻ,ऻ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
62 |
+
93c,60,़,़,,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
63 |
+
93d,61,ऽ,.a,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
64 |
+
93e,62,ा,A,,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
|
65 |
+
93f,63,ि,i,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
|
66 |
+
940,64,ी,I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
|
67 |
+
941,65,ु,u,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
|
68 |
+
942,66,ू,uu,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
|
69 |
+
943,67,ृ,R^i,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
70 |
+
944,68,ॄ,R^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
71 |
+
945,69,ॅ,ॅ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1
|
72 |
+
946,70,ॆ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
|
73 |
+
947,71,े,e,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
|
74 |
+
948,72,ै,ai,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1
|
75 |
+
949,73,ॉ,ॉ,Nasalized o,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
|
76 |
+
94a,74,ॊ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
|
77 |
+
94b,75,ो,o,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
|
78 |
+
94c,76,ौ,au,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1
|
79 |
+
94d,77,्,,,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
80 |
+
94e,78,ॎ,ॎ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
81 |
+
94f,79,ॏ,ॏ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
82 |
+
950,80,ॐ,AUM,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
83 |
+
951,81,॑,॑,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
84 |
+
952,82,॒,॒,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
85 |
+
953,83,॓,॓,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
86 |
+
954,84,॔,॔,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
87 |
+
955,85,ॕ,ॕ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
88 |
+
956,86,ॖ,ॖ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
89 |
+
957,87,ॗ,ॗ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
90 |
+
958,88,क़,क़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
91 |
+
959,89,ख़,ख़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
92 |
+
95a,90,ग़,ग़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
93 |
+
95b,91,ज़,ज़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
94 |
+
95c,92,ड़,ड़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
95 |
+
95d,93,ढ़,ढ़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
96 |
+
95e,94,फ़,फ़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
97 |
+
95f,95,य़,य़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
98 |
+
960,96,ॠ,R^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
|
99 |
+
961,97,ॡ,L^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
|
100 |
+
962,98,ॢ,LLi,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
101 |
+
963,99,ॣ,L^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
102 |
+
964,100,।,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
103 |
+
965,101,॥,..,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
104 |
+
966,102,०,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
105 |
+
967,103,१,1,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
106 |
+
968,104,२,2,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
107 |
+
969,105,३,3,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
108 |
+
96a,106,४,4,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
109 |
+
96b,107,५,5,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
110 |
+
96c,108,६,6,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
111 |
+
96d,109,७,7,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
112 |
+
96e,110,८,8,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
113 |
+
96f,111,९,9,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
libs/indic_nlp_library/RESOURCES/transliterate/offset_itrans_map.csv
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
offset_hex,devnag_char,itrans
|
2 |
+
0x0,ऀ,ऀ
|
3 |
+
0x1,ँ,ँ
|
4 |
+
0x2,ं,.m
|
5 |
+
0x3,ः,H
|
6 |
+
0x4,ऄ,ऄ
|
7 |
+
0x5,अ,a
|
8 |
+
0x6,आ,aa
|
9 |
+
0x7,इ,i
|
10 |
+
0x8,ई,ii
|
11 |
+
0x9,उ,u
|
12 |
+
0xa,ऊ,uu
|
13 |
+
0xb,ऋ,R^i
|
14 |
+
0xc,ऌ,L^i
|
15 |
+
0xd,ऍ,ऍ
|
16 |
+
0xe,ऎ,.e
|
17 |
+
0xf,ए,e
|
18 |
+
0x10,ऐ,ai
|
19 |
+
0x11,ऑ,ऑ
|
20 |
+
0x12,ऒ,.o
|
21 |
+
0x13,ओ,o
|
22 |
+
0x14,औ,au
|
23 |
+
0x15,क,ka
|
24 |
+
0x16,ख,kha
|
25 |
+
0x17,ग,ga
|
26 |
+
0x18,घ,gha
|
27 |
+
0x19,ङ,~Na
|
28 |
+
0x1a,च,cha
|
29 |
+
0x1b,छ,Cha
|
30 |
+
0x1c,ज,ja
|
31 |
+
0x1d,झ,jha
|
32 |
+
0x1e,ञ,~na
|
33 |
+
0x1f,ट,Ta
|
34 |
+
0x20,ठ,Tha
|
35 |
+
0x21,ड,Da
|
36 |
+
0x22,ढ,Dha
|
37 |
+
0x23,ण,Na
|
38 |
+
0x24,त,ta
|
39 |
+
0x25,थ,tha
|
40 |
+
0x26,द,da
|
41 |
+
0x27,ध,dha
|
42 |
+
0x28,न,na
|
43 |
+
0x29,ऩ,*na
|
44 |
+
0x2a,प,pa
|
45 |
+
0x2b,फ,pha
|
46 |
+
0x2c,ब,ba
|
47 |
+
0x2d,भ,bha
|
48 |
+
0x2e,म,ma
|
49 |
+
0x2f,य,ya
|
50 |
+
0x30,र,ra
|
51 |
+
0x31,ऱ,Ra
|
52 |
+
0x32,ल,la
|
53 |
+
0x33,ळ,lda
|
54 |
+
0x34,ऴ,zha
|
55 |
+
0x35,व,va
|
56 |
+
0x36,श,sha
|
57 |
+
0x37,ष,Sha
|
58 |
+
0x38,स,sa
|
59 |
+
0x39,ह,ha
|
60 |
+
0x3a,ऺ,ऺ
|
61 |
+
0x3b,ऻ,ऻ
|
62 |
+
0x3c,़,़
|
63 |
+
0x3d,ऽ,.a
|
64 |
+
0x3e,ा,aa
|
65 |
+
0x3f,ि,i
|
66 |
+
0x40,ी,ii
|
67 |
+
0x41,ु,u
|
68 |
+
0x42,ू,uu
|
69 |
+
0x43,ृ,R^i
|
70 |
+
0x44,ॄ,R^I
|
71 |
+
0x45,ॅ,ॅ
|
72 |
+
0x46,ॆ,.e
|
73 |
+
0x47,े,e
|
74 |
+
0x48,ै,ai
|
75 |
+
0x49,ॉ,ॉ
|
76 |
+
0x4a,ॊ,.o
|
77 |
+
0x4b,ो,o
|
78 |
+
0x4c,ौ,au
|
79 |
+
0x4d,्,
|
80 |
+
0x4e,ॎ,ॎ
|
81 |
+
0x4f,ॏ,ॏ
|
82 |
+
0x50,ॐ,AUM
|
83 |
+
0x51,॑,॑
|
84 |
+
0x52,॒,॒
|
85 |
+
0x53,॓,॓
|
86 |
+
0x54,॔,॔
|
87 |
+
0x55,ॕ,ॕ
|
88 |
+
0x56,ॖ,ॖ
|
89 |
+
0x57,ॗ,ॗ
|
90 |
+
0x58,क़,क़
|
91 |
+
0x59,ख़,ख़
|
92 |
+
0x5a,ग़,ग़
|
93 |
+
0x5b,ज़,ज़
|
94 |
+
0x5c,ड़,ड़
|
95 |
+
0x5d,ढ़,ढ़
|
96 |
+
0x5e,फ़,फ़
|
97 |
+
0x5f,य़,य़
|
98 |
+
0x60,ॠ,R^I
|
99 |
+
0x61,ॡ,L^I
|
100 |
+
0x62,ॢ,L^i
|
101 |
+
0x63,ॣ,L^I
|
102 |
+
0x64,।,.
|
103 |
+
0x65,॥,..
|
104 |
+
0x66,०,0
|
105 |
+
0x67,१,1
|
106 |
+
0x68,२,2
|
107 |
+
0x69,३,3
|
108 |
+
0x6a,४,4
|
109 |
+
0x6b,५,5
|
110 |
+
0x6c,६,6
|
111 |
+
0x6d,७,7
|
112 |
+
0x6e,८,8
|
113 |
+
0x6f,९,9
|
114 |
+
0x70,॰,॰
|
115 |
+
0x71,ॱ,ॱ
|
116 |
+
0x72,ॲ,ॲ
|
117 |
+
0x73,ॳ,ॳ
|
118 |
+
0x74,ॴ,ॴ
|
119 |
+
0x75,ॵ,ॵ
|
120 |
+
0x76,ॶ,ॶ
|
121 |
+
0x77,ॷ,ॷ
|
122 |
+
0x78,ॸ,ॸ
|
123 |
+
0x79,ॹ,ॹ
|
124 |
+
0x7a,ॺ,ॺ
|
125 |
+
0x7b,ॻ,ॻ
|
126 |
+
0x7c,ॼ,ॼ
|
127 |
+
0x7d,ॽ,ॽ
|
128 |
+
0x7e,ॾ,ॾ
|
129 |
+
0x7f,ॿ,a
|
libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Metadata-Version: 2.2
|
2 |
+
Name: indic_nlp_library_IT2
|
3 |
+
Version: 0.0.2
|
4 |
+
Summary: The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. This fork is specialized for IndicTrans2.
|
5 |
+
Home-page: https://github.com/VarunGumma/indic_nlp_library
|
6 |
+
Author: Varun Gumma
|
7 |
+
Author-email: [email protected]
|
8 |
+
License: MIT
|
9 |
+
Classifier: Programming Language :: Python :: 3
|
10 |
+
Classifier: License :: OSI Approved :: MIT License
|
11 |
+
Classifier: Operating System :: OS Independent
|
12 |
+
Requires-Python: >=3.8
|
13 |
+
Description-Content-Type: text/markdown
|
14 |
+
License-File: LICENSE
|
15 |
+
Requires-Dist: sphinx-argparse
|
16 |
+
Requires-Dist: sphinx_rtd_theme
|
17 |
+
Requires-Dist: morfessor
|
18 |
+
Requires-Dist: pandas
|
19 |
+
Requires-Dist: numpy
|
20 |
+
Dynamic: author
|
21 |
+
Dynamic: author-email
|
22 |
+
Dynamic: classifier
|
23 |
+
Dynamic: description
|
24 |
+
Dynamic: description-content-type
|
25 |
+
Dynamic: home-page
|
26 |
+
Dynamic: license
|
27 |
+
Dynamic: requires-dist
|
28 |
+
Dynamic: requires-python
|
29 |
+
Dynamic: summary
|
30 |
+
|
31 |
+
# Indic NLP Library
|
32 |
+
This repository is a _de-bloated_ fork of the original [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library) and integrates the [UrduHack](https://github.com/urduhack/urduhack) submodule and [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) directly. This allows working with Urdu normalization and tokenization without needing to install [urduhack](https://pypi.org/project/urduhack/) and `indic_nlp_resources` separately, which can sometimes be an issue as the former is `TensorFlow` based. This repository is mainly created and maintained for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and [IndicTransTokenizer](https://github.com/VarunGumma/IndicTransTokenizer).
|
33 |
+
|
34 |
+
For any queries, please get in touch with the original authors/maintainers of the respective libraries:
|
35 |
+
|
36 |
+
- `Indic NLP Library`: [anoopkunchukuttan](https://github.com/anoopkunchukuttan)
|
37 |
+
- `Indic NLP Resources`: [anoopkunchukuttan](https://github.com/anoopkunchukuttan)
|
38 |
+
- `UrduHack`: [UrduHack](https://github.com/urduhack)
|
39 |
+
|
40 |
+
## Usage:
|
41 |
+
```
|
42 |
+
git clone https://github.com/VarunGumma/indic_nlp_library.git
|
43 |
+
|
44 |
+
cd indic_nlp_library
|
45 |
+
pip install --editable ./
|
46 |
+
```
|
47 |
+
|
48 |
+
## Updates:
|
49 |
+
- Integrated `urduhack` directly into the repository.
|
50 |
+
- Renamed the `master` branch to `main`.
|
51 |
+
- Integrated `indic_nlp_resources` directly into the repository.
|
52 |
+
- _De-bloated_ the repository.
|
libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
LICENSE
|
2 |
+
README.md
|
3 |
+
setup.py
|
4 |
+
indic_nlp_library_IT2.egg-info/PKG-INFO
|
5 |
+
indic_nlp_library_IT2.egg-info/SOURCES.txt
|
6 |
+
indic_nlp_library_IT2.egg-info/dependency_links.txt
|
7 |
+
indic_nlp_library_IT2.egg-info/requires.txt
|
8 |
+
indic_nlp_library_IT2.egg-info/top_level.txt
|
9 |
+
indicnlp/__init__.py
|
10 |
+
indicnlp/common.py
|
11 |
+
indicnlp/langinfo.py
|
12 |
+
indicnlp/loader.py
|
13 |
+
indicnlp/version.py
|
14 |
+
indicnlp/normalize/__init__.py
|
15 |
+
indicnlp/normalize/indic_normalize.py
|
16 |
+
indicnlp/script/__init__.py
|
17 |
+
indicnlp/script/english_script.py
|
18 |
+
indicnlp/script/indic_scripts.py
|
19 |
+
indicnlp/script/phonetic_sim.py
|
20 |
+
indicnlp/tokenize/__init__.py
|
21 |
+
indicnlp/tokenize/indic_detokenize.py
|
22 |
+
indicnlp/tokenize/indic_tokenize.py
|
23 |
+
indicnlp/tokenize/sentence_tokenize.py
|
24 |
+
indicnlp/transliterate/__init__.py
|
25 |
+
indicnlp/transliterate/acronym_transliterator.py
|
26 |
+
indicnlp/transliterate/script_unifier.py
|
27 |
+
indicnlp/transliterate/unicode_transliterate.py
|
28 |
+
indicnlp/urduhack/__init__.py
|
29 |
+
indicnlp/urduhack/stop_words.py
|
30 |
+
indicnlp/urduhack/urdu_characters.py
|
31 |
+
indicnlp/urduhack/normalization/__init__.py
|
32 |
+
indicnlp/urduhack/normalization/character.py
|
33 |
+
indicnlp/urduhack/normalization/regexes.py
|
34 |
+
indicnlp/urduhack/preprocessing/__init__.py
|
35 |
+
indicnlp/urduhack/preprocessing/character.py
|
36 |
+
indicnlp/urduhack/preprocessing/regexes.py
|
37 |
+
indicnlp/urduhack/preprocessing/util.py
|
38 |
+
indicnlp/urduhack/tokenization/__init__.py
|
39 |
+
indicnlp/urduhack/tokenization/eos.py
|
40 |
+
indicnlp/urduhack/tokenization/tokenizer.py
|
libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/requires.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
sphinx-argparse
|
2 |
+
sphinx_rtd_theme
|
3 |
+
morfessor
|
4 |
+
pandas
|
5 |
+
numpy
|
libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
indicnlp
|