Rajendransp133 commited on
Commit
ac901c7
·
verified ·
1 Parent(s): b1e54bd

Upload 86 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +7 -0
  2. .gitattributes +6 -0
  3. DockerFile +0 -0
  4. app.py +40 -0
  5. libs/IndicTransToolkit/.gitignore +4 -0
  6. libs/IndicTransToolkit/CHANGELOG.md +16 -0
  7. libs/IndicTransToolkit/IndicTransToolkit.egg-info/PKG-INFO +131 -0
  8. libs/IndicTransToolkit/IndicTransToolkit.egg-info/SOURCES.txt +15 -0
  9. libs/IndicTransToolkit/IndicTransToolkit.egg-info/dependency_links.txt +1 -0
  10. libs/IndicTransToolkit/IndicTransToolkit.egg-info/not-zip-safe +1 -0
  11. libs/IndicTransToolkit/IndicTransToolkit.egg-info/requires.txt +8 -0
  12. libs/IndicTransToolkit/IndicTransToolkit.egg-info/top_level.txt +1 -0
  13. libs/IndicTransToolkit/IndicTransToolkit/__init__.py +9 -0
  14. libs/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-310.pyc +0 -0
  15. libs/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-313.pyc +0 -0
  16. libs/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-310.pyc +0 -0
  17. libs/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-313.pyc +0 -0
  18. libs/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-310.pyc +0 -0
  19. libs/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-313.pyc +0 -0
  20. libs/IndicTransToolkit/IndicTransToolkit/__pycache__/processor.cpython-310.pyc +0 -0
  21. libs/IndicTransToolkit/IndicTransToolkit/collator.py +74 -0
  22. libs/IndicTransToolkit/IndicTransToolkit/evaluator.py +151 -0
  23. libs/IndicTransToolkit/IndicTransToolkit/processor.c +0 -0
  24. libs/IndicTransToolkit/IndicTransToolkit/processor.cp313-win_amd64.pyd +3 -0
  25. libs/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so +3 -0
  26. libs/IndicTransToolkit/IndicTransToolkit/processor.pyx +503 -0
  27. libs/IndicTransToolkit/IndicTransToolkit/version.py +1 -0
  28. libs/IndicTransToolkit/IndicTransToolkit/version.txt +1 -0
  29. libs/IndicTransToolkit/LICENSE +21 -0
  30. libs/IndicTransToolkit/README.md +97 -0
  31. libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so +3 -0
  32. libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so +3 -0
  33. libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o +3 -0
  34. libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o +3 -0
  35. libs/IndicTransToolkit/pyproject.toml +25 -0
  36. libs/IndicTransToolkit/requirements.txt +8 -0
  37. libs/IndicTransToolkit/setup.py +61 -0
  38. libs/indic_nlp_library/.gitignore +6 -0
  39. libs/indic_nlp_library/LICENSE +9 -0
  40. libs/indic_nlp_library/README.md +22 -0
  41. libs/indic_nlp_library/RESOURCES/script/all_script_phonetic_data.csv +113 -0
  42. libs/indic_nlp_library/RESOURCES/script/english_arpabet_list.csv +46 -0
  43. libs/indic_nlp_library/RESOURCES/script/english_script_phonetic_data.csv +47 -0
  44. libs/indic_nlp_library/RESOURCES/script/tamil_script_phonetic_data.csv +113 -0
  45. libs/indic_nlp_library/RESOURCES/transliterate/offset_itrans_map.csv +129 -0
  46. libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/PKG-INFO +52 -0
  47. libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/SOURCES.txt +40 -0
  48. libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/dependency_links.txt +1 -0
  49. libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/requires.txt +5 -0
  50. libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/top_level.txt +1 -0
.dockerignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .env
6
+ .venv/
7
+ .git/
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
37
+ libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
38
+ libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o filter=lfs diff=lfs merge=lfs -text
39
+ libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o filter=lfs diff=lfs merge=lfs -text
40
+ libs/IndicTransToolkit/IndicTransToolkit/processor.cp313-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
41
+ libs/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
DockerFile ADDED
File without changes
app.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os
import sys

# Make the vendored IndicTransToolkit importable without installing it.
sys.path.append(os.path.abspath("libs/IndicTransToolkit"))
from IndicTransToolkit.processor import IndicProcessor

app = FastAPI(title="IndicTrans Translator API")

# Single shared model/tokenizer, loaded once at startup.
MODEL_NAME = "ai4bharat/indictrans2-en-indic-dist-200M"
ip = IndicProcessor(inference=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, trust_remote_code=True)

class TranslationRequest(BaseModel):
    # English source text to translate.
    text: str
    # FLORES-200 target language code, e.g. "hin_Deva".
    target_lang: str

@app.post("/translate")
def translate_text(req: TranslationRequest):
    """Translate English text into the requested Indic language.

    Returns ``{"translation": <str>}``. Raises HTTP 400 on empty input
    and HTTP 500 on any unexpected model/tokenizer failure.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="Input text is empty.")

    try:
        batch = ip.preprocess_batch([req.text], src_lang="eng_Latn", tgt_lang=req.target_lang)
        batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")

        with torch.inference_mode():
            outputs = model.generate(**batch, num_beams=5, max_length=256)

        # Target-vocabulary scoping is required for correct de-tokenization;
        # without it the output is decoded with the source vocabulary.
        with tokenizer.as_target_tokenizer():
            decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        translated = ip.postprocess_batch(decoded, lang=req.target_lang)[0]
        return {"translation": translated}

    except HTTPException:
        # Don't wrap deliberate HTTP errors in a generic 500.
        raise
    except Exception as e:
        # Chain the cause so server logs keep the original traceback.
        raise HTTPException(status_code=500, detail=str(e)) from e
libs/IndicTransToolkit/.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ dist/
2
+ build/
3
+ *.egg-info/
4
+ */*/__pycache__/
libs/IndicTransToolkit/CHANGELOG.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ # 📢 Release v1.0.3
4
+ - 🚨 The `IndicProcessor` class has been re-written in [Cython](https://github.com/cython/cython) for faster implementation. This gives us at least `+10 lines/s`.
5
+ - A new `visualize` argument has been added to `preprocess_batch` to track the processing with a `tqdm` bar.
6
+
7
+ # 📢 Release v1.0.2
8
+ - The repository has been renamed to `IndicTransToolkit`.
9
+ - 🚨 The custom tokenizer is now **removed** from the repository. Please revert to a previous commit ([v1.0.1](https://github.com/VarunGumma/IndicTransToolkit/tree/0e68fb5872f4d821578a5252f90ad43c9649370f)) to use it **(strongly discouraged)**. The official _(and only tokenizer)_ is available on HF along with the models.
10
+
11
+ # 📢 Release v1.0.0
12
+ - The [PreTrainedTokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer) for IndicTrans2 is now available on HF 🎉🎉 Note that, you still need the `IndicProcessor` to pre-process the sentences before tokenization.
13
+ - 🚨 **In favor of the standard PreTrainedTokenizer, we deprecated the custom tokenizer. However, this custom tokenizer will still be available here for backward compatibility, but no further updates/bug-fixes will be provided.**
14
+ - The `indic_evaluate` function is now consolidated into a concrete `IndicEvaluator` class.
15
+ - The data collation function for training is consolidated into a concrete `IndicDataCollator` class.
16
+ - A simple batching method is now available in the `IndicProcessor`.
libs/IndicTransToolkit/IndicTransToolkit.egg-info/PKG-INFO ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: IndicTransToolkit
3
+ Version: 1.0.3
4
+ Summary: A simple, consistent, and extendable module for IndicTrans2 tokenizer compatible with HuggingFace models
5
+ Home-page: https://github.com/VarunGumma/IndicTransToolkit
6
+ Author: Varun Gumma
7
+ Author-email: [email protected]
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: setuptools>=68.2.2
16
+ Requires-Dist: torch
17
+ Requires-Dist: cython
18
+ Requires-Dist: sacremoses
19
+ Requires-Dist: sentencepiece
20
+ Requires-Dist: transformers
21
+ Requires-Dist: sacrebleu
22
+ Requires-Dist: indic-nlp-library-IT2@ git+https://github.com/VarunGumma/indic_nlp_library.git
23
+ Dynamic: author
24
+ Dynamic: author-email
25
+ Dynamic: classifier
26
+ Dynamic: description
27
+ Dynamic: description-content-type
28
+ Dynamic: home-page
29
+ Dynamic: license
30
+ Dynamic: license-file
31
+ Dynamic: requires-dist
32
+ Dynamic: requires-python
33
+ Dynamic: summary
34
+
35
+ # IndicTransToolkit
36
+
37
+ ## About
38
+ The goal of this repository is to provide a simple, modular, and extendable toolkit for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and be compatible with the HuggingFace models released. Please refer to the `CHANGELOG.md` for latest developments.
39
+
40
+ ## Pre-requisites
41
+ - `Python 3.8+`
42
+ - [Indic NLP Library](https://github.com/VarunGumma/indic_nlp_library)
43
+ - Other requirements as listed in `requirements.txt`
44
+
45
+ ## Configuration
46
+ - Editable installation (Note, this may take a while):
47
+ ```bash
48
+ git clone https://github.com/VarunGumma/IndicTransToolkit
49
+ cd IndicTransToolkit
50
+
51
+ pip install --editable . --use-pep517 # required for pip >= 25.0
52
+
53
+ # in case it fails, try:
54
+ # pip install --editable . --use-pep517 --config-settings editable_mode=compat
55
+ ```
56
+
57
+ ## Examples
58
+ For the training usecase, please refer [here](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface).
59
+
60
+ ### PreTrainedTokenizer
61
+ ```python
62
+ import torch
63
+ from IndicTransToolkit.processor import IndicProcessor # NOW IMPLEMENTED IN CYTHON !!
64
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
65
+
66
+ ip = IndicProcessor(inference=True)
67
+ tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
68
+ model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
69
+
70
+ sentences = [
71
+ "This is a test sentence.",
72
+ "This is another longer different test sentence.",
73
+ "Please send an SMS to 9876543210 and an email on [email protected] by 15th October, 2023.",
74
+ ]
75
+
76
+ batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="hin_Deva", visualize=False) # set it to visualize=True to print a progress bar
77
+ batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")
78
+
79
+ with torch.inference_mode():
80
+ outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)
81
+
82
+ with tokenizer.as_target_tokenizer():
83
+ # This scoping is absolutely necessary, as it will instruct the tokenizer to tokenize using the target vocabulary.
84
+ # Failure to use this scoping will result in gibberish/unexpected predictions as the output will be de-tokenized with the source vocabulary instead.
85
+ outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
86
+
87
+ outputs = ip.postprocess_batch(outputs, lang="hin_Deva")
88
+ print(outputs)
89
+
90
+ >>> ['यह एक परीक्षण वाक्य है।', 'यह एक और लंबा अलग परीक्षण वाक्य है।', 'कृपया 9876543210 पर एक एस. एम. एस. भेजें और 15 अक्टूबर, 2023 तक [email protected] पर एक ईमेल भेजें।']
91
+ ```
92
+
93
+ ### Evaluation
94
+ - `IndicEvaluator` is a python implementation of [compute_metrics.sh](https://github.com/AI4Bharat/IndicTrans2/blob/main/compute_metrics.sh).
95
+ - We have found that this python implementation gives slightly lower scores than the original `compute_metrics.sh`. So, please use this function cautiously, and feel free to raise a PR if you have found the bug/fix.
96
+ ```python
97
+ from IndicTransToolkit import IndicEvaluator
98
+
99
+ # this method returns a dictionary with BLEU and ChrF2++ scores with appropriate signatures
100
+ evaluator = IndicEvaluator()
101
+ scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=pred_file, refs=ref_file)
102
+
103
+ # alternatively, you can pass the list of predictions and references instead of files
104
+ # scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=preds, refs=refs)
105
+ ```
106
+
107
+ ## Authors
108
+ - Varun Gumma ([email protected])
109
+ - Jay Gala ([email protected])
110
+ - Pranjal Agadh Chitale ([email protected])
111
+ - Raj Dabre ([email protected])
112
+
113
+
114
+ ## Bugs and Contribution
115
+ Since this is a bleeding-edge module, you may encounter broken stuff and import issues once in a while. In case you encounter any bugs or want additional functionalities, please feel free to raise `Issues`/`Pull Requests` or contact the authors.
116
+
117
+
118
+ ## Citation
119
+ If you use our codebase, or models, please do cite the following paper:
120
+ ```bibtex
121
+ @article{
122
+ gala2023indictrans,
123
+ title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
124
+ author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan},
125
+ journal={Transactions on Machine Learning Research},
126
+ issn={2835-8856},
127
+ year={2023},
128
+ url={https://openreview.net/forum?id=vfT4YuzAYA},
129
+ note={}
130
+ }
131
+ ```
libs/IndicTransToolkit/IndicTransToolkit.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ IndicTransToolkit/__init__.py
6
+ IndicTransToolkit/collator.py
7
+ IndicTransToolkit/evaluator.py
8
+ IndicTransToolkit/processor.c
9
+ IndicTransToolkit/version.py
10
+ IndicTransToolkit.egg-info/PKG-INFO
11
+ IndicTransToolkit.egg-info/SOURCES.txt
12
+ IndicTransToolkit.egg-info/dependency_links.txt
13
+ IndicTransToolkit.egg-info/not-zip-safe
14
+ IndicTransToolkit.egg-info/requires.txt
15
+ IndicTransToolkit.egg-info/top_level.txt
libs/IndicTransToolkit/IndicTransToolkit.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
libs/IndicTransToolkit/IndicTransToolkit.egg-info/not-zip-safe ADDED
@@ -0,0 +1 @@
 
 
1
+
libs/IndicTransToolkit/IndicTransToolkit.egg-info/requires.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ setuptools>=68.2.2
2
+ torch
3
+ cython
4
+ sacremoses
5
+ sentencepiece
6
+ transformers
7
+ sacrebleu
8
+ indic-nlp-library-IT2@ git+https://github.com/VarunGumma/indic_nlp_library.git
libs/IndicTransToolkit/IndicTransToolkit.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ IndicTransToolkit
libs/IndicTransToolkit/IndicTransToolkit/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
"""Public API of the IndicTransToolkit package."""

from .collator import IndicDataCollator
from .evaluator import IndicEvaluator
from .processor import IndicProcessor

__all__ = ["IndicEvaluator", "IndicDataCollator", "IndicProcessor"]
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (337 Bytes). View file
 
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (378 Bytes). View file
 
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-310.pyc ADDED
Binary file (2.14 kB). View file
 
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/collator.cpython-313.pyc ADDED
Binary file (3.2 kB). View file
 
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-310.pyc ADDED
Binary file (4.15 kB). View file
 
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/evaluator.cpython-313.pyc ADDED
Binary file (6.37 kB). View file
 
libs/IndicTransToolkit/IndicTransToolkit/__pycache__/processor.cpython-310.pyc ADDED
Binary file (11.7 kB). View file
 
libs/IndicTransToolkit/IndicTransToolkit/collator.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
from dataclasses import dataclass
from typing import Any, Optional, Union

from transformers.utils import PaddingStrategy
from transformers.tokenization_utils import PreTrainedTokenizerBase
from transformers.data.data_collator import pad_without_fast_tokenizer_warning


@dataclass
class IndicDataCollator:
    """Seq2seq collator for IndicTrans2-style training batches.

    Labels are right-padded with ``label_pad_token_id`` (fairseq
    convention) before the tokenizer left-pads the model inputs, and
    ``decoder_input_ids`` are derived from the padded labels when the
    model supports it.
    """

    tokenizer: PreTrainedTokenizerBase
    model: Optional[Any] = None
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    return_tensors: str = "pt"

    def __call__(self, features, return_tensors=None):
        tensors = self.return_tensors if return_tensors is None else return_tensors

        labels = None
        if "labels" in features[0].keys():
            labels = [feature["labels"] for feature in features]

        # `tokenizer.pad` will not pad labels and requires them to be of
        # equal length to build tensors, so equalize their lengths here.
        if labels is not None:
            target_len = max(len(lab) for lab in labels)
            if self.pad_to_multiple_of is not None:
                multiple = self.pad_to_multiple_of
                target_len = ((target_len + multiple - 1) // multiple) * multiple

            # fairseq by default right-pads the labels for seq2seq tasks
            for feature in features:
                pad = [self.label_pad_token_id] * (target_len - len(feature["labels"]))
                if isinstance(feature["labels"], list):
                    feature["labels"] = feature["labels"] + pad
                else:
                    feature["labels"] = np.concatenate(
                        [feature["labels"], pad]
                    ).astype(np.int64)

        self.tokenizer.padding_side = "left"
        features = pad_without_fast_tokenizer_warning(
            self.tokenizer,
            features,
            padding=self.padding,
            max_length=self.max_length,
            return_tensors=tensors,
            pad_to_multiple_of=self.pad_to_multiple_of,
        )

        # Derive decoder_input_ids from the (now padded) labels if possible.
        can_prepare = (
            labels is not None
            and self.model is not None
            and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
        )
        if can_prepare:
            features["decoder_input_ids"] = self.model.prepare_decoder_input_ids_from_labels(
                labels=features["labels"]
            )

        return features
libs/IndicTransToolkit/IndicTransToolkit/evaluator.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Union
from sacrebleu.metrics import CHRF, BLEU

from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory


class IndicEvaluator:
    """Compute BLEU and chrF2++ for IndicTrans2 outputs.

    Python re-implementation of IndicTrans2's ``compute_metrics.sh``:
    non-English targets are normalized and trivially tokenized with
    IndicNLP before scoring with untokenized BLEU; English targets use
    sacrebleu's standard ``13a`` tokenization.
    """

    def __init__(self):
        # === Metrics ===
        self._chrf2_metric = CHRF(word_order=2)
        self._bleu_metric_13a = BLEU(tokenize="13a")
        self._bleu_metric_none = BLEU(tokenize="none")

        # === Normalizer factory and cache ===
        self._indic_norm_factory = IndicNormalizerFactory()
        self._normalizer_cache = {}  # Cache normalizers by iso_lang

        # === FLORES -> ISO codes ===
        self._flores_codes = {
            "asm_Beng": "as",
            "awa_Deva": "hi",
            "ben_Beng": "bn",
            "bho_Deva": "hi",
            "brx_Deva": "hi",
            "doi_Deva": "hi",
            "eng_Latn": "en",
            "gom_Deva": "kK",
            "gon_Deva": "hi",
            "guj_Gujr": "gu",
            "hin_Deva": "hi",
            "hne_Deva": "hi",
            "kan_Knda": "kn",
            "kas_Arab": "ur",
            "kas_Deva": "hi",
            "kha_Latn": "en",
            "lus_Latn": "en",
            "mag_Deva": "hi",
            "mai_Deva": "hi",
            "mal_Mlym": "ml",
            "mar_Deva": "mr",
            "mni_Beng": "bn",
            "mni_Mtei": "hi",
            "npi_Deva": "ne",
            "ory_Orya": "or",
            "pan_Guru": "pa",
            "san_Deva": "hi",
            "sat_Olck": "or",
            "snd_Arab": "ur",
            "snd_Deva": "hi",
            "tam_Taml": "ta",
            "tel_Telu": "te",
            "urd_Arab": "ur",
            "unr_Deva": "hi",
        }

    def _get_normalizer(self, iso_lang: str):
        """
        Return a cached normalizer for a given iso_lang.
        """
        if iso_lang not in self._normalizer_cache:
            self._normalizer_cache[iso_lang] = self._indic_norm_factory.get_normalizer(iso_lang)
        return self._normalizer_cache[iso_lang]

    def _preprocess(self, sentences: List[str], lang: str) -> List[str]:
        """
        Preprocess the sentences using IndicNLP:
        1) Normalization (using a cached normalizer),
        2) Trivial tokenization.
        """
        iso_lang = self._flores_codes.get(lang, "hi")
        normalizer = self._get_normalizer(iso_lang)

        # Local references for speed in the loop below
        trivial_tokenize = indic_tokenize.trivial_tokenize
        normalize_fn = normalizer.normalize

        processed_sentences = []
        for line in sentences:
            tokens = trivial_tokenize(normalize_fn(line.strip()), iso_lang)
            processed_sentences.append(" ".join(tokens))

        return processed_sentences

    @staticmethod
    def _build_scores(bleu_metric, chrf_metric, preds: List[str], refs: List[str]) -> dict:
        """Run both metrics once and package rounded scores with signatures."""
        bleu_score = bleu_metric.corpus_score(preds, [refs])
        chrf_score = chrf_metric.corpus_score(preds, [refs])
        return {
            "bleu": {
                "score": round(bleu_score.score, 1),
                "signature": bleu_metric.get_signature().format(),
            },
            "chrF2++": {
                "score": round(chrf_score.score, 1),
                "signature": chrf_metric.get_signature().format(),
            },
        }

    def evaluate(
        self,
        tgt_lang: str,
        preds: Union[List[str], str],
        refs: Union[List[str], str],
    ):
        """
        Evaluate BLEU and chrF2++ scores for the given predictions and references.
        - If preds/refs are strings (filenames), read them from disk.
        - If they are lists, evaluate them directly.
        - For non-English languages, applies Indic NLP preprocessing before scoring.
        """
        assert preds is not None and refs is not None, "Predictions and References cannot be None"

        # Convert file paths to lists if needed
        if isinstance(preds, str):
            with open(preds, "r", encoding="utf-8") as fp:
                preds = [line.strip() for line in fp]
        if isinstance(refs, str):
            with open(refs, "r", encoding="utf-8") as fr:
                refs = [line.strip() for line in fr]

        assert len(preds) == len(refs), "Number of predictions and references do not match"

        if tgt_lang != "eng_Latn":
            # Indic targets: IndicNLP preprocessing + untokenized BLEU
            preds_ = self._preprocess(preds, tgt_lang)
            refs_ = self._preprocess(refs, tgt_lang)
            return self._build_scores(self._bleu_metric_none, self._chrf2_metric, preds_, refs_)

        # English: 13a tokenization is standard
        return self._build_scores(self._bleu_metric_13a, self._chrf2_metric, preds, refs)
libs/IndicTransToolkit/IndicTransToolkit/processor.c ADDED
The diff for this file is too large to render. See raw diff
 
libs/IndicTransToolkit/IndicTransToolkit/processor.cp313-win_amd64.pyd ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6171c90f22a4602e67c36a791a618ccbf0d3703f17ea0c214186cb5fe3030487
3
+ size 139776
libs/IndicTransToolkit/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1a101ecb27adaf367f00c90b3f8e96e7fbda3bf0560d48c368fec3750a040a4
3
+ size 229200
libs/IndicTransToolkit/IndicTransToolkit/processor.pyx ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cython: language_level=3, boundscheck=False, cdivision=True, wraparound=False
2
+ """
3
+ Cython version of the IndicProcessor class with optimizations for performance.
4
+ Only preprocess_batch and postprocess_batch are exposed as cpdef methods.
5
+ All other methods are internal (cdef) for optimized Cython usage.
6
+ """
7
+
8
+ import regex as re
9
+ from tqdm import tqdm
10
+ from queue import Queue
11
+ from typing import List, Dict, Union
12
+
13
+ # Importing Python objects since these libraries don't offer C-extensions
14
+ from indicnlp.tokenize import indic_tokenize, indic_detokenize
15
+ from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
16
+ from sacremoses import MosesPunctNormalizer, MosesTokenizer, MosesDetokenizer
17
+ from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
18
+
19
+
20
+ cdef class IndicProcessor:
21
+ cdef public bint inference
22
+
23
+ # Precompiled regex patterns and placeholders
24
+ cdef object _MULTISPACE_REGEX
25
+ cdef object _DIGIT_SPACE_PERCENT
26
+ cdef object _DOUBLE_QUOT_PUNC
27
+ cdef object _DIGIT_NBSP_DIGIT
28
+ cdef object _END_BRACKET_SPACE_PUNC_REGEX
29
+
30
+ cdef object _URL_PATTERN
31
+ cdef object _NUMERAL_PATTERN
32
+ cdef object _EMAIL_PATTERN
33
+ cdef object _OTHER_PATTERN
34
+
35
+ cdef list _PUNC_REPLACEMENTS
36
+ cdef list _INDIC_FAILURE_CASES
37
+
38
+ cdef dict _flores_codes
39
+ cdef dict _digits_translation_table
40
+
41
+ # Placeholder maps stored in a Python Queue (treated as `object` for Cython)
42
+ cdef object _placeholder_entity_maps
43
+
44
+ # Tools (also Python objects)
45
+ cdef object _en_tok
46
+ cdef object _en_normalizer
47
+ cdef object _en_detok
48
+ cdef object _xliterator
49
+
50
+ def __cinit__(self, bint inference=True):
51
+ """
52
+ Constructor for IndicProcessor. Initializes all necessary components.
53
+ """
54
+ self.inference = inference
55
+
56
+ ##############################
57
+ # FLORES -> ISO CODES
58
+ ##############################
59
+ self._flores_codes = {
60
+ "asm_Beng": "as",
61
+ "awa_Deva": "hi",
62
+ "ben_Beng": "bn",
63
+ "bho_Deva": "hi",
64
+ "brx_Deva": "hi",
65
+ "doi_Deva": "hi",
66
+ "eng_Latn": "en",
67
+ "gom_Deva": "kK",
68
+ "gon_Deva": "hi",
69
+ "guj_Gujr": "gu",
70
+ "hin_Deva": "hi",
71
+ "hne_Deva": "hi",
72
+ "kan_Knda": "kn",
73
+ "kas_Arab": "ur",
74
+ "kas_Deva": "hi",
75
+ "kha_Latn": "en",
76
+ "lus_Latn": "en",
77
+ "mag_Deva": "hi",
78
+ "mai_Deva": "hi",
79
+ "mal_Mlym": "ml",
80
+ "mar_Deva": "mr",
81
+ "mni_Beng": "bn",
82
+ "mni_Mtei": "hi",
83
+ "npi_Deva": "ne",
84
+ "ory_Orya": "or",
85
+ "pan_Guru": "pa",
86
+ "san_Deva": "hi",
87
+ "sat_Olck": "or",
88
+ "snd_Arab": "ur",
89
+ "snd_Deva": "hi",
90
+ "tam_Taml": "ta",
91
+ "tel_Telu": "te",
92
+ "urd_Arab": "ur",
93
+ "unr_Deva": "hi",
94
+ }
95
+
96
+ ##############################
97
+ # INDIC DIGIT TRANSLATION (str.translate)
98
+ ##############################
99
+ self._digits_translation_table = {}
100
+ cdef dict digits_dict = {
101
+ "\u09e6": "0", "\u0ae6": "0", "\u0ce6": "0", "\u0966": "0",
102
+ "\u0660": "0", "\uabf0": "0", "\u0b66": "0", "\u0a66": "0",
103
+ "\u1c50": "0", "\u06f0": "0",
104
+
105
+ "\u09e7": "1", "\u0ae7": "1", "\u0967": "1", "\u0ce7": "1",
106
+ "\u06f1": "1", "\uabf1": "1", "\u0b67": "1", "\u0a67": "1",
107
+ "\u1c51": "1", "\u0c67": "1",
108
+
109
+ "\u09e8": "2", "\u0ae8": "2", "\u0968": "2", "\u0ce8": "2",
110
+ "\u06f2": "2", "\uabf2": "2", "\u0b68": "2", "\u0a68": "2",
111
+ "\u1c52": "2", "\u0c68": "2",
112
+
113
+ "\u09e9": "3", "\u0ae9": "3", "\u0969": "3", "\u0ce9": "3",
114
+ "\u06f3": "3", "\uabf3": "3", "\u0b69": "3", "\u0a69": "3",
115
+ "\u1c53": "3", "\u0c69": "3",
116
+
117
+ "\u09ea": "4", "\u0aea": "4", "\u096a": "4", "\u0cea": "4",
118
+ "\u06f4": "4", "\uabf4": "4", "\u0b6a": "4", "\u0a6a": "4",
119
+ "\u1c54": "4", "\u0c6a": "4",
120
+
121
+ "\u09eb": "5", "\u0aeb": "5", "\u096b": "5", "\u0ceb": "5",
122
+ "\u06f5": "5", "\uabf5": "5", "\u0b6b": "5", "\u0a6b": "5",
123
+ "\u1c55": "5", "\u0c6b": "5",
124
+
125
+ "\u09ec": "6", "\u0aec": "6", "\u096c": "6", "\u0cec": "6",
126
+ "\u06f6": "6", "\uabf6": "6", "\u0b6c": "6", "\u0a6c": "6",
127
+ "\u1c56": "6", "\u0c6c": "6",
128
+
129
+ "\u09ed": "7", "\u0aed": "7", "\u096d": "7", "\u0ced": "7",
130
+ "\u06f7": "7", "\uabf7": "7", "\u0b6d": "7", "\u0a6d": "7",
131
+ "\u1c57": "7", "\u0c6d": "7",
132
+
133
+ "\u09ee": "8", "\u0aee": "8", "\u096e": "8", "\u0cee": "8",
134
+ "\u06f8": "8", "\uabf8": "8", "\u0b6e": "8", "\u0a6e": "8",
135
+ "\u1c58": "8", "\u0c6e": "8",
136
+
137
+ "\u09ef": "9", "\u0aef": "9", "\u096f": "9", "\u0cef": "9",
138
+ "\u06f9": "9", "\uabf9": "9", "\u0b6f": "9", "\u0a6f": "9",
139
+ "\u1c59": "9", "\u0c6f": "9",
140
+ }
141
+ for k, v in digits_dict.items():
142
+ self._digits_translation_table[ord(k)] = v
143
+
144
+ # Also map ASCII '0'-'9'
145
+ for c in range(ord('0'), ord('9') + 1):
146
+ self._digits_translation_table[c] = chr(c)
147
+
148
+ ##############################
149
+ # PLACEHOLDER MAP QUEUE
150
+ ##############################
151
+ self._placeholder_entity_maps = Queue()
152
+
153
+ ##############################
154
+ # MOSES (as Python objects)
155
+ ##############################
156
+ self._en_tok = MosesTokenizer(lang="en")
157
+ self._en_normalizer = MosesPunctNormalizer()
158
+ self._en_detok = MosesDetokenizer(lang="en")
159
+
160
+ ##############################
161
+ # TRANSLITERATOR (Python object)
162
+ ##############################
163
+ self._xliterator = UnicodeIndicTransliterator()
164
+
165
+ ##############################
166
+ # Precompiled Patterns
167
+ ##############################
168
+ self._MULTISPACE_REGEX = re.compile(r"[ ]{2,}")
169
+ self._DIGIT_SPACE_PERCENT = re.compile(r"(\d) %")
170
+ self._DOUBLE_QUOT_PUNC = re.compile(r"\"([,\.]+)")
171
+ self._DIGIT_NBSP_DIGIT = re.compile(r"(\d) (\d)")
172
+ self._END_BRACKET_SPACE_PUNC_REGEX = re.compile(r"\) ([\.!:?;,])")
173
+
174
+ self._URL_PATTERN = re.compile(
175
+ r"\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b"
176
+ )
177
+ self._NUMERAL_PATTERN = re.compile(
178
+ r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"
179
+ )
180
+ self._EMAIL_PATTERN = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}")
181
+ self._OTHER_PATTERN = re.compile(r"[A-Za-z0-9]*[#|@]\w+")
182
+
183
+ # Combined punctuation replacements
184
+ self._PUNC_REPLACEMENTS = [
185
+ (re.compile(r"\r"), ""),
186
+ (re.compile(r"\(\s*"), "("),
187
+ (re.compile(r"\s*\)"), ")"),
188
+ (re.compile(r"\s:\s?"), ":"),
189
+ (re.compile(r"\s;\s?"), ";"),
190
+ (re.compile(r"[`´‘‚’]"), "'"),
191
+ (re.compile(r"[„“”«»]"), '"'),
192
+ (re.compile(r"[–—]"), "-"),
193
+ (re.compile(r"\.\.\."), "..."),
194
+ (re.compile(r" %"), "%"),
195
+ (re.compile(r"nº "), "nº "),
196
+ (re.compile(r" ºC"), " ºC"),
197
+ (re.compile(r" [?!;]"), lambda m: m.group(0).strip()),
198
+ (re.compile(r", "), ", "),
199
+ ]
200
+
201
+ self._INDIC_FAILURE_CASES = [
202
+ "آی ڈی ",
203
+ "ꯑꯥꯏꯗꯤ",
204
+ "आईडी",
205
+ "आई . डी . ",
206
+ "आई . डी .",
207
+ "आई. डी. ",
208
+ "आई. डी.",
209
+ "आय. डी. ",
210
+ "आय. डी.",
211
+ "आय . डी . ",
212
+ "आय . डी .",
213
+ "ऐटि",
214
+ "آئی ڈی ",
215
+ "ᱟᱭᱰᱤ ᱾",
216
+ "आयडी",
217
+ "ऐडि",
218
+ "आइडि",
219
+ "ᱟᱭᱰᱤ",
220
+ ]
221
+
222
# Internal Method: Apply punctuation replacements
cdef str _apply_punc_replacements(self, str text, list replacements) except *:
    """
    Run every (compiled_pattern, replacement) pair over `text`, in order.

    The pairs are applied sequentially, so later patterns operate on the
    output of earlier ones. Returns the fully rewritten string.
    """
    cdef object pattern
    cdef object repl
    for pattern, repl in replacements:
        text = pattern.sub(repl, text)
    return text
234
# Internal Method: Punctuation Normalization
cdef str _punc_norm(self, str text) except *:
    """
    Normalize punctuation in a small number of regex passes.

    Pass 1 applies the combined replacement table; pass 2 applies a few
    targeted spacing fix-ups; the result is stripped of outer whitespace.
    """
    cdef object pattern
    cdef str repl

    # Pass 1: table-driven replacements (quotes, dashes, spacing, ...)
    text = self._apply_punc_replacements(text, self._PUNC_REPLACEMENTS)

    # Pass 2: targeted spacing/format fix-ups (order matters: the
    # multi-space collapse must run before the punctuation joins)
    for pattern, repl in (
        (self._MULTISPACE_REGEX, " "),
        (self._END_BRACKET_SPACE_PUNC_REGEX, r")\1"),
        (self._DIGIT_SPACE_PERCENT, r"\1%"),
        (self._DOUBLE_QUOT_PUNC, r'\1"'),
        (self._DIGIT_NBSP_DIGIT, r"\1.\2"),
    ):
        text = pattern.sub(repl, text)
    return text.strip()
250
# Internal Method: Wrap Text with Placeholders
cdef str _wrap_with_placeholders(self, str text) except *:
    """
    Replace e-mails, URLs, numerals and #/@ handles with <IDn> placeholders.

    Every placeholder spelling the model might emit (spacing/bracket
    variants, plus known Indic-script corruptions of the literal "ID") is
    recorded in a map so the original entity can be restored during
    postprocessing. The map is pushed onto the internal queue, from which
    `_postprocess` retrieves it.
    """
    cdef int next_id = 1
    cdef dict entity_map = {}
    cdef list patterns = [
        self._EMAIL_PATTERN,
        self._URL_PATTERN,
        self._NUMERAL_PATTERN,
        self._OTHER_PATTERN,
    ]
    cdef object pat
    cdef str hit
    cdef str tag
    cdef str indic_case

    for pat in patterns:
        # De-duplicate matches before substitution.
        for hit in set(pat.findall(text)):
            # Skip trivially short URLs / numerals (likely false positives).
            if pat is self._URL_PATTERN and len(hit.replace(".", "")) < 4:
                continue
            if pat is self._NUMERAL_PATTERN and len(
                hit.replace(" ", "").replace(".", "").replace(":", "")
            ) < 4:
                continue

            tag = f"<ID{next_id}>"
            # Register every spelling variant the decoder may produce.
            for variant in (
                f"<ID{next_id}>",
                f"< ID{next_id} >",
                f"[ID{next_id}]",
                f"[ ID{next_id} ]",
                f"[ID {next_id}]",
                f"<ID{next_id}]",
                f"< ID{next_id}]",
                f"<ID{next_id} ]",
            ):
                entity_map[variant] = hit

            # Known Indic-script corruptions of the literal "ID".
            for indic_case in self._INDIC_FAILURE_CASES:
                for variant in (
                    f"<{indic_case}{next_id}>",
                    f"< {indic_case}{next_id} >",
                    f"< {indic_case} {next_id} >",
                    f"<{indic_case} {next_id}]",
                    f"< {indic_case} {next_id} ]",
                    f"[{indic_case}{next_id}]",
                    f"[{indic_case} {next_id}]",
                    f"[ {indic_case}{next_id} ]",
                    f"[ {indic_case} {next_id} ]",
                    f"{indic_case} {next_id}",
                    f"{indic_case}{next_id}",
                ):
                    entity_map[variant] = hit

            # Replace every occurrence of the entity with its base tag.
            text = text.replace(hit, tag)
            next_id += 1

    # Collapse whitespace and strip stray placeholder artifacts.
    text = re.sub(r"\s+", " ", text).replace(">/", ">").replace("]/", "]")
    self._placeholder_entity_maps.put(entity_map)
    return text
316
# Internal Method: Normalize Text
cdef str _normalize(self, str text) except *:
    """
    Map the supported Indic/Arabic digit glyphs to ASCII digits in a single
    `str.translate` pass; during inference, additionally protect entities
    (URLs, e-mails, numerals, handles) behind placeholders.
    """
    text = text.translate(self._digits_translation_table)
    return self._wrap_with_placeholders(text) if self.inference else text
328
# Internal Method: Indic Tokenize and Transliterate
cdef str _do_indic_tokenize_and_transliterate(
    self,
    str sentence,
    object normalizer,
    str iso_lang,
    bint transliterate
) except *:
    """
    Normalize and tokenize `sentence` for `iso_lang`; when `transliterate`
    is set, convert the result from `iso_lang` script into Devanagari ('hi').
    """
    cdef str tokenized
    cdef str converted

    tokenized = " ".join(
        indic_tokenize.trivial_tokenize(
            normalizer.normalize(sentence.strip()), iso_lang
        )
    )
    if not transliterate:
        return tokenized

    converted = self._xliterator.transliterate(tokenized, iso_lang, "hi")
    # Re-attach the virama that tokenization may have split off with spaces.
    return converted.replace(" ् ", "्")
353
# Internal Method: Preprocess a Single Sentence
cdef str _preprocess(
    self,
    str sent,
    str src_lang,
    str tgt_lang,
    object normalizer,
    bint is_target
) except *:
    """
    Preprocess a single sentence: punctuation normalization, numeral
    normalization, tokenization, optional transliteration, and prefixing
    the "src_lang tgt_lang" tag pair for source-side sentences.

    Fix: a malformed `src_lang` without an underscore (expected form
    "xxx_Script", e.g. "hin_Deva") previously raised IndexError on
    `split("_")[1]`; the script part now falls back to "" (transliteration
    stays enabled), which is backward-compatible for all valid tags.
    """
    cdef str iso_lang = self._flores_codes.get(src_lang, "hi")
    cdef list lang_parts = src_lang.split("_")
    # Expected "lang_Script"; treat the script as empty if the tag is malformed.
    cdef str script_part = lang_parts[1] if len(lang_parts) > 1 else ""
    cdef bint do_transliterate = True
    cdef str e_strip
    cdef str e_norm
    cdef list e_tokens
    cdef str processed_sent

    # 1) Punctuation normalization
    sent = self._punc_norm(sent)

    # 2) Numerals & placeholders
    sent = self._normalize(sent)

    # Scripts that must not be transliterated into Devanagari.
    if script_part in ["Arab", "Aran", "Olck", "Mtei", "Latn"]:
        do_transliterate = False

    if iso_lang == "en":
        # English path: Moses normalization + tokenization.
        e_strip = sent.strip()
        e_norm = self._en_normalizer.normalize(e_strip)
        e_tokens = self._en_tok.tokenize(e_norm, escape=False)
        processed_sent = " ".join(e_tokens)
    else:
        # Indic path: IndicNLP normalize/tokenize (+ optional transliteration).
        processed_sent = self._do_indic_tokenize_and_transliterate(
            sent, normalizer, iso_lang, do_transliterate
        )

    processed_sent = processed_sent.strip()
    # Source sentences carry the language tag pair expected by the model.
    return processed_sent if is_target else f"{src_lang} {tgt_lang} {processed_sent}"
399
# Internal Method: Postprocess a Single Sentence
cdef str _postprocess(self, object sent, str lang) except *:
    """
    Postprocess one decoded sentence:
      1) pop the matching placeholder map from the internal queue
      2) apply Perso-Arabic / Oriya script fixes
      3) restore the original entity for every placeholder variant
      4) detokenize (Moses for English, IndicNLP otherwise)
    """
    cdef dict entity_map
    cdef str lang_code
    cdef str script_code
    cdef str iso_lang
    cdef str placeholder
    cdef str entity

    # Decoder outputs sometimes arrive wrapped in a tuple/list.
    if isinstance(sent, (tuple, list)):
        sent = sent[0]

    entity_map = self._placeholder_entity_maps.get()
    lang_code, script_code = lang.split("_", 1)
    iso_lang = self._flores_codes.get(lang, "hi")

    if script_code in ["Arab", "Aran"]:
        # Tighten spacing before Urdu punctuation and fix a glyph confusion.
        sent = (
            sent.replace(" ؟", "؟")
            .replace(" ۔", "۔")
            .replace(" ،", "،")
            .replace("ٮ۪", "ؠ")
        )

    if lang_code == "ory":
        # Oriya-specific character fix.
        sent = sent.replace("ଯ଼", "ୟ")

    # Restore the original entities for every registered placeholder spelling.
    for placeholder, entity in entity_map.items():
        sent = sent.replace(placeholder, entity)

    if lang == "eng_Latn":
        return self._en_detok.detokenize(sent.split(" "))
    # Back-transliterate from Devanagari into the target script, then detokenize.
    return indic_detokenize.trivial_detokenize(
        self._xliterator.transliterate(sent, "hi", iso_lang), iso_lang
    )
448
# Exposed Method: Preprocess a Batch of Sentences
cpdef list preprocess_batch(
    self,
    List[str] batch,
    str src_lang,
    str tgt_lang=None,
    bint is_target=False,
    bint visualize=False
):
    """
    Preprocess a batch of sentences (normalize, tokenize, transliterate).
    This is exposed for external use.

    Fix: removed the unused locals `results` and `i`.

    Args:
        batch: sentences to preprocess.
        src_lang: FLORES code of the input language (e.g. "eng_Latn").
        tgt_lang: FLORES code of the output language; required when
            `is_target` is False, since the tag pair is prepended.
        is_target: True when `batch` holds target-side references
            (skips the language-tag prefix).
        visualize: show a tqdm progress bar while processing.

    Returns:
        list of preprocessed sentence strings.
    """
    cdef object normalizer = None
    cdef str iso_code = self._flores_codes.get(src_lang, "hi")
    cdef object iterator
    cdef int n = len(batch)

    # English uses the Moses normalizer instead of an Indic normalizer.
    if src_lang != "eng_Latn":
        normalizer = IndicNormalizerFactory().get_normalizer(iso_code)

    if visualize:
        iterator = tqdm(batch, total=n, desc=f" | > Pre-processing {src_lang}", unit="line")
    else:
        iterator = batch

    return [self._preprocess(s, src_lang, tgt_lang, normalizer, is_target) for s in iterator]
478
# Exposed Method: Postprocess a Batch of Sentences
cpdef list postprocess_batch(
    self,
    List[str] sents,
    str lang="hin_Deva",
    bint visualize=False
):
    """
    Postprocess a batch of sentences: restore placeholders, fix script
    issues, and detokenize. This is exposed for external use.

    Fix: removed the unused local `i`.

    NOTE: the internal placeholder queue is cleared after the batch, so a
    `preprocess_batch` call must be paired with exactly one
    `postprocess_batch` call over the same sentences, in the same order.

    Args:
        sents: decoded sentences (individual items may also be
            tuples/lists whose first element is the sentence).
        lang: FLORES code of the target language.
        visualize: show a tqdm progress bar while processing.

    Returns:
        list of postprocessed sentence strings.
    """
    cdef object iterator
    cdef list results
    cdef int n = len(sents)

    if visualize:
        iterator = tqdm(sents, total=n, desc=f" | > Post-processing {lang}", unit="line")
    else:
        iterator = sents

    results = [self._postprocess(s, lang) for s in iterator]
    # Drop any leftover maps so a partial batch cannot leak into the next one.
    self._placeholder_entity_maps.queue.clear()

    return results
libs/IndicTransToolkit/IndicTransToolkit/version.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __version__ = "1.0.3"
libs/IndicTransToolkit/IndicTransToolkit/version.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 1.0.3
libs/IndicTransToolkit/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Varun Gumma.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
libs/IndicTransToolkit/README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IndicTransToolkit
2
+
3
+ ## About
4
+ The goal of this repository is to provide a simple, modular, and extendable toolkit for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and be compatible with the HuggingFace models released. Please refer to the `CHANGELOG.md` for latest developments.
5
+
6
+ ## Pre-requisites
7
+ - `Python 3.8+`
8
+ - [Indic NLP Library](https://github.com/VarunGumma/indic_nlp_library)
9
+ - Other requirements as listed in `requirements.txt`
10
+
11
+ ## Configuration
12
+ - Editable installation (Note, this may take a while):
13
+ ```bash
14
+ git clone https://github.com/VarunGumma/IndicTransToolkit
15
+ cd IndicTransToolkit
16
+
17
+ pip install --editable . --use-pep517 # required for pip >= 25.0
18
+
19
+ # in case it fails, try:
20
+ # pip install --editable . --use-pep517 --config-settings editable_mode=compat
21
+ ```
22
+
23
+ ## Examples
24
+ For the training use case, please refer [here](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface).
25
+
26
+ ### PreTrainedTokenizer
27
+ ```python
28
+ import torch
29
+ from IndicTransToolkit.processor import IndicProcessor # NOW IMPLEMENTED IN CYTHON !!
30
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
31
+
32
+ ip = IndicProcessor(inference=True)
33
+ tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
34
+ model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
35
+
36
+ sentences = [
37
+ "This is a test sentence.",
38
+ "This is another longer different test sentence.",
39
+ "Please send an SMS to 9876543210 and an email on [email protected] by 15th October, 2023.",
40
+ ]
41
+
42
+ batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="hin_Deva", visualize=False) # set it to visualize=True to print a progress bar
43
+ batch = tokenizer(batch, padding="longest", truncation=True, max_length=256, return_tensors="pt")
44
+
45
+ with torch.inference_mode():
46
+ outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)
47
+
48
+ with tokenizer.as_target_tokenizer():
49
+ # This scoping is absolutely necessary, as it will instruct the tokenizer to tokenize using the target vocabulary.
50
+ # Failure to use this scoping will result in gibberish/unexpected predictions as the output will be de-tokenized with the source vocabulary instead.
51
+ outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True)
52
+
53
+ outputs = ip.postprocess_batch(outputs, lang="hin_Deva")
54
+ print(outputs)
55
+
56
+ >>> ['यह एक परीक्षण वाक्य है।', 'यह एक और लंबा अलग परीक्षण वाक्य है।', 'कृपया 9876543210 पर एक एस. एम. एस. भेजें और 15 अक्टूबर, 2023 तक [email protected] पर एक ईमेल भेजें।']
57
+ ```
58
+
59
+ ### Evaluation
60
+ - `IndicEvaluator` is a python implementation of [compute_metrics.sh](https://github.com/AI4Bharat/IndicTrans2/blob/main/compute_metrics.sh).
61
+ - We have found that this python implementation gives slightly lower scores than the original `compute_metrics.sh`. So, please use this function cautiously, and feel free to raise a PR if you have found the bug/fix.
62
+ ```python
63
+ from IndicTransToolkit import IndicEvaluator
64
+
65
+ # this method returns a dictionary with BLEU and ChrF2++ scores with appropriate signatures
66
+ evaluator = IndicEvaluator()
67
+ scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=pred_file, refs=ref_file)
68
+
69
+ # alternatively, you can pass the list of predictions and references instead of files
70
+ # scores = evaluator.evaluate(tgt_lang=tgt_lang, preds=preds, refs=refs)
71
+ ```
72
+
73
+ ## Authors
74
+ - Varun Gumma ([email protected])
75
+ - Jay Gala ([email protected])
76
+ - Pranjal Agadh Chitale ([email protected])
77
+ - Raj Dabre ([email protected])
78
+
79
+
80
+ ## Bugs and Contribution
81
+ Since this is a bleeding-edge module, you may encounter broken stuff and import issues once in a while. In case you encounter any bugs or want additional functionalities, please feel free to raise `Issues`/`Pull Requests` or contact the authors.
82
+
83
+
84
+ ## Citation
85
+ If you use our codebase, or models, please do cite the following paper:
86
+ ```bibtex
87
+ @article{
88
+ gala2023indictrans,
89
+ title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
90
+ author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan},
91
+ journal={Transactions on Machine Learning Research},
92
+ issn={2835-8856},
93
+ year={2023},
94
+ url={https://openreview.net/forum?id=vfT4YuzAYA},
95
+ note={}
96
+ }
97
+ ```
libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d57d4239b3638a272e4b70292f10494ee4a0fee201a9d74c62fc35a3d263a45
3
+ size 260304
libs/IndicTransToolkit/build/lib.linux-x86_64-cpython-310/IndicTransToolkit/processor.cpython-310-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1a101ecb27adaf367f00c90b3f8e96e7fbda3bf0560d48c368fec3750a040a4
3
+ size 229200
libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/fast_processor.o ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9e82df38b208dc0a9b468ff669c9da159c7deaabcb389fcfacd43e038504fec
3
+ size 347184
libs/IndicTransToolkit/build/temp.linux-x86_64-cpython-310/IndicTransToolkit/processor.o ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d27c2cc00c97a89f97f7c28bc9175c5c403a0e2a372a0b39f1c5fe8609adda09
3
+ size 303696
libs/IndicTransToolkit/pyproject.toml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = [
3
+ "setuptools>=68.2.2",
4
+ "wheel",
5
+ "Cython",
6
+ ]
7
+ build-backend = "setuptools.build_meta"
8
+
9
+ [tool.black]
10
+ # Black configuration for code formatting
11
+ line-length = 88
12
+ target-version = ['py38']
13
+ exclude = '''
14
+ /(
15
+ \.git
16
+ | \.hg
17
+ | \.mypy_cache
18
+ | \.tox
19
+ | \.venv
20
+ | _build
21
+ | buck-out
22
+ | build
23
+ | dist
24
+ )/
25
+ '''
libs/IndicTransToolkit/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ setuptools>=68.2.2
2
+ torch
3
+ cython
4
+ sacremoses
5
+ sentencepiece
6
+ transformers
7
+ sacrebleu
8
+ indic-nlp-library-IT2 @ git+https://github.com/VarunGumma/indic_nlp_library.git
libs/IndicTransToolkit/setup.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build script for IndicTransToolkit: compiles the Cython extension and
installs the package. Reads the version from version.txt and mirrors it
into an importable version.py."""
import os
import pathlib
from sys import version_info, exit
from setuptools import setup, find_packages
from Cython.Build import cythonize


def write_version_py():
    """Copy IndicTransToolkit/version.txt into version.py; return the version."""
    version_txt_path = os.path.join("IndicTransToolkit", "version.txt")
    with open(version_txt_path, "r", encoding="utf-8") as f:
        version = f.read().strip()

    version_py_path = os.path.join("IndicTransToolkit", "version.py")
    with open(version_py_path, "w", encoding="utf-8") as f:
        f.write(f'__version__ = "{version}"\n')
    return version


# Enforce Python >= 3.8
if version_info < (3, 8):
    exit("Sorry, Python >= 3.8 is required for IndicTransToolkit.")

# Read long description from README
with open("README.md", "r", errors="ignore", encoding="utf-8") as fh:
    long_description = fh.read().strip()

# Write version.py from version.txt
version = write_version_py()

# Parse requirements.txt with a plain line read.
# Fix: `pkg_resources.parse_requirements` is deprecated and removed in
# recent setuptools releases; a simple filter of non-empty, non-comment
# lines handles this file (including "pkg @ git+..." direct references).
req_file = pathlib.Path("requirements.txt")
requirements = [
    line.strip()
    for line in req_file.read_text(encoding="utf-8").splitlines()
    if line.strip() and not line.strip().startswith("#")
]

# Cython files to compile (adjust if your .pyx name differs)
cython_extensions = cythonize(
    [
        "IndicTransToolkit/processor.pyx",
    ],
    compiler_directives={"language_level": "3", "boundscheck": False},
)

setup(
    name="IndicTransToolkit",
    version=version,
    author="Varun Gumma",
    author_email="[email protected]",
    description="A simple, consistent, and extendable module for IndicTrans2 tokenizer compatible with HuggingFace models",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/VarunGumma/IndicTransToolkit",
    packages=find_packages(),  # Auto-detect packages
    license="MIT",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.8",
    install_requires=requirements,
    ext_modules=cython_extensions,
    zip_safe=False,
)
libs/indic_nlp_library/.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ **/__pycache__/
2
+ *.egg-info/
3
+ dist/
4
+ build/
5
+ contrib/
6
+ docs/
libs/indic_nlp_library/LICENSE ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2013-present Anoop Kunchukuttan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
libs/indic_nlp_library/README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Indic NLP Library
2
+ This repository is a _de-bloated_ fork of the original [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library) and integrates the [UrduHack](https://github.com/urduhack/urduhack) submodule and [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) directly. This allows working with Urdu normalization and tokenization without needing to install [urduhack](https://pypi.org/project/urduhack/) and `indic_nlp_resources` separately, which can be an issue sometimes as it is `TensorFlow` based. This repository is mainly created and maintained for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and [IndicTransTokenizer](https://github.com/VarunGumma/IndicTransTokenizer)
3
+
4
+ For any queries, please get in touch with the original authors/maintainers of the respective libraries:
5
+
6
+ - `Indic NLP Library`: [anoopkunchukuttan](https://github.com/anoopkunchukuttan)
7
+ - `Indic NLP Resources`: [anoopkunchukuttan](https://github.com/anoopkunchukuttan)
8
+ - `UrduHack`: [UrduHack](https://github.com/urduhack)
9
+
10
+ ## Usage:
11
+ ```
12
+ git clone https://github.com/VarunGumma/indic_nlp_library.git
13
+
14
+ cd indic_nlp_library
15
+ pip install --editable ./
16
+ ```
17
+
18
+ ## Updates:
19
+ - Integrated `urduhack` directly into the repository.
20
+ - Renamed `master` branch as `main`.
21
+ - Integrated `indic_nlp_resources` directly into the repository.
22
+ - _De-bloated_ the repository.
libs/indic_nlp_library/RESOURCES/script/all_script_phonetic_data.csv ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Unicode,Relative Offset,Devanagari,ITRANS,Notes,Valid Vector Representation,is_vowel,is_consonant,nukta,halanta,anusvara,misc,short_vowel,long_vowel,weak,medium,strong,independent_vowel,dependent_vowel,plosive,fricative,Central-approximant,Lateral-approximant,flap,velar,palatal,retroflex,dental,labial,aspirated,not_aspirated,voiced,unvoiced,nasal,not_nasal,front,central,back,close,close-mid,open-mid,open,rounded,not_rounded
2
+ 900,0,ऀ,ऀ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3
+ 901,1,ँ,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
4
+ 902,2,ं,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
5
+ 903,3,ः,H,Should represent as pure aspiration and not as a vowel,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6
+ 904,4,ऄ,ऄ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7
+ 905,5,अ,a,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
8
+ 906,6,आ,A,,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
9
+ 907,7,इ,i,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
10
+ 908,8,ई,I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
11
+ 909,9,उ,u,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
12
+ 90a,10,ऊ,uu,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
13
+ 90b,11,ऋ,R^i,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1
14
+ 90c,12,ऌ,LLi,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0
15
+ 90d,13,ऍ,ऍ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1
16
+ 90e,14,ऎ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
17
+ 90f,15,ए,e,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
18
+ 910,16,ऐ,ai,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1
19
+ 911,17,ऑ,ऑ,Nasalized o,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
20
+ 912,18,ऒ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
21
+ 913,19,ओ,o,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
22
+ 914,20,औ,au,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1
23
+ 915,21,क,ka,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
24
+ 916,22,ख,kha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
25
+ 917,23,ग,ga,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
26
+ 918,24,घ,gha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
27
+ 919,25,ङ,~Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
28
+ 91a,26,च,ca,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
29
+ 91b,27,छ,Cha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
30
+ 91c,28,ज,ja,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
31
+ 91d,29,झ,jha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
32
+ 91e,30,ञ,JNa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
33
+ 91f,31,ट,Ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
34
+ 920,32,ठ,Tha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
35
+ 921,33,ड,Da,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
36
+ 922,34,ढ,Dha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
37
+ 923,35,ण,Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
38
+ 924,36,त,ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
39
+ 925,37,थ,tha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
40
+ 926,38,द,da,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
41
+ 927,39,ध,dha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
42
+ 928,40,न,na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
43
+ 929,41,ऩ,ऩ,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
44
+ 92a,42,प,pa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
45
+ 92b,43,फ,pha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
46
+ 92c,44,ब,ba,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
47
+ 92d,45,भ,bha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
48
+ 92e,46,म,ma,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
49
+ 92f,47,य,ya,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
50
+ 930,48,र,ra,alveolar or dental- approximated by dental/ can also be considered a rhotic consonant (flap ie tap),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
51
+ 931,49,ऱ,Ra,retroflex (trill),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
52
+ 932,50,ल,la,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
53
+ 933,51,ळ,La,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
54
+ 934,52,ऴ,zha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
55
+ 935,53,व,va,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
56
+ 936,54,श,sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
57
+ 937,55,ष,Sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
58
+ 938,56,स,sa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
59
+ 939,57,ह,ha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
60
+ 93a,58,ऺ,ऺ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
61
+ 93b,59,ऻ,ऻ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
62
+ 93c,60,़,़,,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
63
+ 93d,61,ऽ,.a,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64
+ 93e,62,ा,A,,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
65
+ 93f,63,ि,i,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
66
+ 940,64,ी,I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
67
+ 941,65,ु,u,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
68
+ 942,66,ू,uu,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
69
+ 943,67,ृ,R^i,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1
70
+ 944,68,ॄ,R^I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1
71
+ 945,69,ॅ,ॅ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1
72
+ 946,70,ॆ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
73
+ 947,71,े,e,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
74
+ 948,72,ै,ai,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1
75
+ 949,73,ॉ,ॉ,Nasalized o,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
76
+ 94a,74,ॊ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
77
+ 94b,75,ो,o,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
78
+ 94c,76,ौ,au,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1
79
+ 94d,77,्,,,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
80
+ 94e,78,ॎ,ॎ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
81
+ 94f,79,ॏ,ॏ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
82
+ 950,80,ॐ,AUM,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
83
+ 951,81,॑,॑,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
84
+ 952,82,॒,॒,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
85
+ 953,83,॓,॓,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
86
+ 954,84,॔,॔,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
87
+ 955,85,ॕ,ॕ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
88
+ 956,86,ॖ,ॖ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
89
+ 957,87,ॗ,ॗ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
90
+ 958,88,क़,क़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
91
+ 959,89,ख़,ख़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
92
+ 95a,90,ग़,ग़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93
+ 95b,91,ज़,ज़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
94
+ 95c,92,ड़,ड़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
95
+ 95d,93,ढ़,ढ़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
96
+ 95e,94,फ़,फ़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
97
+ 95f,95,य़,य़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
98
+ 960,96,ॠ,R^I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,0,0,1
99
+ 961,97,ॡ,L^I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0
100
+ 962,98,ॢ,LLi,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0
101
+ 963,99,ॣ,L^I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,1,0
102
+ 964,100,।,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
103
+ 965,101,॥,..,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
104
+ 966,102,०,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
105
+ 967,103,१,1,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
106
+ 968,104,२,2,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
107
+ 969,105,३,3,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
108
+ 96a,106,४,4,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
109
+ 96b,107,५,5,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
110
+ 96c,108,६,6,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
111
+ 96d,109,७,7,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
112
+ 96e,110,८,8,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
113
+ 96f,111,९,9,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
libs/indic_nlp_library/RESOURCES/script/english_arpabet_list.csv ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ AO
2
+ AA
3
+ IY
4
+ UW
5
+ EH
6
+ IH
7
+ UH
8
+ AH
9
+ AX
10
+ AE
11
+ EY
12
+ AY
13
+ OW
14
+ AW
15
+ OY
16
+ P
17
+ B
18
+ T
19
+ D
20
+ K
21
+ G
22
+ CH
23
+ JH
24
+ F
25
+ V
26
+ TH
27
+ DH
28
+ S
29
+ Z
30
+ SH
31
+ ZH
32
+ HH
33
+ M
34
+ EM
35
+ N
36
+ EN
37
+ NG
38
+ ENG
39
+ L
40
+ EL
41
+ R
42
+ DX
43
+ NX
44
+ Y
45
+ W
46
+ Q
libs/indic_nlp_library/RESOURCES/script/english_script_phonetic_data.csv ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Unicode,Relative Offset,Devanagari,ITRANS,Notes,Valid Vector Representation,is_vowel,is_consonant,nukta,halanta,anusvara,misc,short_vowel,long_vowel,weak,medium,strong,independent_vowel,dependent_vowel,plosive,fricative,Central-approximant,Lateral-approximant,flap,velar,palatal,retroflex,dental,labial,aspirated,not_aspirated,voiced,unvoiced,nasal,not_nasal,front,central,back,close,close-mid,open-mid,open,rounded,not_rounded
2
+ 900,0,,AO,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
3
+ 901,1,,AA,,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
4
+ 902,2,,IY,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
5
+ 903,3,,UW,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
6
+ 904,4,ए,EH,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
7
+ 905,5,इ,IH,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
8
+ 906,6,उ,UH,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
9
+ 907,7,अ,AH,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
10
+ 908,8,अ,AX,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
11
+ 909,9,ऍ,AE,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1
12
+ 90a,10,,EY,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
13
+ 90b,11,ऐ,AY,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1
14
+ 90c,12,ओ,OW,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
15
+ 90d,13,औ,AW,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1
16
+ 90e,14,,OY,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
17
+ 90f,15,,P,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
18
+ 910,16,,B,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
19
+ 911,17,,T,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
20
+ 912,18,,D,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
21
+ 913,19,,K,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
22
+ 914,20,,G,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
23
+ 915,21,,CH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
24
+ 916,22,,JH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
25
+ 917,23,,F,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
26
+ 918,24,,V,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
27
+ 919,25,,TH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0
28
+ 91a,26,,DH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
29
+ 91b,27,,S,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
30
+ 91c,28,,Z,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
31
+ 91d,29,,SH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
32
+ 91e,30,,ZH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
33
+ 91f,31,,HH,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
34
+ 920,32,,M,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
35
+ 921,33,,EM,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
36
+ 922,34,,N,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
37
+ 923,35,,EN,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
38
+ 924,36,,NG,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
39
+ 925,37,,ENG,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
40
+ 926,38,,L,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
41
+ 927,39,,EL,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
42
+ 928,40,,R,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
43
+ 929,41,,DX,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
44
+ 92a,42,,NX,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
45
+ 92b,43,,Y,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
46
+ 92c,44,,W,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
47
+ 92d,45,,Q,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
libs/indic_nlp_library/RESOURCES/script/tamil_script_phonetic_data.csv ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Unicode,Relative Offset,Devanagari,ITRANS,Notes,Valid Vector Representation,is_vowel,is_consonant,nukta,halanta,anusvara,misc,short_vowel,long_vowel,weak,medium,strong,independent_vowel,dependent_vowel,plosive,fricative,Central-approximant,Lateral-approximant,flap,velar,palatal,retroflex,dental,labial,aspirated,not_aspirated,voiced,unvoiced,nasal,not_nasal,front,central,back,close,close-mid,open-mid,open,rounded,not_rounded
2
+ 900,0,ऀ,ऀ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3
+ 901,1,ँ,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
4
+ 902,2,ं,.n,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
5
+ 903,3,ः,H,Should represent as pure aspiration and not as a vowel,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6
+ 904,4,ऄ,ऄ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7
+ 905,5,अ,a,,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
8
+ 906,6,आ,A,,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
9
+ 907,7,इ,i,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
10
+ 908,8,ई,I,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
11
+ 909,9,उ,u,,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
12
+ 90a,10,ऊ,uu,,1,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
13
+ 90b,11,ऋ,R^i,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14
+ 90c,12,ऌ,LLi,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
15
+ 90d,13,ऍ,ऍ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1
16
+ 90e,14,ऎ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
17
+ 90f,15,ए,e,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
18
+ 910,16,ऐ,ai,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1
19
+ 911,17,ऑ,ऑ,Nasalized o,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
20
+ 912,18,ऒ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
21
+ 913,19,ओ,o,,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
22
+ 914,20,औ,au,,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1
23
+ 915,21,क,ka,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0
24
+ 916,22,ख,kha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25
+ 917,23,ग,ga,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
26
+ 918,24,घ,gha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27
+ 919,25,ङ,~Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
28
+ 91a,26,च,ca,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0
29
+ 91b,27,छ,Cha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30
+ 91c,28,ज,ja,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
31
+ 91d,29,झ,jha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
32
+ 91e,30,ञ,JNa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
33
+ 91f,31,ट,Ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0
34
+ 920,32,ठ,Tha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
35
+ 921,33,ड,Da,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36
+ 922,34,ढ,Dha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37
+ 923,35,ण,Na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
38
+ 924,36,त,ta,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0
39
+ 925,37,थ,tha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
40
+ 926,38,द,da,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
41
+ 927,39,ध,dha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
42
+ 928,40,न,na,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
43
+ 929,41,ऩ,ऩ,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
44
+ 92a,42,प,pa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0
45
+ 92b,43,फ,pha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
46
+ 92c,44,ब,ba,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
47
+ 92d,45,भ,bha,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48
+ 92e,46,म,ma,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0
49
+ 92f,47,य,ya,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
50
+ 930,48,र,ra,alveolar or dental- approximated by dental/ can also be considered a rhotic consonant (flap ie tap),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
51
+ 931,49,ऱ,Ra,retroflex (trill),1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
52
+ 932,50,ल,la,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
53
+ 933,51,ळ,La,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
54
+ 934,52,ऴ,zha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
55
+ 935,53,व,va,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
56
+ 936,54,श,sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
57
+ 937,55,ष,Sha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
58
+ 938,56,स,sa,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
59
+ 939,57,ह,ha,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
60
+ 93a,58,ऺ,ऺ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
61
+ 93b,59,ऻ,ऻ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
62
+ 93c,60,़,़,,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
63
+ 93d,61,ऽ,.a,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
64
+ 93e,62,ा,A,,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0,1
65
+ 93f,63,ि,i,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
66
+ 940,64,ी,I,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1
67
+ 941,65,ु,u,,1,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
68
+ 942,66,ू,uu,,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,0,1,0
69
+ 943,67,ृ,R^i,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
70
+ 944,68,ॄ,R^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
71
+ 945,69,ॅ,ॅ,Nasalized e,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,1
72
+ 946,70,ॆ,.e,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
73
+ 947,71,े,e,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,1
74
+ 948,72,ै,ai,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,1
75
+ 949,73,ॉ,ॉ,Nasalized o,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0
76
+ 94a,74,ॊ,.o,,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
77
+ 94b,75,ो,o,,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,0,1,0,0,1,0
78
+ 94c,76,ौ,au,,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1
79
+ 94d,77,्,,,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
80
+ 94e,78,ॎ,ॎ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
81
+ 94f,79,ॏ,ॏ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
82
+ 950,80,ॐ,AUM,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
83
+ 951,81,॑,॑,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
84
+ 952,82,॒,॒,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
85
+ 953,83,॓,॓,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
86
+ 954,84,॔,॔,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
87
+ 955,85,ॕ,ॕ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
88
+ 956,86,ॖ,ॖ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
89
+ 957,87,ॗ,ॗ,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
90
+ 958,88,क़,क़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
91
+ 959,89,ख़,ख़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
92
+ 95a,90,ग़,ग़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
93
+ 95b,91,ज़,ज़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
94
+ 95c,92,ड़,ड़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
95
+ 95d,93,ढ़,ढ़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
96
+ 95e,94,फ़,फ़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
97
+ 95f,95,य़,य़,will be decomposed to canonical representation: consonant+nukta,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
98
+ 960,96,ॠ,R^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
99
+ 961,97,ॡ,L^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
100
+ 962,98,ॢ,LLi,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
101
+ 963,99,ॣ,L^I,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
102
+ 964,100,।,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
103
+ 965,101,॥,..,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
104
+ 966,102,०,0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
105
+ 967,103,१,1,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
106
+ 968,104,२,2,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
107
+ 969,105,३,3,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
108
+ 96a,106,४,4,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
109
+ 96b,107,५,5,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
110
+ 96c,108,६,6,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
111
+ 96d,109,७,7,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
112
+ 96e,110,८,8,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
113
+ 96f,111,९,9,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
libs/indic_nlp_library/RESOURCES/transliterate/offset_itrans_map.csv ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ offset_hex,devnag_char,itrans
2
+ 0x0,ऀ,ऀ
3
+ 0x1,ँ,ँ
4
+ 0x2,ं,.m
5
+ 0x3,ः,H
6
+ 0x4,ऄ,ऄ
7
+ 0x5,अ,a
8
+ 0x6,आ,aa
9
+ 0x7,इ,i
10
+ 0x8,ई,ii
11
+ 0x9,उ,u
12
+ 0xa,ऊ,uu
13
+ 0xb,ऋ,R^i
14
+ 0xc,ऌ,L^i
15
+ 0xd,ऍ,ऍ
16
+ 0xe,ऎ,.e
17
+ 0xf,ए,e
18
+ 0x10,ऐ,ai
19
+ 0x11,ऑ,ऑ
20
+ 0x12,ऒ,.o
21
+ 0x13,ओ,o
22
+ 0x14,औ,au
23
+ 0x15,क,ka
24
+ 0x16,ख,kha
25
+ 0x17,ग,ga
26
+ 0x18,घ,gha
27
+ 0x19,ङ,~Na
28
+ 0x1a,च,cha
29
+ 0x1b,छ,Cha
30
+ 0x1c,ज,ja
31
+ 0x1d,झ,jha
32
+ 0x1e,ञ,~na
33
+ 0x1f,ट,Ta
34
+ 0x20,ठ,Tha
35
+ 0x21,ड,Da
36
+ 0x22,ढ,Dha
37
+ 0x23,ण,Na
38
+ 0x24,त,ta
39
+ 0x25,थ,tha
40
+ 0x26,द,da
41
+ 0x27,ध,dha
42
+ 0x28,न,na
43
+ 0x29,ऩ,*na
44
+ 0x2a,प,pa
45
+ 0x2b,फ,pha
46
+ 0x2c,ब,ba
47
+ 0x2d,भ,bha
48
+ 0x2e,म,ma
49
+ 0x2f,य,ya
50
+ 0x30,र,ra
51
+ 0x31,ऱ,Ra
52
+ 0x32,ल,la
53
+ 0x33,ळ,lda
54
+ 0x34,ऴ,zha
55
+ 0x35,व,va
56
+ 0x36,श,sha
57
+ 0x37,ष,Sha
58
+ 0x38,स,sa
59
+ 0x39,ह,ha
60
+ 0x3a,ऺ,ऺ
61
+ 0x3b,ऻ,ऻ
62
+ 0x3c,़,़
63
+ 0x3d,ऽ,.a
64
+ 0x3e,ा,aa
65
+ 0x3f,ि,i
66
+ 0x40,ी,ii
67
+ 0x41,ु,u
68
+ 0x42,ू,uu
69
+ 0x43,ृ,R^i
70
+ 0x44,ॄ,R^I
71
+ 0x45,ॅ,ॅ
72
+ 0x46,ॆ,.e
73
+ 0x47,े,e
74
+ 0x48,ै,ai
75
+ 0x49,ॉ,ॉ
76
+ 0x4a,ॊ,.o
77
+ 0x4b,ो,o
78
+ 0x4c,ौ,au
79
+ 0x4d,्,
80
+ 0x4e,ॎ,ॎ
81
+ 0x4f,ॏ,ॏ
82
+ 0x50,ॐ,AUM
83
+ 0x51,॑,॑
84
+ 0x52,॒,॒
85
+ 0x53,॓,॓
86
+ 0x54,॔,॔
87
+ 0x55,ॕ,ॕ
88
+ 0x56,ॖ,ॖ
89
+ 0x57,ॗ,ॗ
90
+ 0x58,क़,क़
91
+ 0x59,ख़,ख़
92
+ 0x5a,ग़,ग़
93
+ 0x5b,ज़,ज़
94
+ 0x5c,ड़,ड़
95
+ 0x5d,ढ़,ढ़
96
+ 0x5e,फ़,फ़
97
+ 0x5f,य़,य़
98
+ 0x60,ॠ,R^I
99
+ 0x61,ॡ,L^I
100
+ 0x62,ॢ,L^i
101
+ 0x63,ॣ,L^I
102
+ 0x64,।,.
103
+ 0x65,॥,..
104
+ 0x66,०,0
105
+ 0x67,१,1
106
+ 0x68,२,2
107
+ 0x69,३,3
108
+ 0x6a,४,4
109
+ 0x6b,५,5
110
+ 0x6c,६,6
111
+ 0x6d,७,7
112
+ 0x6e,८,8
113
+ 0x6f,९,9
114
+ 0x70,॰,॰
115
+ 0x71,ॱ,ॱ
116
+ 0x72,ॲ,ॲ
117
+ 0x73,ॳ,ॳ
118
+ 0x74,ॴ,ॴ
119
+ 0x75,ॵ,ॵ
120
+ 0x76,ॶ,ॶ
121
+ 0x77,ॷ,ॷ
122
+ 0x78,ॸ,ॸ
123
+ 0x79,ॹ,ॹ
124
+ 0x7a,ॺ,ॺ
125
+ 0x7b,ॻ,ॻ
126
+ 0x7c,ॼ,ॼ
127
+ 0x7d,ॽ,ॽ
128
+ 0x7e,ॾ,ॾ
129
+ 0x7f,ॿ,a
libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/PKG-INFO ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.2
2
+ Name: indic_nlp_library_IT2
3
+ Version: 0.0.2
4
+ Summary: The goal of the Indic NLP Library is to build Python based libraries for common text processing and Natural Language Processing in Indian languages. This fork is specialized for IndicTrans2.
5
+ Home-page: https://github.com/VarunGumma/indic_nlp_library
6
+ Author: Varun Gumma
7
+ Author-email: [email protected]
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: sphinx-argparse
16
+ Requires-Dist: sphinx_rtd_theme
17
+ Requires-Dist: morfessor
18
+ Requires-Dist: pandas
19
+ Requires-Dist: numpy
20
+ Dynamic: author
21
+ Dynamic: author-email
22
+ Dynamic: classifier
23
+ Dynamic: description
24
+ Dynamic: description-content-type
25
+ Dynamic: home-page
26
+ Dynamic: license
27
+ Dynamic: requires-dist
28
+ Dynamic: requires-python
29
+ Dynamic: summary
30
+
31
+ # Indic NLP Library
32
+ This repository is a _de-bloated_ fork of the original [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library) and integrates the [UrduHack](https://github.com/urduhack/urduhack) submodule and [Indic NLP Resources](https://github.com/anoopkunchukuttan/indic_nlp_resources) directly. This allows working with Urdu normalization and tokenization without needing to install [urduhack](https://pypi.org/project/urduhack/) and `indic_nlp_resources` separately, which can sometimes be an issue as it is `TensorFlow` based. This repository is mainly created and maintained for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and [IndicTransTokenizer](https://github.com/VarunGumma/IndicTransTokenizer).
33
+
34
+ For any queries, please get in touch with the original authors/maintainers of the respective libraries:
35
+
36
+ - `Indic NLP Library`: [anoopkunchukuttan](https://github.com/anoopkunchukuttan)
37
+ - `Indic NLP Resources`: [anoopkunchukuttan](https://github.com/anoopkunchukuttan)
38
+ - `UrduHack`: [UrduHack](https://github.com/urduhack)
39
+
40
+ ## Usage:
41
+ ```
42
+ git clone https://github.com/VarunGumma/indic_nlp_library.git
43
+
44
+ cd indic_nlp_library
45
+ pip install --editable ./
46
+ ```
47
+
48
+ ## Updates:
49
+ - Integrated `urduhack` directly into the repository.
50
+ - Renamed `master` branch as `main`.
51
+ - Integrated `indic_nlp_resources` directly into the repository.
52
+ - _De-bloated_ the repository.
libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ indic_nlp_library_IT2.egg-info/PKG-INFO
5
+ indic_nlp_library_IT2.egg-info/SOURCES.txt
6
+ indic_nlp_library_IT2.egg-info/dependency_links.txt
7
+ indic_nlp_library_IT2.egg-info/requires.txt
8
+ indic_nlp_library_IT2.egg-info/top_level.txt
9
+ indicnlp/__init__.py
10
+ indicnlp/common.py
11
+ indicnlp/langinfo.py
12
+ indicnlp/loader.py
13
+ indicnlp/version.py
14
+ indicnlp/normalize/__init__.py
15
+ indicnlp/normalize/indic_normalize.py
16
+ indicnlp/script/__init__.py
17
+ indicnlp/script/english_script.py
18
+ indicnlp/script/indic_scripts.py
19
+ indicnlp/script/phonetic_sim.py
20
+ indicnlp/tokenize/__init__.py
21
+ indicnlp/tokenize/indic_detokenize.py
22
+ indicnlp/tokenize/indic_tokenize.py
23
+ indicnlp/tokenize/sentence_tokenize.py
24
+ indicnlp/transliterate/__init__.py
25
+ indicnlp/transliterate/acronym_transliterator.py
26
+ indicnlp/transliterate/script_unifier.py
27
+ indicnlp/transliterate/unicode_transliterate.py
28
+ indicnlp/urduhack/__init__.py
29
+ indicnlp/urduhack/stop_words.py
30
+ indicnlp/urduhack/urdu_characters.py
31
+ indicnlp/urduhack/normalization/__init__.py
32
+ indicnlp/urduhack/normalization/character.py
33
+ indicnlp/urduhack/normalization/regexes.py
34
+ indicnlp/urduhack/preprocessing/__init__.py
35
+ indicnlp/urduhack/preprocessing/character.py
36
+ indicnlp/urduhack/preprocessing/regexes.py
37
+ indicnlp/urduhack/preprocessing/util.py
38
+ indicnlp/urduhack/tokenization/__init__.py
39
+ indicnlp/urduhack/tokenization/eos.py
40
+ indicnlp/urduhack/tokenization/tokenizer.py
libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/requires.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ sphinx-argparse
2
+ sphinx_rtd_theme
3
+ morfessor
4
+ pandas
5
+ numpy
libs/indic_nlp_library/indic_nlp_library_IT2.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ indicnlp