Rajendransp133's picture
Upload 86 files
ac901c7 verified
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Program for script unification of text written in Unicode: the text is normalized and then
# transliterated to a common language's script. This is mainly geared towards Indic scripts
#
# @author Anoop Kunchukuttan
#
import sys
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate
from indicnlp import loader
class AggressiveScriptUnifier:
    """Normalize text with per-language normalizers, then transliterate it.

    Every normalizer is created with chandra normalization, vowel-ending
    normalization and nukta removal switched on; Punjabi ("pa"), Odia ("or"),
    Assamese ("as") and Malayalam ("ml") additionally receive the
    language-specific flags listed in ``_init_normalizers``.
    """

    def __init__(self, common_lang="hi", nasals_mode="to_nasal_consonants"):
        """Store the configuration and build one normalizer per language.

        Args:
            common_lang: language code passed to the transliterator as the
                target of every ``transform`` call.
            nasals_mode: forwarded unchanged to every normalizer as its
                ``nasals_mode`` option.
        """
        self.common_lang = common_lang
        self.nasals_mode = nasals_mode
        self.do_normalize_chandras = True
        self.do_normalize_vowel_ending = True
        self.remove_nuktas = True
        self.normalizer_map = {}
        self._init_normalizers()

    def _init_normalizers(self):
        """Populate ``self.normalizer_map`` with a normalizer for each language code."""
        normalizer_factory = indic_normalize.IndicNormalizerFactory()

        # Options that every language's normalizer receives.
        shared_options = {
            "nasals_mode": self.nasals_mode,
            "do_normalize_chandras": self.do_normalize_chandras,
            "remove_nuktas": self.remove_nuktas,
            "do_normalize_vowel_ending": self.do_normalize_vowel_ending,
        }

        ## for languages with common parameters
        # NOTE(review): "kK" is reproduced exactly as written in this list --
        # confirm it is the code the normalizer factory expects.
        for lang in ["hi", "mr", "sa", "kK", "ne", "sd", "bn", "gu", "ta", "te", "kn"]:
            self.normalizer_map[lang] = normalizer_factory.get_normalizer(
                lang, **shared_options
            )

        ## for languages with language specific parameters
        language_specific_options = {
            "pa": {
                "do_canonicalize_addak": True,
                "do_canonicalize_tippi": True,
                "do_replace_vowel_bases": True,
            },
            "or": {"do_remap_wa": True},
            "as": {"do_remap_assamese_chars": True},
            "ml": {
                "do_canonicalize_chillus": True,
                "do_correct_geminated_T": True,
            },
        }
        for lang, extra_options in language_specific_options.items():
            self.normalizer_map[lang] = normalizer_factory.get_normalizer(
                lang, **shared_options, **extra_options
            )

    def transform(self, text, lang):
        """Normalize ``text`` (when possible) and transliterate it to ``common_lang``.

        Args:
            text: the text to process.
            lang: language code of ``text``.

        Returns:
            The value returned by ``UnicodeIndicTransliterator.transliterate``
            for the (possibly normalized) text.
        """
        # Languages without a registered normalizer skip normalization instead
        # of raising KeyError, matching BasicScriptUnifier.transform.
        if lang in self.normalizer_map:
            text = self.normalizer_map[lang].normalize(text)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            text, lang, self.common_lang
        )
class BasicScriptUnifier:
    """Apply each language's normalizer with only ``nasals_mode`` set, then transliterate."""

    def __init__(self, common_lang="hi", nasals_mode="do_nothing"):
        """Remember the settings and create the per-language normalizers.

        ``common_lang`` is handed to the transliterator on every ``transform``
        call; ``nasals_mode`` is forwarded to every normalizer.
        """
        self.common_lang = common_lang
        self.nasals_mode = nasals_mode
        self.normalizer_map = {}
        self._init_normalizers()

    def _init_normalizers(self):
        """Register one normalizer per supported language code in ``normalizer_map``."""
        factory = indic_normalize.IndicNormalizerFactory()
        supported_codes = (
            "hi", "mr", "sa", "kK", "ne",
            "sd", "bn", "gu", "ta", "te",
            "kn", "pa", "or", "as", "ml",
        )
        for code in supported_codes:
            self.normalizer_map[code] = factory.get_normalizer(
                code, nasals_mode=self.nasals_mode
            )

    def transform(self, text, lang):
        """Return ``text`` normalized (if ``lang`` has a normalizer) and transliterated."""
        normalizer = self.normalizer_map.get(lang)
        # Languages with no registered normalizer go straight to transliteration.
        if normalizer is not None:
            text = normalizer.normalize(text)
        transliterator = unicode_transliterate.UnicodeIndicTransliterator
        return transliterator.transliterate(text, lang, self.common_lang)
class NaiveScriptUnifier:
    """Transliterate text to a common language without any normalization step."""

    def __init__(self, common_lang="hi"):
        """Record ``common_lang``, the language code every ``transform`` call targets."""
        self.common_lang = common_lang

    def transform(self, text, lang):
        """Pass ``text``, ``lang`` and ``common_lang`` to the transliterator and return its result."""
        transliterator = unicode_transliterate.UnicodeIndicTransliterator
        return transliterator.transliterate(text, lang, self.common_lang)
def build_unifier(command):
    """Return the unifier instance for a command-line ``command``.

    Args:
        command: one of "aggressive", "moderate", "basic" or "naive".

    Returns:
        A freshly constructed unifier, or ``None`` when ``command`` is not
        one of the recognised names.
    """
    if command == "aggressive":
        return AggressiveScriptUnifier(nasals_mode="to_nasal_consonants")
    if command == "moderate":
        # "moderate" uses the aggressive unifier with nasals left untouched.
        return AggressiveScriptUnifier(nasals_mode="do_nothing")
    if command == "basic":
        return BasicScriptUnifier()
    if command == "naive":
        return NaiveScriptUnifier()
    return None


def unify_file(unifier, infile, outfile, language):
    """Transform ``infile`` line by line with ``unifier`` and write ``outfile``.

    Each input line is stripped of surrounding whitespace, passed to
    ``unifier.transform(line, language)`` and written followed by a newline.
    Both files are read/written as UTF-8.

    Args:
        unifier: any object exposing ``transform(text, lang)``.
        infile: path of the file to read.
        outfile: path of the file to create or overwrite.
        language: language code handed to ``unifier.transform``.
    """
    with open(infile, "r", encoding="utf-8") as ifile:
        with open(outfile, "w", encoding="utf-8") as ofile:
            # Iterate the file object directly so the whole input is never
            # held in memory at once.
            for line in ifile:
                ofile.write(unifier.transform(line.strip(), language) + "\n")


def main(argv):
    """Run the command-line interface and return the process exit status.

    Args:
        argv: the full argument vector, i.e.
            ``[program, command, infile, outfile, language]``.

    Returns:
        0 on success; 1 when too few arguments are given or the command is
        not recognised.
    """
    loader.load()

    usage = "Usage: python script_unifier <command> <infile> <outfile> <language>"
    if len(argv) <= 4:
        print(usage)
        return 1

    command, infile, outfile, language = argv[1:5]
    unifier = build_unifier(command)
    if unifier is None:
        # Previously an unrecognised command did nothing and exited with 0.
        print("Unknown command: {}".format(command), file=sys.stderr)
        print(usage, file=sys.stderr)
        return 1

    unify_file(unifier, infile, outfile, language)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))