#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#

# Program for normalization of text written in Unicode. This is mainly geared towards Indic scripts
#
# @author Anoop Kunchukuttan
#

import sys

from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate
from indicnlp import loader


class AggressiveScriptUnifier:
    """Normalize text with per-language normalizers, then transliterate it.

    Every normalizer is created with chandra normalization, vowel-ending
    normalization and nukta removal switched on, plus the ``nasals_mode``
    given to the constructor. Punjabi, Odia, Assamese and Malayalam get
    additional language-specific options on top of those.
    """

    def __init__(self, common_lang="hi", nasals_mode="to_nasal_consonants"):
        """Build the normalizers.

        Args:
            common_lang: language code whose script all text is
                transliterated into.
            nasals_mode: value forwarded as ``nasals_mode`` to every
                normalizer created by the factory.
        """
        self.common_lang = common_lang
        self.nasals_mode = nasals_mode
        self.do_normalize_chandras = True
        self.do_normalize_vowel_ending = True
        self.remove_nuktas = True
        self.normalizer_map = {}
        self._init_normalizers()

    def _init_normalizers(self):
        """Populate ``self.normalizer_map`` with one normalizer per language."""
        normalizer_factory = indic_normalize.IndicNormalizerFactory()

        # Options shared by every language handled by this unifier.
        shared_options = {
            "nasals_mode": self.nasals_mode,
            "do_normalize_chandras": self.do_normalize_chandras,
            "remove_nuktas": self.remove_nuktas,
            "do_normalize_vowel_ending": self.do_normalize_vowel_ending,
        }

        ## for languages with common parameters
        for lang in ["hi", "mr", "sa", "kK", "ne", "sd", "bn", "gu", "ta", "te", "kn"]:
            self.normalizer_map[lang] = normalizer_factory.get_normalizer(
                lang, **shared_options
            )

        ## for languages with language specific parameters
        language_specific_options = {
            "pa": {
                "do_canonicalize_addak": True,
                "do_canonicalize_tippi": True,
                "do_replace_vowel_bases": True,
            },
            "or": {"do_remap_wa": True},
            "as": {"do_remap_assamese_chars": True},
            "ml": {
                "do_canonicalize_chillus": True,
                "do_correct_geminated_T": True,
            },
        }
        for lang, extra_options in language_specific_options.items():
            self.normalizer_map[lang] = normalizer_factory.get_normalizer(
                lang, **shared_options, **extra_options
            )

    def transform(self, text, lang):
        """Normalize ``text`` and transliterate it from ``lang`` to ``common_lang``.

        Args:
            text: the text to convert.
            lang: language code of ``text``.

        Returns:
            The normalized, transliterated text.

        Raises:
            KeyError: if no normalizer was registered for ``lang``.
        """
        text = self.normalizer_map[lang].normalize(text)
        text = unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            text, lang, self.common_lang
        )
        return text


class BasicScriptUnifier:
    """Normalize text with default normalizer options, then transliterate it.

    Only ``nasals_mode`` is passed to the normalizer factory; languages
    without a registered normalizer are transliterated without normalization.
    """

    def __init__(self, common_lang="hi", nasals_mode="do_nothing"):
        """Build the normalizers.

        Args:
            common_lang: language code whose script all text is
                transliterated into.
            nasals_mode: value forwarded as ``nasals_mode`` to every
                normalizer created by the factory.
        """
        self.common_lang = common_lang
        self.nasals_mode = nasals_mode
        self.normalizer_map = {}
        self._init_normalizers()

    def _init_normalizers(self):
        """Populate ``self.normalizer_map`` with one normalizer per language."""
        normalizer_factory = indic_normalize.IndicNormalizerFactory()
        for lang in [
            "hi",
            "mr",
            "sa",
            "kK",
            "ne",
            "sd",
            "bn",
            "gu",
            "ta",
            "te",
            "kn",
            "pa",
            "or",
            "as",
            "ml",
        ]:
            self.normalizer_map[lang] = normalizer_factory.get_normalizer(
                lang, nasals_mode=self.nasals_mode
            )

    def transform(self, text, lang):
        """Transliterate ``text`` from ``lang`` to ``common_lang``.

        The text is normalized first when a normalizer exists for ``lang``;
        otherwise it is transliterated as-is.

        Args:
            text: the text to convert.
            lang: language code of ``text``.

        Returns:
            The transliterated text.
        """
        if lang in self.normalizer_map:
            text = self.normalizer_map[lang].normalize(text)
        text = unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            text, lang, self.common_lang
        )
        return text


class NaiveScriptUnifier:
    """Transliterate text to ``common_lang`` without any normalization."""

    def __init__(self, common_lang="hi"):
        """Store the language code whose script text is transliterated into."""
        self.common_lang = common_lang

    def transform(self, text, lang):
        """Return ``text`` transliterated from ``lang`` to ``common_lang``."""
        text = unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            text, lang, self.common_lang
        )
        return text


# Command-line usage text; the placeholders mirror sys.argv[1] .. sys.argv[4].
_USAGE = "Usage: python script_unifier <command> <infile> <outfile> <language>"

# Maps each command-line command to a zero-argument callable that builds the
# unifier for it. "moderate" is the aggressive unifier with nasal handling
# switched off.
_UNIFIER_FACTORIES = {
    "aggressive": lambda: AggressiveScriptUnifier(nasals_mode="to_nasal_consonants"),
    "moderate": lambda: AggressiveScriptUnifier(nasals_mode="do_nothing"),
    "basic": BasicScriptUnifier,
    "naive": NaiveScriptUnifier,
}


def _main(argv):
    """Run the command-line tool.

    Reads the input file line by line, strips each line, transforms it with
    the unifier selected by the command and writes the result followed by a
    newline to the output file.

    Args:
        argv: full argument vector: program name, command, input path,
            output path, language code.

    Exits with status 1 (after printing the usage text) when too few
    arguments are given or the command is not recognised.
    """
    loader.load()

    if len(argv) <= 4:
        print(_USAGE)
        sys.exit(1)

    command, in_path, out_path, language = argv[1:5]

    factory = _UNIFIER_FACTORIES.get(command)
    if factory is None:
        # Previously an unknown command was silently ignored; report it
        # before any file is opened so the output file is not touched.
        print(_USAGE)
        print("<command> must be one of: " + ", ".join(_UNIFIER_FACTORIES))
        sys.exit(1)

    unifier = factory()
    with open(in_path, "r", encoding="utf-8") as ifile, open(
        out_path, "w", encoding="utf-8"
    ) as ofile:
        # Iterate the file object directly so the input is streamed rather
        # than loaded into memory in one go.
        for line in ifile:
            ofile.write(unifier.transform(line.strip(), language) + "\n")


if __name__ == "__main__":
    _main(sys.argv)