Spaces:

Harveenchadha
/

oiTrans

Runtime error

oiTrans / indic_nlp_library /contrib /hindi_to_kannada_transliterator.py

harveen

Harveen | Adding code

74fc30d about 3 years ago

2.21 kB

	import sys
	from indicnlp import common
	common.set_resources_path(INDIC_NLP_RESOURCES)

	from indicnlp import loader
	from indicnlp.normalize import indic_normalize
	from indicnlp.transliterate import unicode_transliterate

	if __name__ == '__main__':
	"""
	This script transliterates Hindi to Kannada. It removes/remaps
	characters only found in Hindi. It also adds halanta to words ending
	with consonant - as is the convention in Kannada
	"""

	infname=sys.argv[1] # one sentence/word per line. Sentences should be space-tokenized
	outfname=sys.agv[2]
	loader.load()

	normalizer_factory=indic_normalize.IndicNormalizerFactory()
	normalizer=normalizer_factory.get_normalizer('hi')

	with open(infname,'r',encoding='utf-8') as infile, \
	open(outfname,'w',encoding='utf-8') as outfile:
	for line in infile:
	line=line.strip()
	line=normalizer.normalize(line)

	## replace chandrabindus with anusvara
	line=line.replace('\u0900','\u0902')
	line=line.replace('\u0901','\u0902')

	### replace chandra e and o diacritics with e and o respectively
	#line=line.replace('\u0945','\u0947')
	#line=line.replace('\u0949','\u094b')

	### replace chandra e and o diacritics with a diacritic
	## this seems to be general usage
	line=line.replace('\u0945','\u093e')
	line=line.replace('\u0949','\u093e')

	## remove nukta
	line=line.replace('\u093c','')

	## add halant if word ends with consonant
	#if isc.is_consonant(isc.get_phonetic_feature_vector(line[-1],'hi')):
	# line=line+'\u094d'
	words=line.split(' ')
	outwords=[]
	for word in line.split(' '):
	if isc.is_consonant(isc.get_phonetic_feature_vector(word[-1],'hi')):
	word=word+'\u094d'
	outwords.append(word)
	line=' '.join(outwords)


	## script conversion
	line=unicode_transliterate.UnicodeIndicTransliterator.transliterate(line,'hi','kn')

	outfile.write(line+'\n')