Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Program to transliterate acronyms from Latin script to Indic languages
#
# @author Anoop Kunchukuttan
#
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator | |
import random | |
class LatinToIndicAcronymTransliterator(object):
    """Transliterate Latin-script acronyms into Indic scripts, letter by letter.

    Each lowercase Latin letter is first replaced by a Devanagari string
    (the spelled-out letter, e.g. ``"b"`` -> ``"बी"``); the Devanagari text is
    then handed to ``UnicodeIndicTransliterator`` with ``"hi"`` as the source
    language to reach the requested target language.

    All functionality is exposed as static methods; no instance state is used.
    """

    # Translation table (as produced by ``str.maketrans``) from each lowercase
    # Latin letter to its Devanagari replacement string.
    LATIN_TO_DEVANAGARI_TRANSTABLE = str.maketrans(
        {
            "a": "ए",
            "b": "बी",
            "c": "सी",
            "d": "डी",
            "e": "ई",
            "f": "एफ",
            "g": "जी",
            "h": "एच",
            "i": "आई",
            "j": "जे",
            "k": "के",
            "l": "एल",
            "m": "एम",
            "n": "एन",
            "o": "ओ",
            "p": "पी",
            "q": "क्यू",
            "r": "आर",
            "s": "एस",
            "t": "टी",
            "u": "यू",
            "v": "वी",
            "w": "डब्ल्यू",
            "x": "एक्स",
            "y": "वाय",
            "z": "जेड",
        }
    )

    # The 26 lowercase Latin letters, in order; used when sampling acronyms.
    LATIN_ALPHABET = list("abcdefghijklmnopqrstuvwxyz")

    @staticmethod
    def get_transtable():
        """Return the shared Latin-letter -> Devanagari translation table."""
        return LatinToIndicAcronymTransliterator.LATIN_TO_DEVANAGARI_TRANSTABLE

    @staticmethod
    def transliterate(w, lang):
        """Transliterate the Latin acronym ``w`` into the script of ``lang``.

        ``w`` is lowercased, each Latin letter is replaced via
        ``LATIN_TO_DEVANAGARI_TRANSTABLE`` (characters without an entry are
        left unchanged by ``str.translate``), and the result is passed to
        ``UnicodeIndicTransliterator.transliterate`` with source ``"hi"`` and
        target ``lang``.

        ``lang`` is presumably a language code understood by
        ``UnicodeIndicTransliterator`` -- confirm against that class.
        """
        devanagari = w.lower().translate(
            LatinToIndicAcronymTransliterator.LATIN_TO_DEVANAGARI_TRANSTABLE
        )
        return UnicodeIndicTransliterator.transliterate(devanagari, "hi", lang)

    @staticmethod
    def generate_latin_acronyms(num_acronyms, min_len=2, max_len=6, strategy="random"):
        """Generate ``num_acronyms`` Latin acronyms in lower case.

        With ``strategy="random"`` (the only supported strategy) each acronym
        gets a length drawn uniformly from ``[min_len, max_len]`` (inclusive)
        and letters drawn independently, with replacement, from
        ``LATIN_ALPHABET``.

        Returns a list of strings.

        Raises ``ValueError`` if ``strategy`` is not supported (previously this
        silently produced a list of ``None``), or -- from ``random.randint`` --
        if ``min_len > max_len``.
        """
        if strategy != "random":
            raise ValueError("unsupported acronym generation strategy: %r" % (strategy,))

        alphabet = LatinToIndicAcronymTransliterator.LATIN_ALPHABET
        acronyms = []
        for _ in range(num_acronyms):
            # Draw the length first, then the letters, for each acronym.
            length = random.randint(min_len, max_len)
            acronyms.append("".join(random.choices(alphabet, k=length)))
        return acronyms