|
|
|
|
|
|
|
|
|
|
|
|
|
from Unified_parser.ply.lex import Lexer |
|
from Unified_parser.ply.yacc import yacc |
|
from Unified_parser.globals import * |
|
from Unified_parser.helpers import * |
|
import sys |
|
from sys import exit |
|
from Unified_parser.get_phone_mapped_python import * |
|
|
|
|
|
# Terminal symbol names referenced by the grammar rules in the p_* functions
# below. Order is kept exactly as originally declared.
tokens = (
    'kaki_c',
    'conjsyll2_c',
    'fullvowel_b',
    'kaki_a',
    'kaki_b',
    'conjsyll2_b',
    'conjsyll2_a',
    'conjsyll1',
    'nukchan_b',
    'nukchan_a',
    'yarule',
    'fullvowel_a',
    'vowel',
)
|
|
|
|
|
|
|
def p_sentence(p):
    '''
    sentence : words
    '''
    # The docstring above is the grammar rule for this action, so it must
    # stay as-is; explanatory text lives in comments only.
    state = p.parser.g

    if state.flags.parseLevel != 0:
        # Any pass after the first stores its result as the phonified word.
        state.words.phonifiedWord = p[1]
        return

    # First pass (parseLevel == 0): keep the parsed text as the syllabified
    # output, hand it to rec_replace when it contains '&&', then bump the
    # parse level so the next parse takes the other branch.
    state.words.syllabifiedWordOut = p[1]
    if '&&' in state.words.syllabifiedWordOut:
        state.words.syllabifiedWordOut = rec_replace(
            state.words.syllabifiedWordOut, '&&', '&')
    state.flags.parseLevel += 1
|
|
|
def p_words_syltoken(p):
    '''
    words : syltoken
    '''
    # A word list consisting of one syllable token: forward its value.
    # (The docstring is the grammar rule and must not be reworded.)
    syllable = p[1]
    if p.parser.g.flags.DEBUG:
        print(f"Syll:\t{syllable}")
    p[0] = syllable
|
|
|
def p_words_wordsandsyltoken(p):
    '''
    words : words syltoken
    '''
    # Extend the text accumulated so far with the next syllable token.
    # (The docstring is the grammar rule and must not be reworded.)
    accumulated, syllable = p[1], p[2]
    if p.parser.g.flags.DEBUG:
        print(f"Syll:\t{syllable}")
    p[0] = accumulated + syllable
|
|
|
def p_syltoken(p):
    '''
    syltoken : fullvowel_b
             | fullvowel_a
             | conjsyll2_c
             | conjsyll2_b
             | conjsyll2_a
             | conjsyll1
             | nukchan_b
             | nukchan_a
             | yarule
             | vowel
    '''
    # Every alternative above is a single terminal, so the action simply
    # forwards that terminal's value as the value of `syltoken`.
    # (The docstring is the grammar rule and must not be reworded.)
    token_value = p[1]
    p[0] = token_value
|
|
|
def p_syltoken1(p):
    '''
    syltoken :
             | kaki_c
             | kaki_a
             | kaki_b
    '''
    # NOTE(review): the first alternative in the rule above is empty. For an
    # empty right-hand side there would be no p[1] to read here - confirm
    # whether the empty production is intentional.
    # (The docstring is the grammar rule and must not be reworded.)
    token_value = p[1]
    if p.parser.g.flags.DEBUG:
        print(f'kaki : {token_value}')
    p[0] = token_value
|
|
|
def p_error(p):
    # Error hook for the grammar (p_error naming convention of the yacc
    # module imported at the top of the file). The offending token `p` is
    # not inspected: report the failure and terminate with exit status 1.
    print('parse error')
    raise SystemExit(1)
|
|
|
|
|
def printHelp():
    """Print the command-line usage instructions to stdout.

    The usage line lists all five positional arguments that the script
    entry point requires (it rejects any other argument count).
    """
    help_lines = (
        "UnifiedParser - Usage Instructions",
        # The original usage line omitted the mandatory `language` argument.
        "Run python3 parser.py wd lsflag wfflag clearflag language",
        "wd - word to parse in unicode.",
        "lsflag - always 0. we are not using this.",
        "wfflag - 0 for Monophone parsing, 1 for syllable parsing, 2 for Akshara Parsing",
        "clearflag - 0 for normal syntax, 1 for removing the lisp like format of output "
        "and to just produce space separated output. 2 for cls labelled output",
        "language - name of language in quotes",
    )
    for line in help_lines:
        print(line)
|
|
|
def wordparse(wd: str, lsflag: int, wfflag: int, clearflag: int, language_main: str):
    """Parse one word and return it in the form selected by ``clearflag``.

    Args:
        wd: Word to parse. Leading/trailing space characters are removed.
        lsflag: Must be 0 or 1; stored as ``g.flags.LangSpecificCorrectionFlag``
            and later passed to ``LangSpecificCorrection``.
        wfflag: Must be 0, 1 or 2; stored as ``g.flags.writeFormat``.
        clearflag: Output selector.
            0 - return ``g.answer`` as it is after ``WritetoFiles(g)``.
            1 - return the double-quoted fields of ``g.words.outputText``
                joined by single spaces.
            2 - walk every piece of ``g.words.outputText`` split on ``"``
                (from the second piece on); pieces whose first character has
                a code point in [65, 128) are kept - passed through
                ``TextReplacer.apply_replacements_by_phonems`` when longer
                than one character, verbatim otherwise - and concatenated.
        language_main: Language name. It is lower-cased here but not used
            any further inside this function.

    Returns:
        ``0`` when ``SetlanguageFeat`` returns 0 or ``CheckDictionary``
        returns a non-zero value; otherwise the string described under
        ``clearflag``; ``None`` for any other ``clearflag`` value.

    Raises:
        SystemExit: status 1 when ``lsflag`` or ``wfflag`` is out of range
            (after printing "Invalid input").
    """
    g = GLOBALS()
    lexer = Lexer()
    parser = yacc()
    parser.g = g  # grammar actions read the shared state via p.parser.g
    g.flags.DEBUG = False
    wd = wd.strip(' ')

    if lsflag not in (0, 1) or wfflag not in (0, 1, 2):
        print("Invalid input")
        exit(1)

    g.flags.LangSpecificCorrectionFlag = lsflag

    g.flags.writeFormat = wfflag
    # NOTE(review): this branch cannot be reached as written - wfflag == 4 is
    # rejected by the validation above. Kept unchanged; confirm whether 4 was
    # meant to be an accepted value.
    if wfflag == 4:
        g.flags.writeFormat = 1
        g.flags.syllTagFlag = 1

    word = wd
    if g.flags.DEBUG:
        print(f'Word : {word}')

    word = RemoveUnwanted(word)
    if g.flags.DEBUG:
        print(f'Cleared Word : {word}')

    if SetlanguageFeat(g, word) == 0:
        return 0

    if CheckDictionary(g, word) != 0:
        return 0

    if g.flags.DEBUG:
        print(f'langId : {g.langId}')

    word = ConvertToSymbols(g, word)

    if g.flags.DEBUG:
        print(f"Symbols code : {g.words.unicodeWord}")
        print(f"Symbols syllables : {g.words.syllabifiedWord}")

    language_main = language_main.lower()

    # The parse stores its result on g.words (see p_sentence).
    parser.parse(g.words.syllabifiedWord, lexer=lexer)
    if g.flags.DEBUG:
        print(f"Syllabified Word : {g.words.syllabifiedWordOut}")
    g.words.syllabifiedWordOut = rec_replace(g.words.syllabifiedWordOut, '&#&', '&') + '&'
    if g.flags.DEBUG:
        print(f"Syllabified Word out : {g.words.syllabifiedWordOut}")
    g.words.syllabifiedWordOut = LangSpecificCorrection(
        g, g.words.syllabifiedWordOut, g.flags.LangSpecificCorrectionFlag)
    if g.flags.DEBUG:
        print(f"Syllabified Word langCorr : {g.words.syllabifiedWordOut}")

    g.words.syllabifiedWordOut = CleanseWord(g.words.syllabifiedWordOut)
    if g.flags.DEBUG:
        print(f"Syllabified Word memCorr : {g.words.syllabifiedWordOut}")

    if not g.isSouth:
        if g.flags.DEBUG:
            print('NOT SOUTH')

        # Number of '&' separators in the current output.
        count = g.words.syllabifiedWordOut.count('&')

        # Decide after how many '&' separators the word is cut in two.
        # NOTE(review): the `else` is paired with `count > 2` here; the
        # original indentation was not available - confirm against upstream.
        splitPosition = 2
        if GetPhoneType(g, g.words.syllabifiedWordOut, 1) == 1:
            if count > 2:
                tpe = GetPhoneType(g, g.words.syllabifiedWordOut, 2)
                if tpe == 2:
                    splitPosition = 1
                elif tpe == 3:
                    splitPosition = 3
            else:
                splitPosition = 1

        # Find the index of the '&' that pushes the running tally above
        # splitPosition. If there is no such '&', `count` keeps the tally
        # itself and that number is used as the cut index (original
        # behaviour, preserved).
        count = 0
        for index, char in enumerate(g.words.syllabifiedWordOut):
            if char == '&':
                count += 1
            if count > splitPosition:
                count = index
                break

        start = g.words.syllabifiedWordOut[:count]
        end = g.words.syllabifiedWordOut[count:]
        if g.flags.DEBUG:
            print(f"posi {count} {start} {end}")

        # Only the tail after the cut goes through SchwaSpecificCorrection.
        end = SchwaSpecificCorrection(g, end)
        if g.flags.DEBUG:
            print(f"prefinal : {g.words.syllabifiedWordOut}")
        g.words.syllabifiedWordOut = start + end
        if g.flags.DEBUG:
            print(f"prefinal1 : {g.words.syllabifiedWordOut}")
        g.words.syllabifiedWordOut = CleanseWord(g.words.syllabifiedWordOut)
        if g.flags.DEBUG:
            print(f"final : {g.words.syllabifiedWordOut}")
        g.words.syllabifiedWordOut = SchwaDoubleConsonent(g.words.syllabifiedWordOut)
        if g.flags.DEBUG:
            print(f"final0 : {g.words.syllabifiedWordOut}")

    g.words.syllabifiedWordOut = GeminateCorrection(g.words.syllabifiedWordOut, 0)
    g.words.syllabifiedWordOut = MiddleVowel(g, g.words.syllabifiedWordOut)
    g.words.syllabifiedWordOut = Syllabilfy(g.words.syllabifiedWordOut)

    SplitSyllables(g, g.words.syllabifiedWordOut)
    if g.flags.DEBUG:
        print("temp_list_1", g.syllableList)

    WritetoFiles(g)

    if clearflag == 0:
        return g.answer

    if clearflag == 1:
        # Splitting on '"' leaves the quoted fields at the odd indices.
        quoted_fields = g.words.outputText.split('"')[1::2]
        # The original called g.answer.strip() and discarded the result, so
        # the trailing separator was never removed; keep the stripped value.
        g.answer = ' '.join(quoted_fields).strip()
        return g.answer

    if clearflag == 2:
        text_replacer = TextReplacer()
        kept = []
        for piece in g.words.outputText.split('"')[1:]:
            # Keep only pieces starting with a code point in [65, 128).
            if not piece or not 65 <= ord(piece[0]) < 128:
                continue
            if len(piece) > 1:
                kept.append(text_replacer.apply_replacements_by_phonems(piece))
            else:
                kept.append(piece)
        # Same fix as above: actually keep the stripped string.
        g.answer = ''.join(kept).strip()
        return g.answer

    # Any other clearflag value yields no result.
    return None
|
|
|
|
|
def safe_word_parse(wd: str, lsflag: int, wfflag: int, clearflag: int, language: str):
    """Run ``wordparse`` without letting a failure escape.

    Args:
        wd, lsflag, wfflag, clearflag, language: forwarded unchanged to
            ``wordparse``.

    Returns:
        A ``(wd, ans)`` tuple where ``ans`` is the ``wordparse`` result, or
        the string ``'FAILED'`` when the call raised.
    """
    try:
        ans = wordparse(wd, lsflag, wfflag, clearflag, language)
    except (Exception, SystemExit):
        # SystemExit is caught on purpose: wordparse and the grammar's error
        # hook report failures through exit(1). Unlike the previous bare
        # `except:`, KeyboardInterrupt is no longer swallowed, so a batch run
        # can still be interrupted by the user.
        print(f'failed on {wd}')
        ans = 'FAILED'
    return wd, ans
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: exactly five arguments are expected after the
    # script name (word, lsflag, wfflag, clearflag, language).
    if len(sys.argv) != 6:
        printHelp()
        exit(-1)

    _script, word_arg, ls_arg, wf_arg, clear_arg, lang_arg = sys.argv
    # The three flags are parsed as integers; the language stays a string.
    ans = wordparse(word_arg, int(ls_arg), int(wf_arg), int(clear_arg), str(lang_arg))
    print(ans)
|
|