utkarsh2299's picture
Upload 97 files
2c8dc05 verified
raw
history blame
8.46 kB
# import sys, os
# SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# sys.path.append(SCRIPT_DIR)
# combined lexical analyzer and parser
from Unified_parser.ply.lex import Lexer
from Unified_parser.ply.yacc import yacc
from Unified_parser.globals import *
from Unified_parser.helpers import *
import sys
from sys import exit
from Unified_parser.get_phone_mapped_python import *
# tokens used
tokens = ('kaki_c', 'conjsyll2_c', 'fullvowel_b', 'kaki_a', 'kaki_b', 'conjsyll2_b', 'conjsyll2_a',
'conjsyll1', 'nukchan_b','nukchan_a', 'yarule', 'fullvowel_a', 'vowel')
# parser part
def p_sentence(p):
'''
sentence : words
'''
if p.parser.g.flags.parseLevel == 0:
p.parser.g.words.syllabifiedWordOut = p[1]
if p.parser.g.words.syllabifiedWordOut.find('&&') != -1:
p.parser.g.words.syllabifiedWordOut = rec_replace(p.parser.g.words.syllabifiedWordOut,'&&','&')
p.parser.g.flags.parseLevel += 1
else:
p.parser.g.words.phonifiedWord = p[1]
def p_words_syltoken(p):
'''
words : syltoken
'''
if(p.parser.g.flags.DEBUG):
print(f"Syll:\t{p[1]}")
p[0] = p[1]
def p_words_wordsandsyltoken(p):
'''
words : words syltoken
'''
if(p.parser.g.flags.DEBUG):
print(f"Syll:\t{p[2]}")
p[0] = p[1] + p[2]
def p_syltoken(p):
'''
syltoken : fullvowel_b
| fullvowel_a
| conjsyll2_c
| conjsyll2_b
| conjsyll2_a
| conjsyll1
| nukchan_b
| nukchan_a
| yarule
| vowel
'''
p[0] = p[1]
def p_syltoken1(p):
'''
syltoken :
| kaki_c
| kaki_a
| kaki_b
'''
if (p.parser.g.flags.DEBUG):
print(f'kaki : {p[1]}')
p[0] = p[1]
def p_error(p):
print('parse error')
exit(1)
# print the help of syntax
def printHelp():
print("UnifiedParser - Usage Instructions")
print("Run python3 parser.py wd lsflag wfflag clearflag")
print("wd - word to parse in unicode.")
print("lsflag - always 0. we are not using this.")
print("wfflag - 0 for Monophone parsing, 1 for syllable parsing, 2 for Akshara Parsing")
print("clearflag -0. for normal syntx, 1 for removing the lisp like format of output and to just produce space separated output. 2 for cls labled output")
print("language - name of language in quationes")
def wordparse(wd : str, lsflag : int, wfflag : int, clearflag : int,language_main :str):
g = GLOBALS()
lexer = Lexer()
parser = yacc()
parser.g = g
g.flags.DEBUG = False
wd = wd.strip('  ') # hidden characters
if lsflag not in [0,1] or wfflag not in [0,1,2]:
print("Invalid input")
exit(1)
g.flags.LangSpecificCorrectionFlag = lsflag
g.flags.writeFormat = wfflag
if wfflag == 4:
g.flags.writeFormat = 1
g.flags.syllTagFlag = 1
word = wd
if g.flags.DEBUG:
print(f'Word : {word}')
word = RemoveUnwanted(word)
if g.flags.DEBUG:
print(f'Cleared Word : {word}')
if SetlanguageFeat(g, word) == 0:
return 0
if CheckDictionary(g, word) != 0:
return 0
if g.flags.DEBUG:
print(f'langId : {g.langId}')
word = ConvertToSymbols(g, word)
if g.flags.DEBUG:
print(f"Symbols code : {g.words.unicodeWord}")
print(f"Symbols syllables : {g.words.syllabifiedWord}")
language_main=language_main.lower()
# if language_main == "telugu":
# g.currLang = g.TELUGU
# g.langId = 3
# if(g.langId < 5):
# g.isSouth = 1
# print("language:",language_main)
# if language_main == "marathi":
# print("inside main")
# g.currLang = g.KANNADA
# g.langId = 4
# if(g.langId < 5):
# g.isSouth = 1
# print("language:",language_main)
parser.parse(g.words.syllabifiedWord, lexer=lexer)
if(g.flags.DEBUG):
print(f"Syllabified Word : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = rec_replace(g.words.syllabifiedWordOut, '&#&','&') + '&'
if(g.flags.DEBUG):
print(f"Syllabified Word out : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = LangSpecificCorrection(g, g.words.syllabifiedWordOut, g.flags.LangSpecificCorrectionFlag)
if(g.flags.DEBUG):
print(f"Syllabified Word langCorr : {g.words.syllabifiedWordOut}")
# if(g.flags.DEBUG):
# print(f"Syllabified Word gemCorr : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = CleanseWord(g.words.syllabifiedWordOut)
if(g.flags.DEBUG):
print(f"Syllabified Word memCorr : {g.words.syllabifiedWordOut}")
if not g.isSouth:
if g.flags.DEBUG:
print('NOT SOUTH')
count = 0
for i in range(len(g.words.syllabifiedWordOut)):
if g.words.syllabifiedWordOut[i] == '&':
count += 1
splitPosition = 2
if GetPhoneType(g, g.words.syllabifiedWordOut, 1) == 1:
if count > 2:
tpe = GetPhoneType(g, g.words.syllabifiedWordOut, 2)
if tpe == 2:
splitPosition = 1
elif tpe == 3:
splitPosition = 3
else:
splitPosition = 1
count = 0
for i in range(len(g.words.syllabifiedWordOut)):
if g.words.syllabifiedWordOut[i] == '&':
count += 1
if count > splitPosition:
count = i
break
start, end = g.words.syllabifiedWordOut, g.words.syllabifiedWordOut
end = end[count:]
start = start[:count]
if(g.flags.DEBUG):
print(f"posi {count} {start} {end}")
end = SchwaSpecificCorrection(g, end)
if(g.flags.DEBUG):
print(f"prefinal : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = start + end
if(g.flags.DEBUG):
print(f"prefinal1 : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = CleanseWord(g.words.syllabifiedWordOut)
if(g.flags.DEBUG):
print(f"final : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = SchwaDoubleConsonent(g.words.syllabifiedWordOut)
if(g.flags.DEBUG):
print(f"final0 : {g.words.syllabifiedWordOut}")
g.words.syllabifiedWordOut = GeminateCorrection(g.words.syllabifiedWordOut, 0)
g.words.syllabifiedWordOut = MiddleVowel(g, g.words.syllabifiedWordOut)
g.words.syllabifiedWordOut = Syllabilfy(g.words.syllabifiedWordOut)
SplitSyllables(g,g.words.syllabifiedWordOut)
temp_list=g.syllableList
if g.flags.DEBUG:
print("temp_list_1",temp_list)
# print("temp_list_1",temp_list)
# for i in range(len(temp_list)):
# print("output_a", temp_list[i] )
# temp_list[i]=convert_to_main_lang (g,temp_list[i], language_main)
# print("output_b", temp_list[i] )
WritetoFiles(g)
if clearflag == 0:
return g.answer
if clearflag == 1:
t = g.words.outputText
t = t.split('"')
ln = len(t)
i = 1
g.answer = ''
while i < ln:
g.answer += t[i] + ' '
i += 2
g.answer.strip()
return g.answer
if clearflag == 2:
t = g.words.outputText
t = t.split('"')
# print(t)
ln = len(t)
i = 1
g.answer = ''
text_replacer=TextReplacer()
while i < ln:
if len(t[i])>1 and ord(t[i][0])<128 and ord(t[i][0])>=65:
temp = text_replacer.apply_replacements_by_phonems(t[i])
g.answer += temp
elif len(t[i])==1 and ord(t[i][0])<128 and ord(t[i][0])>=65:
g.answer += t[i]
i += 1
g.answer.strip()
return g.answer
def safe_word_parse(wd : str, lsflag : int, wfflag : int, clearflag : int,language : str):
try:
ans = wordparse(wd, lsflag, wfflag, clearflag,language)
# print(ans)
except:
print(f'failed on {wd}')
ans = 'FAILED'
return wd, ans,
if __name__ == '__main__':
if (len(sys.argv) != 6):
printHelp()
exit(-1)
ans = wordparse(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]),str(sys.argv[5]))
print(ans)