Yehor's picture
Init
ea6a7ed
raw
history blame contribute delete
1.84 kB
import re
# Spoken name of each letter, written as a space-separated ARPAbet phoneme
# string.  Used to spell out acronyms letter by letter.
_letter_to_arpabet = {
    "A": "EY1",
    "B": "B IY1",
    "C": "S IY1",
    "D": "D IY1",
    "E": "IY1",
    "F": "EH1 F",
    "G": "JH IY1",
    "H": "EY1 CH",
    "I": "AY1",
    "J": "JH EY1",
    "K": "K EY1",
    "L": "EH1 L",
    "M": "EH1 M",
    "N": "EH1 N",
    "O": "OW1",
    "P": "P IY1",
    "Q": "K Y UW1",
    "R": "AA1 R",
    "S": "EH1 S",
    "T": "T IY1",
    "U": "Y UW1",
    "V": "V IY1",
    "X": "EH1 K S",
    "Y": "W AY1",
    "W": "D AH1 B AH0 L Y UW0",
    "Z": "Z IY1",
    # Lower-case "s" is the plural suffix of an acronym ("CPUs"), voiced as Z.
    "s": "Z",
}

# Matches a run of two or more upper-case ASCII letters, optionally followed
# by a lower-case plural "s".
# TODO: roman numerals (e.g. "XIV") should be ignored, but this pattern does
# not exclude them.
# A broader variant that also accepts dotted forms such as "U.S.A." was
# considered but is currently disabled:
#   re.compile(r'([A-Z][A-Z]+)s?|([A-Z]\.([A-Z]\.)+s?)')
_acronym_re = re.compile(r"([A-Z][A-Z]+)s?")
class AcronymNormalizer(object):
    """Replace upper-case acronyms in text with ARPAbet phoneme groups.

    Every match of the module-level ``_acronym_re`` pattern is looked up in
    the supplied pronunciation dictionary.  Unknown acronyms are spelled out
    letter by letter using ``_letter_to_arpabet``; phoneme groups are wrapped
    in curly braces, e.g. ``{EY1} {B IY1}``.
    """

    def __init__(self, phoneme_dict):
        """Store the pronunciation dictionary used for lookups.

        Args:
            phoneme_dict: Object exposing ``lookup(word)``.  The method is
                expected to return ``None`` for an unknown word, otherwise a
                sized, indexable collection of pronunciations (presumably
                ARPAbet strings -- confirm against the dictionary class).
        """
        self.phoneme_dict = phoneme_dict

    @staticmethod
    def _spell_out(acronym):
        """Return *acronym* spelled letter by letter as braced phoneme groups."""
        phones = [_letter_to_arpabet[letter] for letter in acronym]
        # Temporary fix carried over from the original code: a trailing
        # plural "s" (phoneme "Z") is folded into the previous letter's
        # group so it is not rendered as a separate "{Z}" token.
        if len(phones) > 1 and phones[-1] == "Z":
            plural = phones.pop()
            phones[-1] = phones[-1] + " " + plural
        return " ".join("{" + phone + "}" for phone in phones)

    def normalize_acronyms(self, text):
        """Return *text* with every acronym match expanded.

        For each match:
          * dictionary miss (``lookup`` returns ``None``): the acronym is
            spelled out letter by letter;
          * exactly one pronunciation: that pronunciation, wrapped in braces;
          * any other result (zero or several pronunciations): the acronym
            text is kept, with dots and whitespace removed.

        Args:
            text: Input string to normalize.

        Returns:
            The normalized string.
        """

        def _expand_acronyms(m):
            # Drop dots and any whitespace so "U.S.A." style input (should the
            # dotted pattern be re-enabled) is looked up as "USA".  A plain
            # str.replace avoids the invalid "\." escape in a non-raw string.
            acronym = "".join(m.group(0).replace(".", "").split())

            pronunciations = self.phoneme_dict.lookup(acronym)
            if pronunciations is None:
                return self._spell_out(acronym)
            if len(pronunciations) == 1:
                return "{" + pronunciations[0] + "}"
            # Ambiguous (or empty) dictionary entry: leave the word as text.
            return acronym

        return _acronym_re.sub(_expand_acronyms, text)

    def __call__(self, text):
        """Shorthand for :meth:`normalize_acronyms`."""
        return self.normalize_acronyms(text)