Spaces:
Sleeping
Sleeping
import re | |
# Spoken name of each uppercase ASCII letter, written as space-separated
# ARPAbet phonemes.  Used to spell out acronyms letter by letter.
_letter_to_arpabet = {
    "A": "EY1",
    "B": "B IY1",
    "C": "S IY1",
    "D": "D IY1",
    "E": "IY1",
    "F": "EH1 F",
    "G": "JH IY1",
    "H": "EY1 CH",
    "I": "AY1",
    "J": "JH EY1",
    "K": "K EY1",
    "L": "EH1 L",
    "M": "EH1 M",
    "N": "EH1 N",
    "O": "OW1",
    "P": "P IY1",
    "Q": "K Y UW1",
    "R": "AA1 R",
    "S": "EH1 S",
    "T": "T IY1",
    "U": "Y UW1",
    "V": "V IY1",
    "W": "D AH1 B AH0 L Y UW0",
    "X": "EH1 K S",
    "Y": "W AY1",
    "Z": "Z IY1",
    # Lowercase "s" is the plural suffix the acronym pattern may capture
    # after the uppercase run (e.g. "CDs"); it is voiced as a bare "Z".
    "s": "Z",
}

# Matches a run of two or more uppercase ASCII letters, optionally followed
# by a lowercase plural "s".
# TODO: must ignore roman numerals -- the current pattern still matches them
# (e.g. "XIV").
# Earlier variant that also accepted dotted forms such as "U.S.A.":
# _acronym_re = re.compile(r'([A-Z][A-Z]+)s?|([A-Z]\.([A-Z]\.)+s?)')
_acronym_re = re.compile(r"([A-Z][A-Z]+)s?")
class AcronymNormalizer(object):
    """Replace acronyms in text with brace-wrapped ARPAbet phoneme groups.

    Every match of the module-level ``_acronym_re`` pattern is looked up in
    the supplied phoneme dictionary and rewritten as follows:

    * unknown word: spelled out letter by letter via ``_letter_to_arpabet``,
      one ``{...}`` group per letter;
    * exactly one pronunciation: that pronunciation wrapped in ``{...}``;
    * any other lookup result: the acronym text is left as it is.
    """

    def __init__(self, phoneme_dict):
        """Store the pronunciation dictionary.

        Args:
            phoneme_dict: Object exposing ``lookup(word)``.  A ``None``
                result means the word is unknown; any other result must
                support ``len()`` and indexing with ``[0]``.
        """
        self.phoneme_dict = phoneme_dict

    def normalize_acronyms(self, text):
        """Return ``text`` with every matched acronym expanded.

        Args:
            text: Input string to scan.

        Returns:
            The string with each acronym match replaced as described in the
            class docstring.

        Raises:
            KeyError: If an unknown acronym contains a character that has no
                entry in ``_letter_to_arpabet``.
        """

        def _expand_acronym(match):
            # Strip dots and whitespace before the lookup.  str.replace is
            # used instead of re.sub("\.", ...), whose non-raw pattern is an
            # invalid escape sequence on recent Python versions.
            acronym = "".join(match.group(0).replace(".", "").split())
            pronunciations = self.phoneme_dict.lookup(acronym)

            if pronunciations is None:
                spelled = [
                    "{" + _letter_to_arpabet[letter] + "}" for letter in acronym
                ]
                # temporary fix: fold a trailing plural "{Z}" into the
                # previous letter's group, e.g. "{D IY1} {Z}" -> "{D IY1 Z}".
                # Length is checked first so an empty list cannot be indexed.
                if len(spelled) > 1 and spelled[-1] == "{Z}":
                    spelled[-2:] = [spelled[-2][:-1] + " " + spelled[-1][1:]]
                return " ".join(spelled)

            if len(pronunciations) == 1:
                return "{" + pronunciations[0] + "}"

            # Zero or several pronunciations: keep the acronym text as is.
            return acronym

        return re.sub(_acronym_re, _expand_acronym, text)

    def __call__(self, text):
        """Shorthand for :meth:`normalize_acronyms`."""
        return self.normalize_acronyms(text)