|
|
|
|
|
import os
import re
|
|
|
# Symbols accepted inside a pronunciation string, in alphabetical order.
# Vowel symbols are listed bare and with a trailing 0/1/2 digit; consonant
# symbols are listed bare only. Kept as a list so the ordering stays available
# to any code that indexes or enumerates it.
VALID_SYMBOLS = [
    "AA", "AA0", "AA1", "AA2",
    "AE", "AE0", "AE1", "AE2",
    "AH", "AH0", "AH1", "AH2",
    "AO", "AO0", "AO1", "AO2",
    "AW", "AW0", "AW1", "AW2",
    "AY", "AY0", "AY1", "AY2",
    "B", "CH", "D", "DH",
    "EH", "EH0", "EH1", "EH2",
    "ER", "ER0", "ER1", "ER2",
    "EY", "EY0", "EY1", "EY2",
    "F", "G", "HH",
    "IH", "IH0", "IH1", "IH2",
    "IY", "IY0", "IY1", "IY2",
    "JH", "K", "L", "M", "N", "NG",
    "OW", "OW0", "OW1", "OW2",
    "OY", "OY0", "OY1", "OY2",
    "P", "R", "S", "SH", "T", "TH",
    "UH", "UH0", "UH1", "UH2",
    "UW", "UW0", "UW1", "UW2",
    "V", "W", "Y", "Z", "ZH",
]
|
|
|
|
|
class CMUDict:
    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict

    Entries are held in a dict that maps each word to the list of
    pronunciation strings parsed for it.
    """

    def __init__(self, file_or_path, keep_ambiguous=True):
        """Load the dictionary.

        Args:
            file_or_path: Either a filesystem path (``str`` or any
                ``os.PathLike`` such as ``pathlib.Path``), which is opened
                with ``latin-1`` encoding, or an already-open file / iterable
                of text lines, which is parsed directly.
            keep_ambiguous: When False, words that have more than one
                pronunciation are dropped entirely.
        """
        # Path-like objects must be opened here; previously only ``str`` was
        # recognised, so a ``pathlib.Path`` was wrongly treated as a file.
        if isinstance(file_or_path, (str, os.PathLike)):
            with open(file_or_path, encoding="latin-1") as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        """Return the number of distinct words stored."""
        return len(self._entries)

    def lookup(self, word):
        """Return the list of ARPAbet pronunciations of ``word``.

        The word is upper-cased before the lookup. Returns ``None`` when the
        word has no entry.
        """
        return self._entries.get(word.upper())

    @staticmethod
    def get_arpabet(word, cmudict, punctuation_symbols):
        """Return ``word`` with its first pronunciation wrapped in braces.

        At most one leading and one trailing character found in
        ``punctuation_symbols`` is peeled off, the remainder is looked up via
        ``cmudict.lookup``, and the punctuation is re-attached around the
        result. If the lookup yields nothing, the word is returned unchanged.

        Args:
            word: Token to convert; may be empty.
            cmudict: Object exposing ``lookup(word)``.
            punctuation_symbols: Container of single characters to treat as
                punctuation (tested with ``in``).
        """
        first_symbol, last_symbol = "", ""
        if word and word[0] in punctuation_symbols:
            first_symbol = word[0]
            word = word[1:]
        # Re-check ``word``: it may have become empty after the strip above.
        if word and word[-1] in punctuation_symbols:
            last_symbol = word[-1]
            word = word[:-1]
        arpabet = cmudict.lookup(word)
        # Truthiness check also covers an empty pronunciation list, which
        # would otherwise raise IndexError on ``arpabet[0]``.
        if arpabet:
            return first_symbol + "{%s}" % arpabet[0] + last_symbol
        return first_symbol + word + last_symbol
|
|
|
|
|
# Matches a parenthesised run of digits such as "(1)". _parse_cmudict strips
# it from the word field so that numbered variants of a word share one key.
_alt_re = re.compile(r"\([0-9]+\)")
|
|
|
|
|
def _parse_cmudict(file): |
|
cmudict = {} |
|
for line in file: |
|
if line and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): |
|
parts = line.split(" ") |
|
word = re.sub(_alt_re, "", parts[0]) |
|
pronunciation = _get_pronunciation(parts[1]) |
|
if pronunciation: |
|
if word in cmudict: |
|
cmudict[word].append(pronunciation) |
|
else: |
|
cmudict[word] = [pronunciation] |
|
return cmudict |
|
|
|
|
|
def _get_pronunciation(s):
    """Validate and normalise one pronunciation field.

    The field is stripped of surrounding whitespace and split on single
    spaces. Returns the symbols re-joined by single spaces, or ``None`` as
    soon as any piece is not listed in ``VALID_SYMBOLS``.
    """
    symbols = s.strip().split(" ")
    if any(symbol not in VALID_SYMBOLS for symbol in symbols):
        return None
    return " ".join(symbols)
|
|