Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 10,009 Bytes
18c873f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 |
# Extended Grapheme to Phoneme conversion using CMU Dictionary and Heteronym parsing.
from __future__ import annotations
import re
from typing import Optional
import pywordsegment
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from .h2p import H2p
from .h2p import replace_first
from . import format_ph as ph
from .dict_reader import DictReader
from .text.numbers import normalize_numbers
from .filter import filter_text
from .processors import Processor
from copy import deepcopy
# Matches a parenthesised number such as "(1)" and captures the digits.
re_digit = re.compile(r"\((\d+)\)")
# Matches a parenthesised span (greedy), used to strip the "(n)" suffix from a word.
re_bracket_with_digit = re.compile(r"\(.*\)")

# Verify that the WordNet corpora used by the lemmatizer are present locally.
# If either lookup fails, both packages are (re)requested from NLTK.
try:
    for _resource in ('corpora/wordnet.zip', 'corpora/omw-1.4.zip'):
        nltk.data.find(_resource)
except LookupError:
    for _package in ('wordnet', 'omw-1.4'):
        nltk.download(_package)
class CMUDictExt:
    """Extended Grapheme to Phoneme conversion using the CMU Dictionary with Heteronym parsing."""

    # Accepted values for the ``unresolved_mode`` setting.
    _UNRESOLVED_MODES = ('keep', 'remove', 'drop')

    def __init__(self, cmu_dict_path: Optional[str] = None, h2p_dict_path: Optional[str] = None,
                 cmu_multi_mode: int = 0, process_numbers: bool = True, phoneme_brackets: bool = True,
                 unresolved_mode: str = 'keep'):
        # noinspection GrazieInspection
        """
        Initialize CMUDictExt - Extended Grapheme to Phoneme conversion using CMU Dictionary with Heteronym parsing.

        CMU multi-entry resolution modes:
            - -2 : Raw entry (i.e. 'A' resolves to 'AH0' and 'A(1)' to 'EY1')
            - -1 : Skip resolving any entry with multiple pronunciations.
            - 0 : Resolve using default un-numbered pronunciation.
            - 1 : Resolve using (1) numbered pronunciation.
            - n : Resolve using (n) numbered pronunciation.
                - If a higher number is specified than available for the word, the highest available number is used.

        Unresolved word resolution modes:
            - keep : Keep the text-form word in the output.
            - remove : Remove the text-form word from the output.
            - drop : Return the line as None if any word is unresolved.

        :param cmu_dict_path: Path to CMU dictionary file (.txt). If None, uses built-in.
        :type: str
        :param h2p_dict_path: Path to Custom H2p dictionary (.json). If None, uses built-in.
        :type: str
        :param cmu_multi_mode: CMU resolution mode for entries with multiple pronunciations.
        :type: int
        :param process_numbers: Normalize numbers to text form before conversion, if enabled.
        :type: bool
        :param phoneme_brackets: If True, phonemes are wrapped in curly brackets.
        :type: bool
        :param unresolved_mode: One of 'keep', 'remove' or 'drop' (see above).
        :type: str
        :raises ValueError: If unresolved_mode is not one of the supported modes.
        """
        # Check valid unresolved_mode argument before doing any expensive loading
        if unresolved_mode not in self._UNRESOLVED_MODES:
            raise ValueError('Invalid value for unresolved_mode: {}'.format(unresolved_mode))
        self.unresolved_mode = unresolved_mode

        self.cmu_dict_path = cmu_dict_path  # Path to CMU dictionary file (.txt), if None, uses built-in
        self.h2p_dict_path = h2p_dict_path  # Path to Custom H2p dictionary (.json), if None, uses built-in
        self.cmu_multi_mode = cmu_multi_mode  # CMU multi-entry resolution mode
        self.process_numbers = process_numbers  # Normalize numbers to text form, if enabled
        # NOTE(review): stored here but not consulted by convert() in this class,
        # which always wraps phonemes in curly brackets - confirm intended use.
        self.phoneme_brackets = phoneme_brackets  # If True, phonemes are wrapped in curly brackets.

        self.dict = DictReader(self.cmu_dict_path).dict  # CMU Dictionary
        self.h2p = H2p(self.h2p_dict_path, preload=True)  # H2p parser
        self.lemmatize = WordNetLemmatizer().lemmatize  # WordNet Lemmatizer - used to find singular form
        self.stem = SnowballStemmer('english').stem  # Snowball Stemmer - used to find stem root of words
        self.segment = pywordsegment.WordSegmenter().segment  # Word Segmenter
        self.p = Processor(self)  # Processor for processing text

        # Features
        # Auto pluralization and de-pluralization
        self.ft_auto_plural = True
        # Auto splits and infers possessive forms of original words
        self.ft_auto_pos = True
        # Auto splits 'll
        self.ft_auto_ll = True
        # Auto splits and infers hyphenated words
        self.ft_auto_hyphenated = True
        # Auto splits possible compound words
        self.ft_auto_compound = True
        # Analyzes word root stem and infers pronunciation separately
        # i.e. 'generously' -> 'generous' + 'ly'
        self.ft_stem = True
        # Forces compound words using manual lookup
        self.ft_auto_compound_l2 = True

    @staticmethod
    def _select_numbered(entries, num: int):
        """
        Pick the pronunciation addressed by a '(num)' suffix.

        The index is ``num - 1``, clamped to the valid range: values below the
        first entry select the first entry, values past the end select the last.

        :param entries: Sequence of pronunciations for the base word (may be None or empty).
        :param num: Integer parsed from the '(num)' suffix.
        :return: The selected pronunciation, or None if there are no entries.
        """
        if not entries:
            return None
        # NOTE(review): the (n) -> n - 1 mapping is kept from the original code;
        # confirm it against the entry layout produced by DictReader.
        index = max(num - 1, 0)
        if index < len(entries):
            return entries[index]
        # Requested number is higher than available: use the highest available entry
        return entries[-1]

    def lookup(self, text: str, pos: Optional[str] = None, ph_format: str = 'sds') -> str | list | None:
        # noinspection GrazieInspection
        """
        Gets the CMU Dictionary entry for a word.

        The word is lower-cased and looked up directly first. If there is no
        direct entry, the enabled fallback processors are tried in order:
        possessives, contractions, hyphenated words, compound words, numbered
        '(n)' entries, de-pluralization, stemming and forced compounding.

        Options for ph_format:
            - 'sds' space delimited string
            - 'sds_b' space delimited string with curly brackets
            - 'list' list of phoneme strings

        :param text: Word to lookup
        :type: str
        :param pos: Part of speech tag (Optional), passed to the de-pluralization processor
        :type: str
        :param ph_format: Format of the phonemes to return
        :type: str
        :return: Phonemes in the requested format, or None if the word could not be resolved.
        :raises ValueError: If a result is found and ph_format is not a supported format.
        """

        def format_as(in_phoneme):
            # Convert a resolved entry to the representation requested by ph_format
            if ph_format == 'sds':
                return ph.to_sds(in_phoneme)
            if ph_format == 'sds_b':
                return ph.with_cb(ph.to_sds(in_phoneme))
            if ph_format == 'list':
                return ph.to_list(in_phoneme)
            raise ValueError('Invalid value for ph_format: {}'.format(ph_format))

        # Get the CMU Dictionary entry for the word
        word = text.lower()
        entry = deepcopy(self.dict.get(word))  # Ensure safe copy of entry

        # Has entry, return it directly
        if entry is not None:
            return format_as(entry)

        # Auto Possessive Processor
        if self.ft_auto_pos:
            res = self.p.auto_possessives(word)
            if res is not None:
                return format_as(res)

        # Auto Contractions for "ll" or "d"
        if self.ft_auto_ll:
            res = self.p.auto_contractions(word)
            if res is not None:
                return format_as(res)

        # Check for hyphenated words
        if self.ft_auto_hyphenated:
            res = self.p.auto_hyphenated(word)
            if res is not None:
                return format_as(res)

        # Check for compound words
        if self.ft_auto_compound:
            res = self.p.auto_compound(word)
            if res is not None:
                return format_as(res)

        # No entry, detect if this is a numbered multi-entry word such as "word(2)".
        # A regex match already implies "(", ")" and a digit are present; words that
        # merely contain those characters without a "(n)" group fall through.
        numbered = re_digit.search(word)
        if numbered is not None:
            # Remove the integer and bracket from the word
            actual_word = re_bracket_with_digit.sub("", word)
            # See if this is a valid entry
            result = deepcopy(self.dict.get(actual_word))  # Ensure safe copy of entry
            selected = self._select_numbered(result, int(numbered.group(1)))
            if selected is not None:
                return format_as(selected)

        # Auto de-pluralization
        # This is placed near the end because we need to do a pos-tag process
        if self.ft_auto_plural:
            res = self.p.auto_plural(word, pos)
            if res is not None:
                return format_as(res)

        # Stem check
        # Supported modes for words ending in: "ing", "ingly", "ly"
        # noinspection SpellCheckingInspection
        if self.ft_stem:
            res = self.p.auto_stem(word)
            if res is not None:
                return format_as(res)

        # Force compounding
        if self.ft_auto_compound_l2:
            res = self.p.auto_compound_l2(word)
            if res is not None:
                return format_as(res)

        # If not found
        return None

    def convert(self, text: str) -> str | None:
        # noinspection GrazieInspection
        """
        Replace a grapheme text line with phonemes.

        Words found in the H2p dictionary are resolved there using their
        part-of-speech tag; all other words go through :meth:`lookup`. Each
        resolved word is replaced in the text by its phonemes wrapped in
        curly brackets.

        :param text: Text line to be converted
        :type: str
        :return: The converted line, or None when unresolved_mode is 'drop'
            and a word could not be resolved.
        :raises ValueError: If self.unresolved_mode is not a supported mode.
        """
        # Check valid unresolved_mode argument (the attribute may have been changed after init)
        if self.unresolved_mode not in self._UNRESOLVED_MODES:
            raise ValueError('Invalid value for unresolved_mode: {}'.format(self.unresolved_mode))
        ur_mode = self.unresolved_mode

        # Normalize numbers, if enabled
        if self.process_numbers:
            text = normalize_numbers(text)

        # Filter and Tokenize
        f_text = filter_text(text, preserve_case=True)
        words = self.h2p.tokenize(f_text)
        # Run POS tagging
        tags = self.h2p.get_tags(words)

        # Loop through words and pos tags
        for word, pos in tags:
            # Skip punctuation
            if word == '.':
                continue

            if self.h2p.dict.contains(word):
                # For word in h2p dict, get phonemes
                phonemes = self.h2p.dict.get_phoneme(word, pos)
            else:
                # If word not in h2p dict, check CMU dict
                phonemes = self.lookup(word, pos)
                if phonemes is None:
                    if ur_mode == 'drop':
                        return None
                    if ur_mode == 'remove':
                        text = replace_first(word, '', text)
                    # 'keep' leaves the text-form word untouched
                    continue

            # Format phonemes and replace the word with them
            f_ph = ph.with_cb(ph.to_sds(phonemes))
            text = replace_first(word, f_ph, text)

        # Return text
        return text
|