# Source: MassivelyMultilingualTTS / Preprocessing / articulatory_features.py
# Last commit (9e275b8, Flux9665): use explicit code instead of relying on release download
# File size at that revision: 33.7 kB
# -*- coding: utf-8 -*-
# partly derived from an open-source resource provided by Papercup Technologies Limited
# Resource-Author: Marlene Staib
# Modified by Florian Lux, 2021
# Further modified by Florian Lux, 2022
"""
All phonemes in the IPA standard are supported.
Zero-width characters and some other modifiers are
generally not supported. Tone, stress and lengthening
are represented with placeholder dimensions; however,
they need to be set manually, since this conversion
from phonemes to features works on a
character-by-character basis. In a few cases, the place
of articulation is approximated because only one phoneme
had such a combination, which does not warrant a new
dimension.
"""
def generate_feature_lookup():
    """
    Build the table that maps every supported symbol to its articulatory description.

    Each key is a single symbol. Non-speech markers map to a dict holding only
    'symbol_type'. Phonemes additionally carry 'vowel_consonant' and 'VUV', plus
    either the three vowel features (frontness, openness, roundedness) or the two
    consonant features (place, manner). Nasal vowels reuse the 'consonant_manner'
    key with the value 'nasal'.

    A fresh dict is created on every call; entries appear in a fixed order.
    """

    def marker(kind):
        # non-speech symbol: only the symbol type is known
        return {'symbol_type': kind}

    def vowel(frontness, openness, roundedness, nasal=False):
        # every vowel in this table is voiced
        description = {
            'symbol_type'      : 'phoneme',
            'vowel_consonant'  : 'vowel',
            'VUV'              : 'voiced',
            'vowel_frontness'  : frontness,
            'vowel_openness'   : openness,
            'vowel_roundedness': roundedness,
        }
        if nasal:
            # nasalised vowels borrow the consonant manner dimension
            description['consonant_manner'] = 'nasal'
        return description

    def consonant(voicing, place, manner):
        return {
            'symbol_type'     : 'phoneme',
            'vowel_consonant' : 'consonant',
            'VUV'             : voicing,
            'consonant_place' : place,
            'consonant_manner': manner,
        }

    return {
        '~': marker('silence'),
        '#': marker('end of sentence'),
        '?': marker('questionmark'),
        '!': marker('exclamationmark'),
        '.': marker('fullstop'),
        ' ': marker('word-boundary'),
        'ɜ': vowel('central', 'open-mid', 'unrounded'),
        'ə': vowel('central', 'mid', 'unrounded'),
        'a': vowel('front', 'open', 'unrounded'),
        'ð': consonant('voiced', 'dental', 'fricative'),
        'ɛ': vowel('front', 'open-mid', 'unrounded'),
        'ɪ': vowel('front_central', 'close_close-mid', 'unrounded'),
        'ŋ': consonant('voiced', 'velar', 'nasal'),
        'ɔ': vowel('back', 'open-mid', 'rounded'),
        'ɒ': vowel('back', 'open', 'rounded'),
        'ɾ': consonant('voiced', 'alveolar', 'flap'),
        'ʃ': consonant('unvoiced', 'postalveolar', 'fricative'),
        'θ': consonant('unvoiced', 'dental', 'fricative'),
        # NOTE(review): the IPA chart lists this vowel as rounded; the value is kept
        # unchanged because the feature vectors derived from it may be relied upon — confirm
        'ʊ': vowel('central_back', 'close_close-mid', 'unrounded'),
        'ʌ': vowel('back', 'open-mid', 'unrounded'),
        'ʒ': consonant('voiced', 'postalveolar', 'fricative'),
        'æ': vowel('front', 'open-mid_open', 'unrounded'),
        'b': consonant('voiced', 'bilabial', 'plosive'),
        'ʔ': consonant('unvoiced', 'glottal', 'plosive'),
        'd': consonant('voiced', 'alveolar', 'plosive'),
        'e': vowel('front', 'close-mid', 'unrounded'),
        'f': consonant('unvoiced', 'labiodental', 'fricative'),
        'ɡ': consonant('voiced', 'velar', 'plosive'),
        'h': consonant('unvoiced', 'glottal', 'fricative'),
        'i': vowel('front', 'close', 'unrounded'),
        'j': consonant('voiced', 'palatal', 'approximant'),
        'k': consonant('unvoiced', 'velar', 'plosive'),
        'l': consonant('voiced', 'alveolar', 'lateral-approximant'),
        'm': consonant('voiced', 'bilabial', 'nasal'),
        'n': consonant('voiced', 'alveolar', 'nasal'),
        'ɳ': consonant('voiced', 'retroflex', 'nasal'),
        'o': vowel('back', 'close-mid', 'rounded'),
        'p': consonant('unvoiced', 'bilabial', 'plosive'),
        'ɹ': consonant('voiced', 'alveolar', 'approximant'),
        'r': consonant('voiced', 'alveolar', 'trill'),
        's': consonant('unvoiced', 'alveolar', 'fricative'),
        't': consonant('unvoiced', 'alveolar', 'plosive'),
        'u': vowel('back', 'close', 'rounded'),
        'v': consonant('voiced', 'labiodental', 'fricative'),
        'w': consonant('voiced', 'labial-velar', 'approximant'),
        'x': consonant('unvoiced', 'velar', 'fricative'),
        'z': consonant('voiced', 'alveolar', 'fricative'),
        'ʀ': consonant('voiced', 'uvular', 'trill'),
        'ø': vowel('front', 'close-mid', 'rounded'),
        'ç': consonant('unvoiced', 'palatal', 'fricative'),
        'ɐ': vowel('central', 'open', 'unrounded'),
        'œ': vowel('front', 'open-mid', 'rounded'),
        'y': vowel('front', 'close', 'rounded'),
        'ʏ': vowel('front_central', 'close_close-mid', 'rounded'),
        'ɑ': vowel('back', 'open', 'unrounded'),
        'c': consonant('unvoiced', 'palatal', 'plosive'),
        'ɲ': consonant('voiced', 'palatal', 'nasal'),
        'ɣ': consonant('voiced', 'velar', 'fricative'),
        'ʎ': consonant('voiced', 'palatal', 'lateral-approximant'),
        'β': consonant('voiced', 'bilabial', 'fricative'),
        'ʝ': consonant('voiced', 'palatal', 'fricative'),
        'ɟ': consonant('voiced', 'palatal', 'plosive'),
        'q': consonant('unvoiced', 'uvular', 'plosive'),
        'ɕ': consonant('unvoiced', 'alveolopalatal', 'fricative'),
        'ɭ': consonant('voiced', 'retroflex', 'lateral-approximant'),
        'ɵ': vowel('central', 'close-mid', 'rounded'),
        'ʑ': consonant('voiced', 'alveolopalatal', 'fricative'),
        'ʋ': consonant('voiced', 'labiodental', 'approximant'),
        'ʁ': consonant('voiced', 'uvular', 'fricative'),
        'ɨ': vowel('central', 'close', 'unrounded'),
        'ʂ': consonant('unvoiced', 'retroflex', 'fricative'),
        'ɓ': consonant('voiced', 'bilabial', 'implosive'),
        'ʙ': consonant('voiced', 'bilabial', 'vibrant'),
        'ɗ': consonant('voiced', 'dental', 'implosive'),
        'ɖ': consonant('voiced', 'retroflex', 'plosive'),
        'χ': consonant('unvoiced', 'uvular', 'fricative'),
        'ʛ': consonant('voiced', 'uvular', 'implosive'),
        'ʟ': consonant('voiced', 'velar', 'lateral-approximant'),
        'ɽ': consonant('voiced', 'retroflex', 'flap'),
        'ɢ': consonant('voiced', 'uvular', 'plosive'),
        'ɠ': consonant('voiced', 'velar', 'implosive'),
        'ǂ': consonant('unvoiced', 'alveolopalatal', 'click'),
        'ɦ': consonant('voiced', 'glottal', 'fricative'),
        'ǁ': consonant('unvoiced', 'alveolar', 'click'),
        # same description as i, except nasal
        'ĩ': vowel('front', 'close', 'unrounded', nasal=True),
        'ʍ': consonant('unvoiced', 'labial-velar', 'fricative'),
        'ʕ': consonant('voiced', 'pharyngal', 'fricative'),
        # NOTE(review): the IPA chart lists this approximant as voiced; the value is kept
        # unchanged because the feature vectors derived from it may be relied upon — confirm
        'ɻ': consonant('unvoiced', 'retroflex', 'approximant'),
        'ʄ': consonant('voiced', 'palatal', 'implosive'),
        # same description as u, but nasal
        'ũ': vowel('back', 'close', 'rounded', nasal=True),
        'ɤ': vowel('back', 'close-mid', 'unrounded'),
        'ɶ': vowel('front', 'open', 'rounded'),
        'õ': vowel('back', 'close-mid', 'rounded', nasal=True),
        'ʡ': consonant('unvoiced', 'epiglottal', 'plosive'),
        'ʈ': consonant('unvoiced', 'retroflex', 'plosive'),
        'ʜ': consonant('unvoiced', 'epiglottal', 'fricative'),
        'ɱ': consonant('voiced', 'labiodental', 'nasal'),
        'ɯ': vowel('back', 'close', 'unrounded'),
        'ǀ': consonant('unvoiced', 'dental', 'click'),
        'ɸ': consonant('unvoiced', 'bilabial', 'fricative'),
        'ʘ': consonant('unvoiced', 'bilabial', 'click'),
        'ʐ': consonant('voiced', 'retroflex', 'fricative'),
        'ɰ': consonant('voiced', 'velar', 'approximant'),
        'ɘ': vowel('central', 'close-mid', 'unrounded'),
        'ħ': consonant('unvoiced', 'pharyngal', 'fricative'),
        'ɞ': vowel('central', 'open-mid', 'rounded'),
        'ʉ': vowel('central', 'close', 'rounded'),
        'ɴ': consonant('voiced', 'uvular', 'nasal'),
        'ʢ': consonant('voiced', 'epiglottal', 'fricative'),
        'ѵ': consonant('voiced', 'labiodental', 'flap'),
        # looks deceivingly like an exclamation mark, but it is a different unicode entry
        'ǃ': consonant('unvoiced', 'postalveolar', 'click'),
    }  # REMEMBER to also add the phonemes added here to the ID lookup below as the new highest ID
def get_phone_to_id():
    """
    Return the symbol -> integer ID mapping used for the states of the
    ctc loss and dijkstra/mas in the aligner.

    The IDs are the positions of the symbols in a hard-coded string, so they
    stay consistent between calls instead of depending on the iteration
    order of some other container.
    """
    ordered_symbols = "~#?!ǃ.ɜəaðɛɪŋɔɒɾʃθʊʌʒæbʔdefghijklmnɳopɡɹrstuvwxzʀøçɐœyʏɑcɲɣʎβʝɟqɕɭɵʑʋʁɨʂɓʙɗɖχʛʟɽɢɠǂɦǁĩʍʕɻʄũɤɶõʡʈʜɱɯǀɸʘʐɰɘħɞʉɴʢѵ"
    return {symbol: position for position, symbol in enumerate(ordered_symbols)}
def get_feature_to_index_lookup():
    """
    Return the mapping from feature value name to its position in a feature vector.

    The index of a name is its position in the tuple below (0 for the first
    entry), so new names must only ever be appended at the end; inserting or
    reordering entries would shift every index that follows.
    """
    ordered_features = (
        # MODIFIER
        # -- stress: modified by the previous symbol
        "stressed",
        # -- tone: modified by the following symbol
        "very-high-tone",
        "high-tone",
        "mid-tone",
        "low-tone",
        "very-low-tone",
        "rising-tone",
        "falling-tone",
        "peaking-tone",
        "dipping-tone",
        # -- lengthening: modified by the following symbol
        "lengthened",
        "half-length",
        "shortened",
        # CATEGORIES
        "consonant",
        "vowel",
        "phoneme",
        # NON-SPEECH-MARKERS
        "silence",
        "end of sentence",
        "questionmark",
        "exclamationmark",
        "fullstop",
        "word-boundary",
        # PLACE
        "dental",
        "postalveolar",
        "velar",
        "palatal",
        "glottal",
        "uvular",
        "labiodental",
        "labial-velar",
        "alveolar",
        "bilabial",
        "alveolopalatal",
        "retroflex",
        "pharyngal",
        "epiglottal",
        # TONGUE POSITION
        "central",
        "back",
        "front_central",
        "front",
        "central_back",
        # MOUTH OPENNESS
        "mid",
        "close-mid",
        "close",
        "open-mid",
        "close_close-mid",
        "open-mid_open",
        "open",
        # MOUTH SHAPE
        "rounded",
        "unrounded",
        # MANNER
        "plosive",
        "nasal",
        "approximant",
        "trill",
        "flap",
        "fricative",
        "lateral-approximant",
        "implosive",
        "vibrant",
        "click",
        # TYPE
        "unvoiced",
        "voiced",
    )
    return {feature: position for position, feature in enumerate(ordered_features)}
def generate_feature_table():
    """
    Turn the articulatory descriptions into one multi-hot vector per symbol.

    Only single-character keys of the feature lookup are considered. Every
    vector has 13 leading positions plus one position per distinct value of
    each feature type; the position of a value is taken from
    get_feature_to_index_lookup(). Symbols that are not phonemes additionally
    get the silence position set. Feature values that have no index are
    reported on stdout and otherwise ignored.

    Returns:
        dict mapping each single-character symbol to a list of 0/1 integers.
    """
    ipa_to_phonemefeats = generate_feature_lookup()
    value_to_index = get_feature_to_index_lookup()

    # the conversion works character by character, so longer keys are skipped
    single_symbols = {ipa: feats for ipa, feats in ipa_to_phonemefeats.items() if len(ipa) == 1}

    # collect, per feature type, every value that occurs in the lookup
    feat_to_val_set = dict()
    for feats in single_symbols.values():
        for feat, value in feats.items():
            feat_to_val_set.setdefault(feat, set()).add(value)

    # there are 13 features which do not occur in the vectors, because they are
    # context dependent and not lexical; the length is the same for every symbol,
    # so it is computed once instead of once per symbol
    vector_size = 13 + sum(len(values) for values in feat_to_val_set.values())
    phoneme_index = value_to_index["phoneme"]
    silence_index = value_to_index["silence"]

    phone_to_vector = dict()
    for ipa, feats in single_symbols.items():
        vector = [0] * vector_size
        for value in feats.values():
            if value in value_to_index:
                vector[value_to_index[value]] = 1
        if vector[phoneme_index] != 1:
            # it's not a phoneme, so we give it the silence marker, regardless of what it is.
            vector[silence_index] = 1
        phone_to_vector[ipa] = vector

    # report values that exist in the lookup but have no position in the vector
    for values in feat_to_val_set.values():
        for value in values:
            if value not in value_to_index:
                print(f"Unknown feature value in featureset! {value}")

    return phone_to_vector
if __name__ == '__main__':
    # when executed as a script, dump the complete symbol -> vector table
    feature_table = generate_feature_table()
    print(feature_table)