# coding: utf-8
# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
import re | |
from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary | |
def normalize(text):
    """Normalize raw input text.

    The steps run in this order: trim surrounding whitespace, delete every
    character matched by the CJK radical/ideograph character class, apply
    the ``etc_dictionary`` substitutions, apply the English-word
    substitutions, and finally lowercase the result.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    stripped = text.strip()
    # Remove CJK radicals and ideographs outright (replaced by nothing).
    without_ideographs = re.sub(
        "[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", stripped
    )
    substituted = normalize_with_dictionary(without_ideographs, etc_dictionary)
    # Lowercasing is last, so the English lookup sees the original casing.
    return normalize_english(substituted).lower()
def normalize_with_dictionary(text, dic):
    """Replace every occurrence of a key of ``dic`` in ``text`` with its value.

    Keys are matched literally (regex metacharacters are escaped) in one
    left-to-right pass, so text produced by a replacement is never scanned
    again. When several keys could match at the same position, the longest
    key wins.

    Args:
        text: String to rewrite.
        dic: Mapping from literal substring to replacement string.

    Returns:
        ``text`` with all matches substituted, or ``text`` unchanged when no
        key occurs in it (which includes the case of an empty ``dic``).
    """
    # Cheap substring scan first: avoids building a regex when nothing can
    # match, and keeps an empty mapping from producing an empty pattern.
    if not any(key in text for key in dic):
        return text

    # Regex alternation takes the first alternative that matches, not the
    # longest one. Ordering keys longest-first prevents a short key from
    # shadowing a longer key that starts with it (e.g. "a" vs "ab").
    ordered_keys = sorted(dic, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in ordered_keys))
    return pattern.sub(lambda match: dic[match.group()], text)
def normalize_english(text):
    """Substitute English words that appear in ``english_dictionary``.

    Every maximal run of ASCII letters in ``text`` is looked up verbatim
    (the lookup is case-sensitive) in ``english_dictionary``. A hit is
    replaced by the dictionary value; a miss is left as it was.

    Args:
        text: Input string.

    Returns:
        The string with known English words replaced.
    """

    def replace_word(match):
        token = match.group(0)
        # Fall back to the matched word itself when there is no entry.
        return english_dictionary.get(token, token)

    return re.sub("([A-Za-z]+)", replace_word, text)