import re

from modules.utils.zh_normalization.text_normlization import *


# Single-character punctuation rewrites. apply_character_map feeds this table
# to str.maketrans, so every key must be exactly one character long.
# Brackets, colons, semicolons, dashes and angle brackets become commas,
# "!" becomes a full stop and curly quotes become spaces.
#
# NOTE(review): ":", ";", "!", "(", ")" and "-" are each listed twice. Python
# keeps the last value given for a repeated key, so the first occurrences
# (including "!" -> "。") have no effect at runtime. Presumably the first group
# was meant to hold the fullwidth forms of these characters -- confirm against
# the intended table before removing either copy.
character_map = {
    ":": ",",
    ";": ",",
    "!": "。",
    "(": ",",
    ")": ",",
    "【": ",",
    "】": ",",
    "『": ",",
    "』": ",",
    "「": ",",
    "」": ",",
    "《": ",",
    "》": ",",
    "-": ",",
    "‘": " ",
    "“": " ",
    "’": " ",
    "”": " ",
    ":": ",",
    ";": ",",
    "!": ".",
    "(": ",",
    ")": ",",
    ">": ",",
    "<": ",",
    "-": ",",
}


# Substring -> spoken-word replacements used by apply_character_to_word.
# The surrounding spaces are part of the key, so only a free-standing "&"
# (with a space on both sides) is rewritten.
character_to_word = {
    " & ": " and ",
}


def apply_character_to_word(text):
    """Replace every key of ``character_to_word`` found in *text* by its value.

    The replacements are applied one after another with ``str.replace``, in
    the iteration order of the ``character_to_word`` dict.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten string.
    """
    for symbol, word in character_to_word.items():
        text = text.replace(symbol, word)
    return text


def apply_character_map(text):
    """Rewrite single characters of *text* according to ``character_map``.

    A translation table is built from ``character_map`` with
    ``str.maketrans`` and applied with ``str.translate``; characters that are
    not keys of the map are left unchanged.

    Args:
        text: The string to rewrite.

    Returns:
        The translated string.
    """
    translation_table = str.maketrans(character_map)
    return text.translate(translation_table)


def insert_spaces_between_uppercase(s):
    """Insert a space at case and script boundaries that involve a capital.

    One space is inserted at every position that lies

    * between two uppercase ASCII letters,
    * between a lowercase ASCII letter and a following uppercase one, or
    * between a character in the range U+4E00-U+9FA5 and an uppercase ASCII
      letter, in either order.

    Args:
        s: The string to process.

    Returns:
        The string with the spaces inserted; for example ``"ChatTTS"``
        becomes ``"Chat T T S"``.
    """
    # Every alternative is a pair of zero-width assertions, so the substitution
    # only inserts spaces and never removes a character.
    return re.sub(
        r"(?<=[A-Z])(?=[A-Z])|(?<=[a-z])(?=[A-Z])|(?<=[\u4e00-\u9fa5])(?=[A-Z])|(?<=[A-Z])(?=[\u4e00-\u9fa5])",
        " ",
        s,
    )


def ensure_suffix(a: str, b: str, c: str) -> str:
    """Strip *a* and append *c* unless the stripped text already ends with *b*.

    Args:
        a: The text to process; leading and trailing whitespace is removed
            first.
        b: The suffix to look for.
        c: The text appended when *b* is not found at the end of *a*. It is
            independent of *b* and may differ from it.

    Returns:
        The stripped text, with *c* appended when needed.
    """
    a = a.strip()
    return a if a.endswith(b) else a + c


# Replacement text for specific e-mail domains; email_detect substitutes the
# value for the whole domain when the domain matches a key exactly.
email_domain_map = {
    "outlook.com": "Out look",
    "hotmail.com": "Hot mail",
    "yahoo.com": "雅虎",
}


def email_detect(text):
    """Rewrite e-mail addresses found in *text* into a spelled-out form.

    For each address the part before "@" is split into single characters
    joined by spaces, "@" becomes " at ", and the domain is replaced by its
    entry in ``email_domain_map`` when one exists (exact, case-sensitive
    match). Any "." left in the domain is then replaced by " dot ".

    Args:
        text: The string to scan.

    Returns:
        The string with every matched address rewritten; text outside the
        matches is unchanged.
    """
    email_pattern = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

    def replace(match):
        # The pattern allows exactly one "@" per match, so this always
        # yields two parts.
        name, domain = match.group(1).split("@")
        spelled_name = " ".join(name)
        # Fall back to the raw domain when there is no mapped replacement.
        domain = email_domain_map.get(domain, domain)
        domain = domain.replace(".", " dot ")
        return f"{spelled_name} at {domain}"

    return email_pattern.sub(replace, text)


def pre_normalize(text):
    """Hook that runs before the main normalizer; currently a no-op.

    Args:
        text: The string about to be normalized.

    Returns:
        *text*, unchanged.
    """
    return text


def post_normalize(text):
    """Apply the module's own clean-up steps to an already normalized string.

    The steps run in this order:

    1. ``insert_spaces_between_uppercase``
    2. ``apply_character_map``
    3. ``apply_character_to_word``

    Args:
        text: The string to clean up.

    Returns:
        The cleaned-up string.
    """
    text = insert_spaces_between_uppercase(text)
    text = apply_character_map(text)
    text = apply_character_to_word(text)
    return text


def text_normalize(text, is_end=False):
    """Normalize *text* while passing bracketed ``[...]`` tokens through.

    The input is split into bracketed tokens (``[`` followed by at least one
    character up to the nearest ``]``) and runs of text that contain no
    ``[``. Bracketed tokens are emitted as they are, padded with one space on
    each side. Every other run goes through ``pre_normalize``, then
    ``TextNormalizer().normalize``, and each sentence returned by the
    normalizer is passed through ``post_normalize``; the results are
    concatenated.

    Args:
        text: The string to normalize.
        is_end: Not used by the current implementation; kept so existing
            callers that pass it keep working.

    Returns:
        The normalized string.
    """
    tx = TextNormalizer()

    # group(1): a bracketed token; group(2): a run of text without any "[".
    pattern = re.compile(r"(\[.+?\])|([^[]+)")

    def normalize_part(part):
        part = pre_normalize(part)
        # tx.normalize is assumed to return an iterable of sentence strings
        # -- TODO confirm against TextNormalizer.
        sentences = tx.normalize(part)
        return "".join(post_normalize(sentence) for sentence in sentences)

    def replace(match):
        if match.group(1):
            return f" {match.group(1)} "
        return normalize_part(match.group(2))

    return pattern.sub(replace, text)


if __name__ == "__main__":
    # Manual smoke test: print the normalized form of three sample inputs
    # (mixed Chinese/English prose, a line with bracketed tokens, and a
    # percentage).
    print(
        text_normalize(
            "ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本."
        )
    )
    print(
        text_normalize(
            " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149"
        )
    )
    print(text_normalize(" 明天有62%的概率降雨"))