File size: 3,773 Bytes
01e655b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
from modules.utils.zh_normalization.text_normlization import *
# Single-character punctuation rewrites, consumed through ``str.maketrans`` /
# ``str.translate`` (so every key must be exactly one character long).
#
# The table has two groups: full-width / CJK punctuation first, then the ASCII
# look-alikes.  The full-width keys are spelled as ``\uXXXX`` escapes on
# purpose: written as ASCII they collide with the second group, and a dict
# literal silently keeps only the last of two equal keys, which dropped the
# whole first group (including the "!" -> "。" rule).
# Replacement values are left exactly as they were.
character_map = {
    # --- full-width / CJK punctuation ---
    "\uff1a": ",",  # FULLWIDTH COLON
    "\uff1b": ",",  # FULLWIDTH SEMICOLON
    "\uff01": "。",  # FULLWIDTH EXCLAMATION MARK
    "\uff08": ",",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": ",",  # FULLWIDTH RIGHT PARENTHESIS
    "【": ",",
    "】": ",",
    "『": ",",
    "』": ",",
    "「": ",",
    "」": ",",
    "《": ",",
    "》": ",",
    "\uff0d": ",",  # FULLWIDTH HYPHEN-MINUS
    "‘": " ",
    "“": " ",
    "’": " ",
    "”": " ",
    # --- ASCII punctuation ---
    ":": ",",
    ";": ",",
    "!": ".",
    "(": ",",
    ")": ",",
    # "[" and "]" stay unmapped (entries kept disabled); presumably because
    # square brackets delimit the ``[...]`` tags elsewhere in this module.
    # '[': ',',
    # ']': ',',
    ">": ",",
    "<": ",",
    "-": ",",
}
# Multi-character symbol sequences and the words that replace them.
character_to_word = {
    " & ": " and ",
}


def apply_character_to_word(text):
    """Return *text* with every key of ``character_to_word`` replaced by its word.

    Replacements are applied one table entry after another, in the table's
    insertion order, using plain ``str.replace``.
    """
    for symbol in character_to_word:
        text = text.replace(symbol, character_to_word[symbol])
    return text
def apply_character_map(text):
    """Return *text* with single characters rewritten according to ``character_map``.

    The translation table is rebuilt from ``character_map`` on every call, so
    later changes to that dict are picked up.
    """
    return text.translate(str.maketrans(character_map))
def insert_spaces_between_uppercase(s):
    """Insert a space at upper-case letter boundaries in *s*.

    A single space is added at each zero-width position that lies

    - between two ASCII upper-case letters,
    - between an ASCII lower-case letter and a following upper-case letter,
    - between a CJK ideograph (U+4E00..U+9FA5) and an ASCII upper-case
      letter, in either order.
    """
    boundary = re.compile(
        r"(?<=[A-Z])(?=[A-Z])|(?<=[a-z])(?=[A-Z])|(?<=[\u4e00-\u9fa5])(?=[A-Z])|(?<=[A-Z])(?=[\u4e00-\u9fa5])"
    )
    return boundary.sub(" ", s)
def ensure_suffix(a: str, b: str, c: str):
    """Strip *a* and append *c* unless the stripped text already ends with *b*.

    Note that the suffix that is checked (*b*) and the text that is appended
    (*c*) are independent arguments.
    """
    trimmed = a.strip()
    return trimmed if trimmed.endswith(b) else trimmed + c
# Spoken replacements for well-known e-mail domains.  A listed domain is
# replaced as a whole, so its top-level domain is not read out separately.
email_domain_map = {
    "outlook.com": "Out look",
    "hotmail.com": "Hot mail",
    "yahoo.com": "雅虎",
}


def email_detect(text):
    """Rewrite every e-mail address found in *text* into a speakable form.

    The local part is spelled out character by character, ``@`` becomes
    ``at``, a domain listed in ``email_domain_map`` is swapped for its spoken
    name, and every ``.`` still left in the domain becomes ``dot``.

    Examples:
        zhzluke96@outlook.com => z h z l u k e 9 6 at Out look
        a.b@example.org       => a . b at example dot org
    """
    address_re = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

    def speak(found):
        # The pattern admits exactly one "@", so partition splits cleanly.
        local_part, _, host = found.group(1).partition("@")
        spelled = " ".join(local_part)
        spoken_host = email_domain_map.get(host, host).replace(".", " dot ")
        return f"{spelled} at {spoken_host}"

    return address_re.sub(speak, text)
def pre_normalize(text):
    """Hook applied to a text part before normalization; currently a no-op.

    Returns *text* unchanged.
    """
    # NOTE: e-mail rewriting is switched off because its results were mediocre.
    # text = email_detect(text)
    return text
def post_normalize(text):
    """Run the post-processing passes over one normalized sentence.

    The passes run in this order: spaces are inserted at upper-case letter
    boundaries, single characters are rewritten via ``character_map``, and
    finally symbol sequences are replaced via ``character_to_word``.
    """
    spaced = insert_spaces_between_uppercase(text)
    mapped = apply_character_map(spaced)
    return apply_character_to_word(mapped)
def text_normalize(text, is_end=False):
    """Normalize *text* while passing ``[...]`` tags through untouched.

    The input is split into bracketed tags and runs of ordinary text.  A tag
    is emitted as-is, padded with one space on each side.  Ordinary text goes
    through ``pre_normalize``, then ``TextNormalizer.normalize`` (whose result
    is iterated as a sequence of sentences), and each sentence goes through
    ``post_normalize`` before the pieces are concatenated.

    Args:
        text: The raw input string.
        is_end: Currently unused; the only code that read it is disabled
            (see the note near the end of the function).

    Returns:
        The normalized string.
    """
    # Reference:
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    normalizer = TextNormalizer()
    # Group 1: a bracketed ``[...]`` tag; group 2: a run of text without "[".
    splitter = re.compile(r"(\[.+?\])|([^[]+)")

    def normalize_part(part):
        sentences = normalizer.normalize(pre_normalize(part))
        return "".join(post_normalize(sentence) for sentence in sentences)

    def replace(match):
        tag = match.group(1)
        if tag:
            return f" {tag} "
        return normalize_part(match.group(2))

    result = splitter.sub(replace, text)
    # NOTE: enabling the block below introduced audible noise, so it stays off.
    # if is_end:
    #     # intended to keep the last characters from being swallowed
    #     result = ensure_suffix(result, "[uv_break]", "。。。[uv_break]。。。")
    return result
if __name__ == "__main__":
    # Manual smoke test: print the normalized form of a few sample inputs
    # (mixed Chinese/English prose, control tags, and a percentage).
    samples = (
        "ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
        " 明天有62%的概率降雨",
    )
    for sample in samples:
        print(text_normalize(sample))
|