chattts

Sleeping

chattts / modules /utils /normalization.py

zhzluke96

update

01e655b about 1 year ago

3.77 kB

	import re

	from modules.utils.zh_normalization.text_normlization import *

	# Single-character punctuation substitutions, consumed by apply_character_map()
	# through str.maketrans(). Entries are grouped by the replacement they map to.
	#
	# The ASCII square brackets "[" and "]" are intentionally left unmapped (their
	# entries were disabled in the original table) -- presumably because "[...]"
	# marks control tags handled in text_normalize(); confirm before re-enabling.
	character_map = {
	    # Full-width separators and brackets -> full-width comma.
	    **dict.fromkeys("：；（）【】『』「」《》－", "，"),
	    # Full-width exclamation mark -> full-width full stop.
	    "！": "。",
	    # Curly quotation marks -> a plain space.
	    **dict.fromkeys("‘“’”", " "),
	    # ASCII separators, parentheses and angle brackets -> ASCII comma.
	    **dict.fromkeys(":;()><-", ","),
	    # ASCII exclamation mark -> ASCII full stop.
	    "!": ".",
	}

	# Multi-character substrings that are spelled out as words. The surrounding
	# spaces are part of the key, so only a free-standing "&" is rewritten.
	character_to_word = {" & ": " and "}


	def apply_character_to_word(text):
	    """Replace every key of ``character_to_word`` found in *text* with its word.

	    The replacements are applied one after another, in the dictionary's
	    iteration order, each on the result of the previous one.

	    Args:
	        text: The string to rewrite.

	    Returns:
	        The string with all configured substrings replaced.
	    """
	    rewritten = text
	    for symbol in character_to_word:
	        rewritten = rewritten.replace(symbol, character_to_word[symbol])
	    return rewritten


	def apply_character_map(text):
	    """Substitute single characters in *text* according to ``character_map``.

	    The translation table is rebuilt from ``character_map`` on every call, so
	    the current contents of that dictionary are always the ones applied.

	    Args:
	        text: The string to translate.

	    Returns:
	        A copy of *text* with every mapped character replaced.
	    """
	    return text.translate(str.maketrans(character_map))


	def insert_spaces_between_uppercase(s):
	    """Insert a space at every boundary that touches an uppercase ASCII letter.

	    A single space is inserted at each zero-width position that lies:

	    * between two uppercase letters            ("LLM"  -> "L L M"),
	    * between a lowercase and an uppercase one ("ChatT" -> "Chat T"),
	    * between a CJK ideograph (U+4E00-U+9FA5) and an uppercase letter,
	    * between an uppercase letter and a CJK ideograph.

	    Args:
	        s: The string to process.

	    Returns:
	        The string with the spaces inserted; unchanged if no boundary matches.
	    """
	    # The four branches must be joined with the regex alternation operator
	    # "|". An escaped pipe would demand a literal "|" character at the same
	    # position as the look-ahead letter, which can never match.
	    boundary = (
	        r"(?<=[A-Z])(?=[A-Z])"
	        r"|(?<=[a-z])(?=[A-Z])"
	        r"|(?<=[\u4e00-\u9fa5])(?=[A-Z])"
	        r"|(?<=[A-Z])(?=[\u4e00-\u9fa5])"
	    )
	    return re.sub(boundary, " ", s)


	def ensure_suffix(a: str, b: str, c: str):
	    """Strip *a* and append *c* unless the stripped text already ends with *b*.

	    Note that the suffix that is tested (*b*) and the text that is appended
	    (*c*) are separate arguments and may differ.

	    Args:
	        a: The text to check; surrounding whitespace is removed first.
	        b: The suffix whose presence suppresses the append.
	        c: The text appended when *b* is missing.

	    Returns:
	        The stripped text, with *c* appended when it did not end with *b*.
	    """
	    trimmed = a.strip()
	    return trimmed if trimmed.endswith(b) else trimmed + c


	# Spoken replacements for well-known e-mail domains. Keys must be lowercase:
	# email_detect() lowercases the domain before looking it up here.
	email_domain_map = {
	    "outlook.com": "Out look",
	    "hotmail.com": "Hot mail",
	    "yahoo.com": "雅虎",
	}


	def email_detect(text):
	    """Rewrite every e-mail address in *text* into a form that can be read aloud.

	    For each address:

	    * the local part (before "@") is split into single characters separated
	      by spaces,
	    * "@" becomes " at ",
	    * a domain listed in ``email_domain_map`` is replaced by its spoken form
	      (matched case-insensitively, since domain names are not case-sensitive),
	    * in any other domain every "." becomes " dot ".

	    Examples:
	        user@example.com  => u s e r at example dot com
	        user@outlook.com  => u s e r at Out look
	        user@Outlook.COM  => u s e r at Out look

	    Args:
	        text: The text that may contain e-mail addresses.

	    Returns:
	        The text with every detected address replaced; other text is untouched.
	    """
	    email_pattern = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

	    def replace(match):
	        # Neither side of the pattern can contain "@", so the split always
	        # yields exactly two parts.
	        name, domain = match.group(1).split("@")
	        spoken_name = " ".join(name)
	        # Look the domain up in lowercase so "Outlook.com" hits the map too;
	        # unknown domains keep their original spelling.
	        spoken_domain = email_domain_map.get(domain.lower(), domain)
	        spoken_domain = spoken_domain.replace(".", " dot ")
	        return f"{spoken_name} at {spoken_domain}"

	    return email_pattern.sub(replace, text)


	def pre_normalize(text):
	    """Hook that runs on a text part before the main normalizer.

	    It currently returns *text* unchanged.

	    NOTE: a call to ``email_detect(text)`` used to be made here but was
	    disabled because its results were only mediocre.

	    Args:
	        text: The text part about to be normalized.

	    Returns:
	        The same text, unmodified.
	    """
	    return text


	def post_normalize(text):
	    """Run the clean-up steps applied to each sentence after normalization.

	    The steps run in a fixed order, each on the output of the previous one:
	    uppercase spacing, single-character mapping, then symbol-to-word
	    replacement.

	    Args:
	        text: A normalized sentence.

	    Returns:
	        The sentence after all three steps.
	    """
	    steps = (
	        insert_spaces_between_uppercase,
	        apply_character_map,
	        apply_character_to_word,
	    )
	    for step in steps:
	        text = step(text)
	    return text


	def text_normalize(text, is_end=False):
	    """Normalize *text* for speech synthesis while preserving bracketed tags.

	    The input is split into bracketed tags such as "[laugh_0]" and runs of
	    ordinary text. Tags are kept verbatim and padded with one space on each
	    side; every text run goes through ``pre_normalize``, the
	    ``TextNormalizer`` and then ``post_normalize``.

	    Args:
	        text: The raw input text, optionally containing "[...]" tags.
	        is_end: Currently unused; kept for interface compatibility (see the
	            note at the end of the function body).

	    Returns:
	        The normalized text.
	    """
	    # Normalizer implementation, see:
	    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
	    tx = TextNormalizer()

	    # Group 1: a bracketed tag "[...]"; group 2: a run of text without "[".
	    # The two alternatives must be joined with the alternation operator "|";
	    # an escaped pipe would only match a tag followed by a literal "|", so
	    # ordinary text would never be normalized.
	    pattern = re.compile(r"(\[.+?\])|([^[]+)")

	    def normalize_part(part):
	        part = pre_normalize(part)
	        # tx.normalize() yields the sentences of the part; each one is
	        # post-processed and the results are concatenated without separator.
	        sentences = tx.normalize(part)
	        return "".join(post_normalize(sentence) for sentence in sentences)

	    def replace(match):
	        tag = match.group(1)
	        if tag:
	            # Keep the tag untouched, separated from its neighbours by spaces.
	            return f" {tag} "
	        return normalize_part(match.group(2))

	    result = pattern.sub(replace, text)

	    # NOTE: disabled because enabling it introduced audible noise.
	    # if is_end:
	    #     # Intended to keep the final characters from being swallowed.
	    #     result = ensure_suffix(result, "[uv_break]", "。。。[uv_break]。。。")

	    return result


	if __name__ == "__main__":
	    # Manual smoke test: print the normalized form of a few sample inputs
	    # (mixed Chinese/English text, bracketed control tags, a percentage).
	    samples = (
	        "ChatTTS是专门为对话场景设计的文本转语音模型，例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
	        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
	        " 明天有62％的概率降雨",
	    )
	    for sample in samples:
	        print(text_normalize(sample))