BangDream-Bert-VITS2

Running

App Files Files Community

BangDream-Bert-VITS2 / tools /sentence.py

Mahiruoshi

Update tools/sentence.py

d1f7ac3 over 1 year ago

raw

history blame

4.44 kB

	import re, os

	from ebooklib import epub
	import PyPDF2
	from PyPDF2 import PdfReader
	from bs4 import BeautifulSoup

	def is_japanese(string):
	    """Return True if any character's code point is strictly between 0x3040 and 0x30FF.

	    Both bounds are exclusive, so U+3040 and U+30FF themselves do not count.
	    An empty input yields False.
	    """
	    return any(0x3040 < ord(char) < 0x30FF for char in string)

	def is_chinese(string):
	    """Return True if any character lies in the range U+4E00..U+9FFF (inclusive).

	    An empty input yields False.
	    """
	    return any('\u4e00' <= char <= '\u9fff' for char in string)

	def replace_quotes(text):
	    """Return *text* with quote-like characters replaced by an ASCII double quote.

	    The replaced set is curly quotes, Japanese corner brackets, and both
	    full-width and ASCII parentheses.
	    """
	    quote_like = r'[“”‘’『』「」（）()]'
	    return re.sub(quote_like, '"', text)

	def extrac(text):
	    """Split *text* into a list of sentence-like fragments.

	    Steps:
	      1. Quote-like characters are normalised with ``replace_quotes`` and
	         HTML-style tags (``<...>``) are removed.
	      2. The text is cut at newlines and sentence-ending punctuation
	         (``。！？.?!``); each delimiter stays attached to the text before it.
	      3. A fragment longer than 20 characters, or one flagged by
	         ``is_mixed_language``, is passed to ``split_mixed_language``;
	         other fragments are kept unchanged.
	      4. Trailing text with no terminating delimiter is kept as one fragment
	         without further splitting.
	      5. Double quotes and any remaining ``<`` / ``>`` are stripped.

	    Fragments that are empty *after* step 5 are dropped, so the result never
	    contains an empty string (previously a quote-only fragment was emitted
	    as ``''`` because the emptiness check ran before the quotes were removed).

	    Returns:
	        list[str]: the cleaned, non-empty fragments in source order.
	    """
	    delimiter = re.compile(r'[\n。！？\.\?!]')

	    text = replace_quotes(text)  # normalise quotes
	    text = re.sub("<[^>]*>", "", text)  # remove HTML tags

	    # The capture group makes re.split return the delimiters as their own items.
	    pieces = re.split(r'([\n。！？\.\?!])', text)

	    sentences = []
	    pending = ""
	    for piece in pieces:
	        pending += piece
	        if not delimiter.match(piece):
	            # Plain text: keep accumulating until its delimiter arrives.
	            continue
	        # Split again, keeping the punctuation attached to its sentence.
	        for candidate in re.split(r'(?<=[。！？\.\?!])', pending):
	            if len(candidate) > 20 or is_mixed_language(candidate):
	                sentences.extend(split_mixed_language(candidate))
	            else:
	                sentences.append(candidate)
	        pending = ""

	    # Keep whatever follows the last delimiter, if anything.
	    if pending:
	        sentences.append(pending)

	    # Clean first, then filter: a fragment made only of quotes must not
	    # survive as an empty string.
	    cleaned = (
	        s.replace('"', '').replace("<", "").replace(">", "") for s in sentences
	    )
	    return [s for s in cleaned if s]

	def is_mixed_language(sentence):
	    """Return True when *sentence* contains characters from at least two of
	    these groups: U+4E00..U+9FFF, kana (U+3040..U+30FF and U+31F0..U+31FF),
	    and ASCII letters.
	    """
	    script_patterns = (
	        r'[\u4e00-\u9fff]',
	        r'[\u3040-\u30ff\u31f0-\u31ff]',
	        r'[a-zA-Z]',
	    )
	    scripts_found = 0
	    for pattern in script_patterns:
	        if re.search(pattern, sentence) is not None:
	            scripts_found += 1
	    return scripts_found > 1

	def split_mixed_language(sentence):
	    """Split *sentence* at quote boundaries and return the stripped fragments.

	    A split point is placed
	      * between sentence-ending punctuation (``。！？.?!``) and a following
	        double quote, or
	      * between a double quote and a following CJK ideograph, kana character
	        or ASCII letter.

	    The previous pattern wrote the alternation as ``\\|``, which in Python's
	    ``re`` syntax is a literal pipe character, so it could never match and the
	    sentence was never split.

	    Fragments that are empty, or that contain nothing but double quotes and
	    whitespace, are discarded. Two adjacent split points can otherwise isolate
	    a lone quote mark.

	    Returns:
	        list[str]: whitespace-stripped fragments in source order.
	    """
	    boundary = re.compile(
	        r'(?<=[。！？\.\?!])(?=")|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])'
	    )
	    fragments = (part.strip() for part in boundary.split(sentence))
	    return [part for part in fragments if part.replace('"', '').strip()]

	def seconds_to_ass_time(seconds):
	    """Convert a duration in seconds to an ASS timestamp ``H:MM:SS.cc``.

	    The value is rounded to the nearest centisecond before being broken into
	    fields, so the fractional part is preserved and a value such as 59.999
	    carries over to ``0:01:00.00``. (The previous code overwrote ``seconds``
	    with an integer before taking its fractional part, so the centisecond
	    field was always ``00``.)

	    Args:
	        seconds: duration in seconds (int or float).

	    Returns:
	        str: hours (at least one digit), two-digit minutes, two-digit seconds
	        and two-digit centiseconds.
	    """
	    total_centiseconds = int(round(seconds * 100))
	    total_seconds, centiseconds = divmod(total_centiseconds, 100)
	    total_minutes, whole_seconds = divmod(total_seconds, 60)
	    hours, minutes = divmod(total_minutes, 60)
	    return "{:01d}:{:02d}:{:02d}.{:02d}".format(
	        hours, minutes, whole_seconds, centiseconds
	    )

	def extract_text_from_epub(file_path):
	    """Return the text of every ``EpubHtml`` item in the EPUB at *file_path*.

	    Each item's content is parsed with BeautifulSoup's ``html.parser`` and
	    reduced to text; the per-item texts are joined with newlines.
	    """
	    book = epub.read_epub(file_path)
	    return '\n'.join(
	        BeautifulSoup(item.content, 'html.parser').get_text()
	        for item in book.items
	        if isinstance(item, epub.EpubHtml)
	    )

	def extract_text_from_pdf(file_path):
	    """Return the text of every page of the PDF at *file_path*, joined by newlines."""
	    with open(file_path, 'rb') as pdf_file:
	        # Extract while the file is still open.
	        pages = PdfReader(pdf_file).pages
	        return '\n'.join([page.extract_text() for page in pages])

	def remove_annotations(text):
	    """Remove bracketed annotations from *text*.

	    Removed, in order:
	      * ``[...]``  square-bracketed spans,
	      * ``<...>``  angle-bracketed spans,
	      * the literal marker ````【oaicite:1】```` (kept for backward
	        compatibility with the previous behaviour),
	      * ``【...】`` spans in Chinese square brackets.

	    The last rule is new: the previous code only removed the literal marker,
	    so ordinary ``【...】`` annotations were left in place. All matches are
	    non-greedy and do not span newlines.

	    Returns:
	        str: the text with the annotations removed.
	    """
	    text = re.sub(r'\[.*?\]', '', text)
	    text = re.sub(r'\<.*?\>', '', text)
	    # Remove the backtick-wrapped marker first so its backticks go with it.
	    text = re.sub(r'``【oaicite:1】``', '', text)
	    text = re.sub(r'【.*?】', '', text)
	    return text

	def extract_text_from_file(inputFile):
	    """Return the text content of *inputFile*, chosen by its extension.

	    The extension is compared case-insensitively. ``.epub`` and ``.pdf`` are
	    delegated to ``extract_text_from_epub`` / ``extract_text_from_pdf``;
	    ``.txt`` is read as UTF-8.

	    Raises:
	        ValueError: if the extension is none of the above.
	    """
	    suffix = os.path.splitext(inputFile)[1].lower()
	    if suffix == ".epub":
	        return extract_text_from_epub(inputFile)
	    if suffix == ".pdf":
	        return extract_text_from_pdf(inputFile)
	    if suffix != ".txt":
	        raise ValueError(f"Unsupported file format: {suffix}")
	    with open(inputFile, 'r', encoding='utf-8') as handle:
	        return handle.read()

	if __name__ == "__main__":
	    # Manual smoke test: run the splitter on a sample that mixes Chinese,
	    # Japanese and English text and print the resulting fragments.
	    demo_text = "你好，这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好！今天我们要介绍VITS项目，其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
	    demo_fragments = extrac(demo_text)
	    print(demo_fragments)