"""Split mixed Chinese / Japanese / English text into short segments."""

import re

# Members of the sentence-terminator character class: ideographic full stop,
# ASCII "!", "?", "." and the full-width "!" / "?" (U+FF01, U+FF1F).  The
# full-width forms are written as escapes so they survive any re-encoding or
# normalization of this source file.
_TERMINATORS = r'。!?.\uff01\uff1f'
# Hiragana and Katakana (U+3040-U+30FF) plus Katakana Phonetic Extensions.
_KANA = r'\u3040-\u30ff\u31f0-\u31ff'
# CJK Unified Ideographs.
_HAN = r'\u4e00-\u9fff'

# Curly quotes, Japanese corner brackets, ASCII and full-width parentheses.
_QUOTE_RE = re.compile(r'[“”‘’『』「」()\uff08\uff09]')
_TAG_RE = re.compile(r'<[^>]*>')
# The capture group makes re.split keep each delimiter as its own list item.
_BOUNDARY_RE = re.compile(r'([\n' + _TERMINATORS + r'])')
_HAN_RE = re.compile('[' + _HAN + ']')
_KANA_RE = re.compile('[' + _KANA + ']')
_LATIN_RE = re.compile(r'[a-zA-Z]')
# Zero-width split points: between a terminator and a following quote, or
# right after a quote that is followed by a CJK, kana or Latin letter.
_MIXED_SPLIT_RE = re.compile(
    r'(?<=[' + _TERMINATORS + r'])(?=")'
    r'|(?<=")(?=[' + _HAN + _KANA + r'a-zA-Z])'
)

# Characters removed from every segment before it is returned.
_DROP_CHARS = str.maketrans('', '', '"<>')
# Sentences longer than this are always passed to split_mixed_language.
_MAX_PLAIN_LENGTH = 20


def is_japanese(string):
    """Return True if *string* contains at least one kana character.

    Covers U+3040-U+30FF (inclusive) and U+31F0-U+31FF, the same ranges that
    is_mixed_language() uses, so the two functions agree.
    """
    return any(
        '\u3040' <= ch <= '\u30ff' or '\u31f0' <= ch <= '\u31ff'
        for ch in string
    )


def is_chinese(string):
    """Return True if *string* contains a CJK Unified Ideograph (U+4E00-U+9FFF)."""
    return any('\u4e00' <= ch <= '\u9fff' for ch in string)


def replace_quotes(text):
    """Replace Chinese/Japanese quotes and parentheses with an ASCII '"'."""
    return _QUOTE_RE.sub('"', text)


def is_mixed_language(sentence):
    """Return True if *sentence* mixes at least two of: Han, kana, Latin letters."""
    scripts_present = [
        _HAN_RE.search(sentence) is not None,
        _KANA_RE.search(sentence) is not None,
        _LATIN_RE.search(sentence) is not None,
    ]
    return sum(scripts_present) > 1


def split_mixed_language(sentence):
    """Split *sentence* at quote boundaries and return the non-blank parts.

    A split happens between a sentence terminator and a following '"', and
    directly after a '"' that is followed by a CJK, kana or Latin letter.
    Each part is whitespace-stripped; blank parts are dropped.  The split
    points are zero-width, which re.split only honours on Python 3.7+.
    """
    stripped = (part.strip() for part in _MIXED_SPLIT_RE.split(sentence))
    return [part for part in stripped if part]


def _split_sentence(sentence):
    """Return *sentence* as is, or split further if it is long or mixed-language."""
    if len(sentence) > _MAX_PLAIN_LENGTH or is_mixed_language(sentence):
        return split_mixed_language(sentence)
    return [sentence]


def extrac(text):
    """Split *text* into cleaned sentence segments.

    Steps: normalize quotes to '"', remove HTML-like tags, cut after every
    newline or sentence terminator (the terminator stays attached to its
    sentence), then split long (> 20 characters) or mixed-language sentences
    at quote boundaries.

    Returns:
        A list of non-empty strings with '"', '<' and '>' removed and
        surrounding whitespace stripped.
    """
    text = replace_quotes(text)
    text = _TAG_RE.sub('', text)

    # With one capture group re.split yields text, delimiter, text, ...,
    # always ending on a (possibly empty) text piece.
    pieces = _BOUNDARY_RE.split(text)
    segments = []
    for body, delimiter in zip(pieces[0::2], pieces[1::2]):
        segments.extend(_split_sentence(body + delimiter))
    # The unterminated tail gets the same treatment as terminated sentences.
    segments.extend(_split_sentence(pieces[-1]))

    # Drop empties only after cleaning, so a segment made solely of quotes,
    # angle brackets or whitespace does not come back as ''.
    cleaned = (segment.translate(_DROP_CHARS).strip() for segment in segments)
    return [segment for segment in cleaned if segment]


if __name__ == "__main__":
    text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
    print(extrac(text))