Mahiruoshi's picture
Update tools/sentence.py
d1f7ac3
raw
history blame
4.44 kB
import re, os
from ebooklib import epub
import PyPDF2
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
def is_japanese(string):
    """Return True if *string* contains at least one kana character.

    A character counts as kana when it lies in U+3040-U+30FF (Hiragana and
    Katakana) or U+31F0-U+31FF (Katakana Phonetic Extensions). These are the
    same ranges that ``is_mixed_language`` uses for Japanese, so the two
    helpers agree on what "Japanese" means. Kanji are not considered here.

    Args:
        string: Text to inspect.

    Returns:
        bool: True when any character is kana, otherwise False (including
        for the empty string).
    """
    # Bounds are inclusive so that U+30FF, the last code point of the
    # Katakana block, is recognised as well.
    return any(
        '\u3040' <= ch <= '\u30ff' or '\u31f0' <= ch <= '\u31ff'
        for ch in string
    )
def is_chinese(string):
    """Return True if *string* contains a CJK Unified Ideograph.

    Only the basic block U+4E00-U+9FFF is checked.

    Args:
        string: Text to inspect.

    Returns:
        bool: True when at least one character falls in that block.
    """
    return any('\u4e00' <= char <= '\u9fff' for char in string)
def replace_quotes(text):
    """Normalise quote-like delimiters to the ASCII double quote.

    Curly quotes, Japanese corner brackets, and both ASCII and full-width
    parentheses are each replaced by a single ``"`` character.

    Args:
        text: Input string.

    Returns:
        str: *text* with every listed delimiter replaced by ``"``.
    """
    # \uff08 / \uff09 are the full-width parentheses. They are written as
    # escapes so they cannot be silently folded into ASCII "(" / ")", which
    # would leave the class with duplicate entries and the full-width forms
    # unhandled.
    text = re.sub(r'[“”‘’『』「」()\uff08\uff09]', '"', text)
    return text
def extrac(text):
    """Split *text* into a list of sentence-like fragments.

    Steps:
      1. Quote-like delimiters are normalised with ``replace_quotes`` and
         ``<...>`` tags are removed.
      2. The text is cut at newlines and sentence-ending punctuation; each
         terminator stays attached to the fragment it ends.
      3. A terminated fragment that is longer than 20 characters or mixes
         scripts (per ``is_mixed_language``) is split further with
         ``split_mixed_language``.
      4. Text after the last terminator is kept as one final fragment.
      5. ``"``, ``<`` and ``>`` are stripped from every fragment and
         fragments that end up empty are dropped.

    Args:
        text: Raw input text.

    Returns:
        list[str]: The non-empty fragments, in document order.
    """
    # Sentence-ending punctuation. \uff01 / \uff1f are the full-width
    # "!" / "?" used in Chinese and Japanese text; they are written as
    # escapes so they cannot be folded into their ASCII look-alikes.
    enders = r'。\uff01\uff1f\.\?!'
    delimiter = re.compile(r'[\n' + enders + r']')

    text = replace_quotes(text)  # normalise quotes
    text = re.sub("<[^>]*>", "", text)  # remove HTML tags

    final_sentences = []
    pending = ""
    # The capturing group makes re.split return the delimiters as their own
    # pieces, so they can be re-attached to the preceding text.
    for piece in re.split(r'([\n' + enders + r'])', text):
        if not delimiter.match(piece):
            pending += piece
            continue

        pending += piece
        # Split after punctuation while keeping the punctuation itself.
        for sub_sentence in re.split(r'(?<=[' + enders + r'])', pending):
            if len(sub_sentence) > 20 or is_mixed_language(sub_sentence):
                # Long or mixed-script fragment: split it further.
                final_sentences.extend(split_mixed_language(sub_sentence))
            else:
                final_sentences.append(sub_sentence)
        pending = ""

    # Keep trailing text that was not closed by a terminator.
    if pending:
        final_sentences.append(pending)

    # Strip the marker characters first and filter afterwards, so that a
    # fragment consisting only of quotes/brackets does not survive as "".
    strip_markers = str.maketrans('', '', '"<>')
    cleaned = (s.translate(strip_markers) for s in final_sentences)
    return [s for s in cleaned if s]
def is_mixed_language(sentence):
    """Return True if *sentence* contains more than one script.

    Three scripts are probed: CJK ideographs (U+4E00-U+9FFF), kana
    (U+3040-U+30FF and U+31F0-U+31FF) and ASCII letters. Text with both
    ideographs and kana therefore counts as mixed.

    Args:
        sentence: Text to inspect.

    Returns:
        bool: True when at least two of the three scripts are present.
    """
    script_patterns = (
        r'[\u4e00-\u9fff]',               # Chinese ideographs
        r'[\u3040-\u30ff\u31f0-\u31ff]',  # Japanese kana
        r'[a-zA-Z]',                      # English letters
    )
    scripts_present = 0
    for pattern in script_patterns:
        if re.search(pattern, sentence):
            scripts_present += 1
    return scripts_present >= 2
def split_mixed_language(sentence):
    """Split *sentence* at quote boundaries.

    A cut is made at two kinds of zero-width positions:
      * between sentence-ending punctuation and a following ``"``;
      * between a ``"`` and a following CJK ideograph, kana or ASCII letter.

    Args:
        sentence: Text whose quotes have already been normalised to ``"``.

    Returns:
        list[str]: The pieces, whitespace-stripped, with empty pieces
        removed.
    """
    # \uff01 / \uff1f are the full-width "!" / "?"; written as escapes so
    # they cannot be folded into their ASCII look-alikes, which would leave
    # full-width terminators unrecognised.
    sub_sentences = re.split(
        r'(?<=[。\uff01\uff1f\.\?!])(?=")'
        r'|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])',
        sentence,
    )
    stripped = (s.strip() for s in sub_sentences)
    return [s for s in stripped if s]
def seconds_to_ass_time(seconds):
    """Convert a number of seconds to an ASS timestamp (``H:MM:SS.cc``).

    Args:
        seconds: Non-negative time offset in seconds (int or float).

    Returns:
        str: Hours (at least one digit), two-digit minutes, two-digit
        seconds and two-digit centiseconds.
    """
    # Work from one integer centisecond count. The fractional part must be
    # taken from the original value: overwriting ``seconds`` with its integer
    # part first would make the centisecond field always "00". Rounding
    # guards against float artefacts such as 1.15 * 100 == 114.999...
    total_centiseconds = int(round(seconds * 100))
    whole_seconds, centiseconds = divmod(total_centiseconds, 100)
    total_minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "{:01d}:{:02d}:{:02d}.{:02d}".format(hours, minutes, secs, centiseconds)
def extract_text_from_epub(file_path):
    """Return the plain text of every HTML document in an EPUB file.

    The book is loaded with ``epub.read_epub``. Each item that is an
    ``epub.EpubHtml`` is parsed with BeautifulSoup's ``html.parser`` and
    reduced to its text; the per-document texts are joined with newlines.

    Args:
        file_path: Path to the ``.epub`` file.

    Returns:
        str: Newline-joined text of all HTML items.
    """
    book = epub.read_epub(file_path)
    html_items = (item for item in book.items if isinstance(item, epub.EpubHtml))
    return '\n'.join(
        BeautifulSoup(item.content, 'html.parser').get_text()
        for item in html_items
    )
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        file_path: Path to the ``.pdf`` file.

    Returns:
        str: The ``extract_text()`` result of each page, newline-joined.
    """
    with open(file_path, 'rb') as pdf_file:
        # Pages are read while the file handle is still open.
        page_texts = []
        for page in PdfReader(pdf_file).pages:
            page_texts.append(page.extract_text())
        return '\n'.join(page_texts)
def remove_annotations(text):
    """Remove bracketed annotations from *text*.

    Deleted, in order:
      * ``[...]``  - square-bracketed content;
      * ``<...>``  - angle-bracketed content;
      * the literal citation artifact ``&#8203;``【oaicite:1】``&#8203;``;
      * ``【...】`` - content in CJK lenticular brackets.

    Matching is non-greedy and does not span line breaks.

    Args:
        text: Input string.

    Returns:
        str: *text* without the annotations.
    """
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>', '', text)
    # The previous third pattern was only this literal artifact, so "【...】"
    # content was never removed. The literal is still stripped first so
    # inputs that contain it yield the same result as before.
    text = text.replace('&#8203;``【oaicite:1】``&#8203;', '')
    text = re.sub(r'【.*?】', '', text)
    return text
def extract_text_from_file(inputFile):
    """Load the text content of a file, chosen by its extension.

    The extension is compared case-insensitively. ``.epub`` and ``.pdf``
    files go to their dedicated extractors; ``.txt`` files are read as
    UTF-8.

    Args:
        inputFile: Path to the input file.

    Returns:
        str: The extracted text.

    Raises:
        ValueError: If the extension is not ``.epub``, ``.pdf`` or ``.txt``.
    """
    _, raw_suffix = os.path.splitext(inputFile)
    suffix = raw_suffix.lower()

    if suffix == ".epub":
        return extract_text_from_epub(inputFile)
    if suffix == ".pdf":
        return extract_text_from_pdf(inputFile)
    if suffix != ".txt":
        raise ValueError(f"Unsupported file format: {suffix}")

    with open(inputFile, 'r', encoding='utf-8') as handle:
        return handle.read()
if __name__ == "__main__":
    # Manual smoke test: run the splitter on a sample that mixes Chinese,
    # Japanese and English, and print the resulting fragments.
    demo_text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
    fragments = extrac(demo_text)
    print(fragments)