import re, os
from ebooklib import epub
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
def is_japanese(string):
    # True if the string contains any hiragana or katakana (U+3040-U+30FF)
    for ch in string:
        if '\u3040' <= ch <= '\u30ff':
            return True
    return False
def is_chinese(string):
    # True if the string contains any CJK unified ideograph (U+4E00-U+9FFF)
    for ch in string:
        if '\u4e00' <= ch <= '\u9fff':
            return True
    return False
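# Usage sketch (illustrative): kana trips is_japanese, ideographs trip is_chinese.
# Kanji fall inside the CJK ideograph range, so Japanese text containing kanji
# also satisfies is_chinese -- only the kana check is uniquely Japanese.
#   is_japanese("こんにちは")  # True (hiragana)
#   is_chinese("你好")         # True (CJK ideographs)
#   is_chinese("日本語")       # also True (kanji share the CJK block)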
def replace_quotes(text):
    # Normalize Chinese/Japanese quotation marks and parentheses to ASCII double quotes
    text = re.sub(r'[“”‘’『』「」()()]', '"', text)
    return text
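# Example (traced by hand): every character in the class above maps to '"',
# including both full-width and ASCII parentheses:
#   replace_quotes('他说:“你好”(笑)')  ->  '他说:"你好""笑"'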
def extrac(text):
    """Split raw text into short, mostly single-language sentences."""
    text = replace_quotes(text)  # normalize quotation marks
    text = re.sub("<[^>]*>", "", text)  # strip HTML tags
    # Preliminary split on newlines and sentence-ending punctuation
    preliminary_sentences = re.split(r'([\n。!?\.\?!])', text)
    final_sentences = []
    temp_sentence = ""
    for piece in preliminary_sentences:
        if re.match(r'[\n。!?\.\?!]', piece):
            temp_sentence += piece
            # Split into sentences, keeping the trailing punctuation
            sub_sentences = re.split(r'(?<=[。!?\.\?!])', temp_sentence)
            for sub_sentence in sub_sentences:
                # Long or mixed-language sentences get split further
                if len(sub_sentence) > 20 or is_mixed_language(sub_sentence):
                    final_sentences.extend(split_mixed_language(sub_sentence))
                else:
                    final_sentences.append(sub_sentence)
            temp_sentence = ""
        else:
            temp_sentence += piece
    # Append the trailing sentence, if any
    if temp_sentence:
        final_sentences.append(temp_sentence)
    # Strip leftover quotes/angle brackets and drop empty strings
    return [s.replace('"', '').replace("<", "").replace(">", "") for s in final_sentences if s]
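# Example (traced by hand): a short bilingual string splits at the sentence
# punctuation, and each piece stays intact because neither is mixed-language:
#   extrac("你好。Hello world.")  ->  ['你好。', 'Hello world.']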
def is_mixed_language(sentence):
    # A sentence is "mixed" when it contains more than one of: Chinese
    # ideographs, Japanese kana, or Latin letters
    contains_chinese = re.search(r'[\u4e00-\u9fff]', sentence) is not None
    contains_japanese = re.search(r'[\u3040-\u30ff\u31f0-\u31ff]', sentence) is not None
    contains_english = re.search(r'[a-zA-Z]', sentence) is not None
    languages_count = sum([contains_chinese, contains_japanese, contains_english])
    return languages_count > 1
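# Example: a sentence hitting two of {Chinese, Japanese kana, English} is mixed:
#   is_mixed_language("Hello 世界")  # True  (English + Chinese)
#   is_mixed_language("你好世界")    # False (Chinese only)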
def split_mixed_language(sentence):
    # Split a mixed-language sentence at quote boundaries
    sub_sentences = re.split(r'(?<=[。!?\.\?!])(?=")|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])', sentence)
    return [s.strip() for s in sub_sentences if s.strip()]
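# Example (traced by hand): after replace_quotes has normalized all quotes to '"',
# the split fires after sentence punctuation that precedes a quote, and after a
# quote that precedes CJK/kana/Latin text:
#   split_mixed_language('你好。"Hello."')  ->  ['你好。', '"', 'Hello.', '"']
# The stray '"' fragments are removed later by extrac's final cleanup pass.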
def seconds_to_ass_time(seconds):
    """Convert seconds to ASS time format (H:MM:SS.cc)."""
    hours = int(seconds / 3600)
    minutes = int((seconds % 3600) / 60)
    # Take the fractional part before the value is truncated below
    milliseconds = int((seconds - int(seconds)) * 1000)
    seconds = int(seconds) % 60
    return "{:01d}:{:02d}:{:02d}.{:02d}".format(hours, minutes, seconds, int(milliseconds / 10))
def extract_text_from_epub(file_path):
    book = epub.read_epub(file_path)
    content = []
    for item in book.items:
        if isinstance(item, epub.EpubHtml):
            soup = BeautifulSoup(item.content, 'html.parser')
            content.append(soup.get_text())
    return '\n'.join(content)
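# Note: isinstance-checking book.items for epub.EpubHtml picks up the XHTML
# documents; an equivalent filter in ebooklib is
# item.get_type() == ebooklib.ITEM_DOCUMENT (which needs `import ebooklib`).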
def extract_text_from_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        content = [page.extract_text() for page in reader.pages]
    return '\n'.join(content)
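# Caveat: PyPDF2's extract_text() is layout-dependent; scanned or image-only
# pages yield empty strings, so image-heavy PDFs may come back mostly blank.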
def remove_annotations(text):
    # Strip bracketed annotations: [..], <..>, and CJK 【..】 (non-greedy per pair)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'【.*?】', '', text)
    return text
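# Example: all three bracket styles are stripped, pair by pair:
#   remove_annotations('正文[注1]<i>补充</i>【注2】')  ->  '正文补充'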
def extract_text_from_file(inputFile):
    file_extension = os.path.splitext(inputFile)[1].lower()
    if file_extension == ".epub":
        return extract_text_from_epub(inputFile)
    elif file_extension == ".pdf":
        return extract_text_from_pdf(inputFile)
    elif file_extension == ".txt":
        with open(inputFile, 'r', encoding='utf-8') as f:
            return f.read()
    else:
        raise ValueError(f"Unsupported file format: {file_extension}")
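# Usage sketch (hypothetical paths): dispatch is purely by file extension.
#   extract_text_from_file("book.epub")  # EPUB -> HTML items -> plain text
#   extract_text_from_file("notes.txt")  # plain UTF-8 read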
if __name__ == "__main__": | |
text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。" | |
print(extrac(text)) | |
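    # Expected behavior: the paragraph above comes back as a list of short,
    # mostly single-language pieces with quotes and brackets stripped.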