File size: 4,442 Bytes
d1f7ac3
 
 
 
 
 
5422b18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d1f7ac3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5422b18
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import re, os

from ebooklib import epub
import PyPDF2
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup

def is_japanese(string):
    """Return True if ``string`` contains at least one kana-range character.

    A character counts when its code point lies strictly between U+3040 and
    U+30FF (both bounds are exclusive), which spans the hiragana and katakana
    blocks. Kanji are not detected by this check.
    """
    return any(0x3040 < ord(char) < 0x30FF for char in string)

def is_chinese(string):
    """Return True if ``string`` contains a character in U+4E00..U+9FFF.

    The range is inclusive on both ends. Kanji used in Japanese text fall in
    the same range and therefore also make this return True.
    """
    return any('\u4e00' <= char <= '\u9fff' for char in string)

def replace_quotes(text):
    """Normalise quote-like characters in ``text`` to the ASCII double quote.

    Every curly quote, Japanese corner bracket and parenthesis listed in the
    character class below is replaced by a single ``"``.
    """
    quote_like = re.compile(r'[“”‘’『』「」()()]')
    return quote_like.sub('"', text)

def extrac(text):
    """Split ``text`` into a list of sentence-like fragments.

    Processing steps:

    1. Quote-like characters are normalised with ``replace_quotes`` and
       HTML-style tags (``<...>``) are removed.
    2. The text is cut after every newline or sentence terminator
       (``。``, ``!``, ``?``, ``.`` and the fullwidth ``!`` / ``?``); the
       delimiter stays attached to the fragment it ends.
    3. A fragment longer than 20 characters, or one that mixes scripts
       according to ``is_mixed_language``, is further cut by
       ``split_mixed_language``.
    4. Double quotes and stray ``<`` / ``>`` are stripped from every
       fragment, and fragments that are empty afterwards are dropped.

    Args:
        text: The raw input string.

    Returns:
        A list of non-empty fragment strings, in their original order.
    """
    text = replace_quotes(text)  # normalise quotes first
    text = re.sub("<[^>]*>", "", text)  # drop HTML tags

    # Fullwidth "!" (U+FF01) and "?" (U+FF1F) are spelled as escapes so that
    # text normalisation of this file cannot silently fold them into their
    # ASCII counterparts, which would stop CJK sentences from being split.
    delimiter = re.compile(r'([\n。!?.\uff01\uff1f])')

    fragments = []
    pending = ""
    for piece in delimiter.split(text):
        pending += piece
        if not delimiter.match(piece):
            # Ordinary text: keep accumulating until a delimiter arrives.
            continue
        # ``pending`` now holds exactly one delimiter, at its very end, so no
        # additional split on terminators is needed at this point.
        if len(pending) > 20 or is_mixed_language(pending):
            fragments.extend(split_mixed_language(pending))
        else:
            fragments.append(pending)
        pending = ""

    # Trailing text that was not closed by a delimiter.
    if pending:
        fragments.append(pending)

    # Strip the quote/bracket characters first and filter afterwards, so a
    # fragment consisting only of those characters does not survive as "".
    cleaned = (
        fragment.replace('"', '').replace("<", "").replace(">", "")
        for fragment in fragments
    )
    return [fragment for fragment in cleaned if fragment]

def is_mixed_language(sentence):
    """Return True if ``sentence`` contains more than one script family.

    Three families are checked: CJK ideographs (U+4E00..U+9FFF), kana
    (U+3040..U+30FF plus U+31F0..U+31FF) and ASCII Latin letters.
    """
    script_patterns = (
        r'[\u4e00-\u9fff]',
        r'[\u3040-\u30ff\u31f0-\u31ff]',
        r'[a-zA-Z]',
    )
    scripts_found = 0
    for pattern in script_patterns:
        if re.search(pattern, sentence) is not None:
            scripts_found += 1
    return scripts_found > 1

def split_mixed_language(sentence):
    """Cut ``sentence`` at quote boundaries and return the trimmed pieces.

    A cut is made at every position that is either
    (a) right after a sentence terminator and right before a ``"``, or
    (b) right after a ``"`` and right before a CJK, kana or Latin letter.
    Pieces are whitespace-stripped; pieces that are empty after stripping
    are omitted.
    """
    boundary = re.compile(
        r'(?<=[。!?\.\?!])(?=")|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])'
    )
    pieces = []
    for chunk in boundary.split(sentence):
        trimmed = chunk.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces

def seconds_to_ass_time(seconds):
    """Convert a duration in seconds to an ASS timestamp (``H:MM:SS.cc``).

    Args:
        seconds: Duration in seconds, as an int or float.

    Returns:
        A string with hours, zero-padded minutes and seconds, and a two-digit
        centisecond field, e.g. ``"1:01:01.25"`` for ``3661.25``.
    """
    # Work in whole centiseconds. The fractional part has to be captured
    # before anything is truncated to an int (otherwise the centisecond field
    # is always "00"), and rounding avoids float artefacts such as 2.3
    # turning into ".29". Rounding up also carries cleanly into the seconds.
    total_centiseconds = int(round(seconds * 100))
    total_seconds, centiseconds = divmod(total_centiseconds, 100)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "{:01d}:{:02d}:{:02d}.{:02d}".format(hours, minutes, secs, centiseconds)

def extract_text_from_epub(file_path):
    """Return the plain text of every HTML document inside an EPUB file.

    The book is read with ``epub.read_epub``; each item that is an
    ``epub.EpubHtml`` is parsed with BeautifulSoup's ``html.parser`` and
    reduced to its text. The per-document texts are joined with newlines.
    """
    book = epub.read_epub(file_path)
    html_documents = (
        item for item in book.items if isinstance(item, epub.EpubHtml)
    )
    return '\n'.join(
        BeautifulSoup(document.content, 'html.parser').get_text()
        for document in html_documents
    )

def extract_text_from_pdf(file_path):
    """Return the text of all pages of a PDF file, joined with newlines.

    The file is opened in binary mode and read through ``PdfReader``; page
    texts are collected while the file handle is still open.
    """
    page_texts = []
    with open(file_path, 'rb') as pdf_file:
        for page in PdfReader(pdf_file).pages:
            page_texts.append(page.extract_text())
    return '\n'.join(page_texts)

def remove_annotations(text):
    """Remove bracketed annotations from ``text``.

    Removes, in this order:
      * ``[...]`` ASCII square-bracket spans,
      * ``<...>`` angle-bracket spans,
      * the literal citation artifact matched by the original pattern,
      * ``【...】`` fullwidth (CJK) square-bracket spans.

    All matches are non-greedy and do not cross line breaks, because ``.``
    does not match a newline here.

    Args:
        text: The input string.

    Returns:
        The string with all matched annotations deleted.
    """
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>', '', text)
    # Kept from the original code so inputs containing this exact artifact
    # are still cleaned completely (including the surrounding markup).
    text = re.sub(r'&#8203;``【oaicite:1】``&#8203;', '', text)
    # The original only matched the literal above and therefore never removed
    # ordinary fullwidth-bracket annotations; handle them in general.
    text = re.sub(r'\u3010.*?\u3011', '', text)
    return text

def extract_text_from_file(inputFile):
    """Read the text content of ``inputFile`` according to its extension.

    The extension is compared case-insensitively. ``.epub`` and ``.pdf`` are
    delegated to ``extract_text_from_epub`` / ``extract_text_from_pdf``;
    ``.txt`` is read as UTF-8.

    Raises:
        ValueError: If the extension is none of the supported ones.
    """
    _, suffix = os.path.splitext(inputFile)
    file_extension = suffix.lower()

    if file_extension == ".epub":
        return extract_text_from_epub(inputFile)
    if file_extension == ".pdf":
        return extract_text_from_pdf(inputFile)
    if file_extension != ".txt":
        raise ValueError(f"Unsupported file format: {file_extension}")

    with open(inputFile, 'r', encoding='utf-8') as handle:
        return handle.read()

if __name__ == "__main__":
    # Manual smoke test: run the splitter on a sample that mixes Chinese,
    # Japanese and English, and print the resulting fragments.
    sample = "你好,这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
    fragments = extrac(sample)
    print(fragments)