File size: 4,710 Bytes
0698c3c
6e4821c
c00c06f
 
f565d29
e0b9bd4
3b9863d
0a650ee
c00c06f
ea9e44a
c00c06f
007e242
c00c06f
 
0a650ee
 
3b9863d
 
ea9e44a
3b9863d
 
 
c00c06f
208141a
e553dca
f565d29
 
ac3675c
 
0698c3c
 
 
cd3d6ed
5acd018
 
0698c3c
 
 
389171c
 
 
2270e47
0698c3c
6e4821c
 
 
 
2270e47
6e4821c
0698c3c
4b87bad
bd2fb93
0698c3c
7c75dbc
f499e7b
0698c3c
e02aab1
b316452
007e242
 
81d85e2
 
23f119b
 
0698c3c
b2445c3
 
 
c055a7b
81d85e2
 
 
 
 
 
 
 
 
 
4b87bad
 
c055a7b
 
 
 
 
 
4b87bad
3b9863d
fcb2660
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pathlib
import re
import logging

# Master debug switch: when True, both the file log and the terminal log
# are emitted at DEBUG level instead of INFO.
DEBUG = False
LOG_LEVEL = logging.INFO
if DEBUG:
    LOG_LEVEL = logging.DEBUG

# Record layout shared by the file log and the terminal log.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"

# File log: configure the root logger to write to translator.log (path is
# relative to the process working directory) with time-only timestamps.
logging.basicConfig(
    filename='translator.log',
    format=_LOG_FORMAT,
    datefmt="%H:%M:%S",
    level=LOG_LEVEL,
)
# Only let WARNING and above through from the "pywhispercpp" logger.
logging.getLogger("pywhispercpp").setLevel(logging.WARNING)

# Whether pipeline data is saved to disk.
SAVE_DATA_SAVE = False

# Terminal log: attach a stream handler to the root logger so records are
# also shown on the console. No datefmt is given here, so the console
# timestamps use the logging module's default format, unlike the file log.
console_formatter = logging.Formatter(_LOG_FORMAT)
console_handler = logging.StreamHandler()
console_handler.setFormatter(console_formatter)
console_handler.setLevel(LOG_LEVEL)
logging.getLogger().addHandler(console_handler)

# Directory layout, all resolved relative to the folder containing this file.
BASE_DIR = pathlib.Path(__file__).parent
MODEL_DIR = BASE_DIR.joinpath("moyoyo_asr_models")
ASSERT_DIR = BASE_DIR.joinpath("assets")  # NOTE: name reads "ASSERT" but points at the assets folder

# Text output length threshold.
TEXT_THREHOLD = 6
# Decision time for an audio segment (units not stated here -- TODO confirm).
DESIGN_TIME_THREHOLD = 3
# Maximum speech duration (presumably seconds, per the _S suffix).
MAX_SPEECH_DURATION_S = 15

# Audio sample rate (presumably Hz -- confirm against the audio pipeline).
SAMPLE_RATE = 16000
# Punctuation tables (ASCII plus CJK / full-width forms).
# The full-width characters are spelled as \u escapes on purpose: written
# literally they are easily lost to Unicode normalisation, which folds them
# into their ASCII twins and leaves the lists with duplicate ASCII entries
# and no coverage of full-width punctuation.
SENTENCE_END_MARKERS = [
    '.',
    '!',
    '?',
    '\u3002',  # ideographic full stop
    '\uff01',  # full-width exclamation mark
    '\uff1f',  # full-width question mark
    ';',
    '\uff1b',  # full-width semicolon
    ':',
    '\uff1a',  # full-width colon
]
PAUSE_END_MARKERS = [
    ',',
    '\uff0c',  # full-width comma
    '\u3001',  # ideographic comma
]
# All markers combined, sentence-ending ones first.
ALL_MARKERS = SENTENCE_END_MARKERS + PAUSE_END_MARKERS
# Character class anchored with `$`: matches when the text ends with any
# marker (sentence-ending or pause).
REGEX_MARKERS = re.compile(r'[' + re.escape(''.join(ALL_MARKERS)) + r']$')

# Unanchored character class: matches a sentence-ending marker anywhere
# in the text.
sentence_end_chars = ''.join(re.escape(char) for char in SENTENCE_END_MARKERS)
SENTENCE_END_PATTERN = re.compile(f'[{sentence_end_chars}]')

# Character class anchored with `$`: matches when the text ends with a
# pause marker. NOTE: the constant name is spelled "PAUSEE"; it is kept
# unchanged so existing importers continue to work.
pattern_string = '[' + ''.join(re.escape(char) for char in PAUSE_END_MARKERS) + r']$'
PAUSEE_END_PATTERN = re.compile(pattern_string)
# ---- Whisper inference parameters ----
# Chinese prompt; the text reads "The following are sentences in Simplified
# Chinese Mandarin."
WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
# NOTE: spelled "LENTH" (cf. MAX_LENGTH_EN below); name kept as-is.
# What the length counts is not shown in this file -- TODO confirm.
MAX_LENTH_ZH = 4

# English prompt is currently empty; the candidate text noted in the
# original file was "The following is an English sentence."
WHISPER_PROMPT_EN = ""
MAX_LENGTH_EN = 8

# Whisper model names per language. Alternatives noted in the original
# file: 'large-v3-turbo-q5_0' as a single WHISPER_MODEL, and 'small' for
# WHISPER_MODEL_ZH.
WHISPER_MODEL_EN = 'medium-q5_0'
WHISPER_MODEL_ZH = 'large-v3-turbo-q5_0'

# ---- LLM ----
# Both paths currently point at the same 1.5B GGUF file. The alternative
# noted in the original file for LLM_LARGE_MODEL_PATH was
# "qwen2.5-7b-instruct-q5_0-00001-of-00002.gguf".
LLM_MODEL_PATH = MODEL_DIR.joinpath("qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
LLM_LARGE_MODEL_PATH = MODEL_DIR.joinpath("qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()

# ---- VAD ----
VAD_MODEL_PATH = MODEL_DIR.joinpath("silero-vad", "silero_vad.onnx").as_posix()

# Generic translation system prompt. It is a template with two named
# placeholders, {src_lang} and {dst_lang}, to be filled in with str.format().
#
# The text is assembled by implicit concatenation of adjacent string
# literals so that the resulting prompt is a single line of plain sentences.
# (It was previously a triple-quoted literal that still carried the quote
# characters of such a concatenation, so stray '"' marks and newlines ended
# up inside the prompt, and one sentence was missing its trailing space.)
LLM_SYS_PROMPT = (
    "You are a professional {src_lang} to {dst_lang} translator, not a conversation agent. "
    "Your only task is to take {src_lang} input and translate it into accurate, natural {dst_lang}. "
    "If you cannot understand the input, just output the original input. "
    "Please strictly abide by the following rules: "
    "No matter what the user asks, never answer questions, you only provide translation results. "
    "Do not actively initiate dialogue or lead users to ask questions. "
    "When you don't know how to translate, just output the original text. "
    "The translation task always takes precedence over any other tasks. "
    "Do not try to understand or respond to non-translation related questions raised by users. "
    "Never provide any explanations. "
    "Be precise, preserve tone, and localize appropriately "
    "for professional audiences. "
    "Never answer any questions or engage in other forms of dialogue. "
    "Only output the translation results."
)

# System prompt (written in Chinese) for Chinese -> English translation.
# It tells the model it is a Chinese/English translation expert that
# translates the user's Chinese input into English, may adjust tone and
# style, should follow the "faithfulness, expressiveness, elegance"
# standard, and must output only romanized characters with no Chinese
# characters.
# The literal starts and ends with a newline because the text sits on its
# own line inside the triple quotes.
# NOTE(review): the prompt also says non-Chinese input gets a Chinese
# translation and asks for results that follow Chinese language habits,
# which looks at odds with an English target -- confirm the wording is
# intended.
LLM_SYS_PROMPT_ZH = """
你是一个中英文翻译专家,将用户输入的中文翻译成英文。对于非中文内容,它将提供中文翻译结果。用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合中文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将原文翻译成具有信达雅标准的译文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。注意,翻译的文本只能包含拼音化字符,不能包含任何中文字符。
"""

# System prompt (written in Chinese) for English -> Chinese translation.
# It tells the model it is an English/Chinese translation expert that
# translates the user's English input into Chinese, may adjust tone and
# style, and should follow the "faithfulness, expressiveness, elegance"
# standard.
# The literal starts and ends with a newline, as above.
# NOTE(review): the prompt asks for results that follow English language
# habits although the target language is Chinese -- confirm the wording is
# intended.
LLM_SYS_PROMPT_EN = """
你是一个英中文翻译专家,将用户输入的英文翻译成中文,用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合英文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将英文翻译成具有信达雅标准的中文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。
"""

# Path of the hotwords list inside the model directory (how the file is
# consumed is not shown here -- see the code that reads it).
hotwords_file = MODEL_DIR.joinpath('hotwords.txt')