daihui.zhang committed
Commit f565d29
Parent(s): fcb2660

add DESIGN_TIME_THREHOLD

Files changed:
- config.py +3 -1
- transcribe/pipelines/pipe_vad.py +3 -27
- transcribe/whisper_llm_serve.py +74 -41
config.py CHANGED
@@ -2,7 +2,7 @@ import pathlib
 import re
 import logging
 
-DEBUG =
+DEBUG = False
 LOG_LEVEL = logging.DEBUG if DEBUG else logging.INFO
 
 logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
@@ -23,6 +23,8 @@ logging.getLogger().addHandler(console_handler)
 
 # text output length threshold
 TEXT_THREHOLD = 6
+# decision time for an audio segment
+DESIGN_TIME_THREHOLD = 3
 
 BASE_DIR = pathlib.Path(__file__).parent
 MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
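For context, DESIGN_TIME_THREHOLD is consumed in transcribe/whisper_llm_serve.py (below) as the minimum number of seconds a speech buffer must have been accumulating before a VAD "END" event is allowed to flush it into the full-segment queue. The following is a minimal standalone sketch of that gating decision, not the service code itself; the on_vad_frame helper and its module-level state are illustrative only, with names mirroring the diff.

import collections
import time

import numpy as np

DESIGN_TIME_THREHOLD = 3  # seconds; mirrors the new constant in config.py

frames_np = np.array([], dtype=np.float32)   # rolling speech buffer
frames_np_start_timestamp = None             # wall-clock time the buffer started filling
full_segments_queue = collections.deque()    # finished segments awaiting transcription


def on_vad_frame(frame_np: np.ndarray, speech_status: str) -> None:
    """Illustrative version of the buffering decision in _frame_processing_loop."""
    global frames_np, frames_np_start_timestamp
    if speech_status == "START" and frames_np_start_timestamp is None:
        frames_np_start_timestamp = time.time()
    frames_np = np.append(frames_np, frame_np)
    if speech_status == "END" and len(frames_np) > 0 and frames_np_start_timestamp:
        # Flush only if the buffer has been growing for at least
        # DESIGN_TIME_THREHOLD seconds; otherwise keep accumulating.
        if time.time() - frames_np_start_timestamp >= DESIGN_TIME_THREHOLD:
            full_segments_queue.appendleft(frames_np.copy())
            frames_np = np.array([], dtype=np.float32)
            frames_np_start_timestamp = None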
transcribe/pipelines/pipe_vad.py CHANGED
@@ -1,6 +1,6 @@
 
 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import FixedVADIterator
+from ..helpers.vadprocessor import FixedVADIterator
 
 import numpy as np
 import logging
@@ -16,15 +16,12 @@ class VadPipe(BasePipe):
         super().__init__(in_queue, out_queue)
         self._offset = 0  # processed frame size offset
         self._status = 'END'
-        self.last_state_change_offset = 0
-        self.adaptive_ctrl = AdaptiveSilenceController()
 
 
     def reset(self):
         self._offset = 0
         self._status = 'END'
-
-        self.adaptive_ctrl = AdaptiveSilenceController()
+
         self.vac.reset_states()
 
     @classmethod
@@ -53,16 +50,9 @@ class VadPipe(BasePipe):
         if start_frame:
             relative_start_frame =start_frame - self._offset
         if end_frame:
-            relative_end_frame =
+            relative_end_frame = end_frame - self._offset
         return relative_start_frame, relative_end_frame
 
-    def update_silence_ms(self):
-        min_silence = self.adaptive_ctrl.get_adaptive_silence_ms()
-        min_silence_samples = self.sample_rate * min_silence / 1000
-        old_silence_samples = self.vac.min_silence_samples
-        logging.warning(f"🫠 update_silence_ms :{old_silence_samples * 1000 / self.sample_rate :.2f}ms => current: {min_silence}ms ")
-        # self.vac.min_silence_samples = min_silence_samples
-
     def process(self, in_data: MetaItem) -> MetaItem:
         if self._offset == 0:
             self.vac.reset_states()
@@ -77,29 +67,15 @@ class VadPipe(BasePipe):
             if rel_start_frame is not None and rel_end_frame is None:
                 self._status = "START"  # speech started
                 target_audio = source_audio[rel_start_frame:]
-
-                # length of the preceding silence
-                silence_len = (self._offset + rel_start_frame - self.last_state_change_offset) / self.sample_rate * 1000
-                self.adaptive_ctrl.update_silence(silence_len)
-                self.last_state_change_offset = self._offset + rel_start_frame
-
                 logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
             elif rel_start_frame is None and rel_end_frame is not None:
                 self._status = "END"  # speech ended
                 target_audio = source_audio[:rel_end_frame]
-
-                speech_len = (rel_end_frame) / self.sample_rate * 1000
-                self.adaptive_ctrl.update_speech(speech_len)
-                self.last_state_change_offset = self._offset + rel_end_frame
                 logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
             else:
                 self._status = 'END'
                 target_audio = source_audio[rel_start_frame:rel_end_frame]
                 logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
-
-                seg_len = (rel_end_frame - rel_start_frame) / self.sample_rate * 1000
-                self.adaptive_ctrl.update_speech(seg_len)
-                self.last_state_change_offset = self._offset + rel_end_frame
                 # logging.debug("❌ No valid speech segment detected, setting status to END")
         else:
             if self._status == 'START':
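The one-line fix above makes the end frame symmetric with the start frame: both absolute sample indices reported by the VAD iterator are shifted by self._offset so they index into the current audio chunk. A small self-contained sketch of that arithmetic, using invented numbers rather than anything from the repository:

# Illustrative offset arithmetic only; the values below are made up for the example.
offset = 48_000          # samples already consumed before this chunk
start_frame = 52_000     # absolute speech-start sample reported by the VAD
end_frame = 60_000       # absolute speech-end sample reported by the VAD

relative_start_frame = start_frame - offset   # 4_000: index into the current chunk
relative_end_frame = end_frame - offset       # 12_000: index into the current chunk

source_audio = list(range(32_000))            # stand-in for the current audio chunk
target_audio = source_audio[relative_start_frame:relative_end_frame]
assert len(target_audio) == 8_000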
transcribe/whisper_llm_serve.py CHANGED
@@ -14,12 +14,39 @@ from .utils import log_block, save_to_wave, TestDataWriter, filter_words
 from .translatepipes import TranslatePipes
 
 from transcribe.helpers.vadprocessor import VadProcessor
-# from transcribe.helpers.vad_dynamic import VadProcessor
-# from transcribe.helpers.vadprocessor import VadProcessor
 from transcribe.pipelines import MetaItem
+from dataclasses import dataclass, field
+
 
 logger = getLogger("TranscriptionService")
 
+@dataclass
+class FullSegment:
+    """A full sentence segment"""
+    audio_array: np.ndarray
+    created_time: float = field(default_factory=time.time)
+
+    @staticmethod
+    def merge(*audio_segments: list["FullSegment"]):
+        audio_segments_sorted = sorted([*audio_segments], key=lambda item: item.created_time)
+        return FullSegment(
+            created_time=audio_segments_sorted[0].created_time,
+            audio_array=np.concatenate([i.audio_array for i in audio_segments_sorted], axis=0)
+        )
+
+    @property
+    def time_duration(self) -> float:
+        return len(self.audio_array) / config.SAMPLE_RATE
+
+    @property
+    def start_timestamp(self):
+        return self.created_time
+
+    @property
+    def end_timestamp(self):
+        return self.created_time + self.time_duration
+
+
 
 class WhisperTranscriptionService:
     """
@@ -51,21 +78,15 @@ class WhisperTranscriptionService:
         self._frame_queue = queue.Queue()
         # audio queue buffer
         self.frames_np = np.array([], dtype=np.float32)
+        self.frames_np_start_timestamp = None
         # queue of complete audio segments
-        self.
-        self._temp_string = ""
-
-        self._transcrible_analysis = None
+        self.full_segments_queue = collections.deque()
         # start the processing threads
         self._translate_thread_stop = threading.Event()
         self._frame_processing_thread_stop = threading.Event()
 
         self.translate_thread = self._start_thread(self._transcription_processing_loop)
         self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
-        # if language == "zh":
-        #     self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
-        # else:
-        #     self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
         self.row_number = 0
         # for test
         self._transcrible_time_cost = 0.
@@ -107,38 +128,57 @@ class WhisperTranscriptionService:
         speech_status = processed_audio.speech_status
         return speech_audio, speech_status
 
+
+
     def _frame_processing_loop(self) -> None:
         """Fetch audio frames from the queue and merge them into the buffer"""
         while not self._frame_processing_thread_stop.is_set():
             try:
                 frame_np = self._frame_queue.get(timeout=0.1)
                 frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
-
+
+                if frame_np is None:
                     continue
+
                 with self.lock:
+                    if speech_status == "START" and self.frames_np_start_timestamp is None:
+                        self.frames_np_start_timestamp = time.time()
+                    # append the audio to the buffer
                     self.frames_np = np.append(self.frames_np, frame_np)
-                    if speech_status == "END" and len(self.frames_np) > 0:
-
-
+                    if speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
+                        time_diff = time.time() - self.frames_np_start_timestamp
+                        if time_diff >= config.DESIGN_TIME_THREHOLD:
+                            audio_array=self.frames_np.copy()
+                            self.full_segments_queue.appendleft(audio_array) # merge audio chunks only once the three-second length is reached
+                            self.frames_np_start_timestamp = None
+                            self.frames_np = np.array([], dtype=np.float32)
+                        else:
+                            logger.debug(f"🥳 Time since the previous sentence: {time_diff:.2f}s, keep growing the buffer")
+
             except queue.Empty:
                 pass
 
     def _transcription_processing_loop(self) -> None:
         """Main transcription processing loop"""
         frame_epoch = 1
+        loop_start_time = time.perf_counter()
+        # 1. if the incoming audio covers less than 3s, concatenate it with what follows
+        # 2. actively break the sentence once it exceeds 25s
+
         while not self._translate_thread_stop.is_set():
 
             if len(self.frames_np) ==0:
                 time.sleep(0.01)
                 continue
+
             with self.lock:
-                if len(self.
-                    audio_buffer = self.
+                if len(self.full_segments_queue) > 0:
+                    audio_buffer = self.full_segments_queue.pop()
                     partial = False
                 else:
                     audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)].copy()# take 1.5s * epoch worth of audio
                     partial = True
-
             if len(audio_buffer) < int(self.sample_rate):
                 silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
                 silence_audio[-len(audio_buffer):] = audio_buffer
@@ -149,37 +189,30 @@ class WhisperTranscriptionService:
             segments = meta_item.segments
             logger.debug(f"Segments: {segments}")
            segments = filter_words(segments)
+
             if len(segments):
                 seg_text = self.text_separator.join(seg.text for seg in segments)
-
-
-
-
-
-
-
-
-                else:
-                    self._temp_string = ""
-
-                result = TransResult(
-                    seg_id=self.row_number,
-                    context=seg_text,
-                    from_=self.source_language,
-                    to=self.target_language,
-                    tran_content=self._translate_text_large(seg_text),
-                    partial=partial
-                )
+                result = TransResult(
+                    seg_id=self.row_number,
+                    context=seg_text,
+                    from_=self.source_language,
+                    to=self.target_language,
+                    tran_content=self._translate_text_large(seg_text),
+                    partial=partial
+                )
                 if partial == False:
                     self.row_number += 1
-
-                self._send_result_to_client(result)
-
-                if partial == False:
                     frame_epoch = 1
                 else:
                     frame_epoch += 1
+                self._send_result_to_client(result)
+
+            # polling delay
+            loop_time_diff = 2 - (time.perf_counter() - loop_start_time)
+            if loop_time_diff >0:
+                time.sleep(loop_time_diff)
+            loop_start_time = time.perf_counter()
+
 
     def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
         """Transcribe the audio and return transcription segments"""
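The new FullSegment dataclass is defined in this commit but does not appear to be wired into the service yet (the loops above still pass raw numpy arrays through full_segments_queue). A hypothetical usage sketch of FullSegment.merge is shown below; it re-states the class standalone for illustration, with SAMPLE_RATE assumed to be 16000 in place of config.SAMPLE_RATE and a simplified merge signature.

import time
from dataclasses import dataclass, field

import numpy as np

SAMPLE_RATE = 16000  # stand-in for config.SAMPLE_RATE


@dataclass
class FullSegment:
    audio_array: np.ndarray
    created_time: float = field(default_factory=time.time)

    @staticmethod
    def merge(*audio_segments: "FullSegment") -> "FullSegment":
        # Order segments by creation time, then concatenate into one contiguous segment
        # that keeps the earliest creation timestamp.
        ordered = sorted(audio_segments, key=lambda item: item.created_time)
        return FullSegment(
            created_time=ordered[0].created_time,
            audio_array=np.concatenate([s.audio_array for s in ordered], axis=0),
        )

    @property
    def time_duration(self) -> float:
        return len(self.audio_array) / SAMPLE_RATE


a = FullSegment(np.zeros(SAMPLE_RATE, dtype=np.float32))       # 1.0 s of audio
b = FullSegment(np.zeros(SAMPLE_RATE // 2, dtype=np.float32))  # 0.5 s of audio
merged = FullSegment.merge(a, b)
print(merged.time_duration)  # 1.5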