daihui.zhang committed
Commit f565d29 · 1 parent: fcb2660

add DESIGN_TIME_THREHOLD
config.py CHANGED
@@ -2,7 +2,7 @@ import pathlib
 import re
 import logging
 
-DEBUG = True
+DEBUG = False
 LOG_LEVEL = logging.DEBUG if DEBUG else logging.INFO
 
 logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
@@ -23,6 +23,8 @@ logging.getLogger().addHandler(console_handler)
 
 # text output length threshold
 TEXT_THREHOLD = 6
+# decision time for an audio segment (seconds)
+DESIGN_TIME_THREHOLD = 3
 
 BASE_DIR = pathlib.Path(__file__).parent
 MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
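
The new constant is consumed in transcribe/whisper_llm_serve.py below: a VAD "END" event flushes the accumulated audio into the full-segment queue only if at least DESIGN_TIME_THREHOLD seconds of wall-clock time have passed since speech started. A minimal standalone sketch of that gating rule (the SegmentGate class and its method names are illustrative, not part of this repository):

import time
import numpy as np

DESIGN_TIME_THREHOLD = 3  # seconds, mirroring config.py

class SegmentGate:
    """Illustrative: buffer frames, flush only after the decision time elapses."""
    def __init__(self):
        self.buffer = np.array([], dtype=np.float32)
        self.started_at = None

    def push(self, frame: np.ndarray, status: str):
        if status == "START" and self.started_at is None:
            self.started_at = time.time()
        self.buffer = np.append(self.buffer, frame)
        if status == "END" and self.started_at is not None:
            if time.time() - self.started_at >= DESIGN_TIME_THREHOLD:
                segment = self.buffer
                self.buffer = np.array([], dtype=np.float32)
                self.started_at = None
                return segment  # a complete segment spanning >= 3 s
        return None  # too short to cut yet, keep buffering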
transcribe/pipelines/pipe_vad.py CHANGED
@@ -1,6 +1,6 @@
 
 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import FixedVADIterator, AdaptiveSilenceController
+from ..helpers.vadprocessor import FixedVADIterator
 
 import numpy as np
 import logging
@@ -16,15 +16,12 @@ class VadPipe(BasePipe):
         super().__init__(in_queue, out_queue)
         self._offset = 0  # frame-size offset already processed
         self._status = 'END'
-        self.last_state_change_offset = 0
-        self.adaptive_ctrl = AdaptiveSilenceController()
 
 
     def reset(self):
         self._offset = 0
         self._status = 'END'
-        self.last_state_change_offset = 0
-        self.adaptive_ctrl = AdaptiveSilenceController()
+
        self.vac.reset_states()
 
     @classmethod
@@ -53,16 +50,9 @@ class VadPipe(BasePipe):
         if start_frame:
             relative_start_frame = start_frame - self._offset
         if end_frame:
-            relative_end_frame = max(0, end_frame - self._offset)
+            relative_end_frame = end_frame - self._offset
         return relative_start_frame, relative_end_frame
 
-    def update_silence_ms(self):
-        min_silence = self.adaptive_ctrl.get_adaptive_silence_ms()
-        min_silence_samples = self.sample_rate * min_silence / 1000
-        old_silence_samples = self.vac.min_silence_samples
-        logging.warning(f"🫠 update_silence_ms: {old_silence_samples * 1000 / self.sample_rate:.2f}ms => current: {min_silence}ms")
-        # self.vac.min_silence_samples = min_silence_samples
-
     def process(self, in_data: MetaItem) -> MetaItem:
         if self._offset == 0:
             self.vac.reset_states()
@@ -77,29 +67,15 @@
             if rel_start_frame is not None and rel_end_frame is None:
                 self._status = "START"  # speech starts
                 target_audio = source_audio[rel_start_frame:]
-
-                # measure the length of the silence that preceded this speech
-                silence_len = (self._offset + rel_start_frame - self.last_state_change_offset) / self.sample_rate * 1000
-                self.adaptive_ctrl.update_silence(silence_len)
-                self.last_state_change_offset = self._offset + rel_start_frame
-
                 logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
             elif rel_start_frame is None and rel_end_frame is not None:
                 self._status = "END"  # speech ends
                 target_audio = source_audio[:rel_end_frame]
-
-                speech_len = rel_end_frame / self.sample_rate * 1000
-                self.adaptive_ctrl.update_speech(speech_len)
-                self.last_state_change_offset = self._offset + rel_end_frame
                 logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
             else:
                 self._status = 'END'
                 target_audio = source_audio[rel_start_frame:rel_end_frame]
                 logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
-
-                seg_len = (rel_end_frame - rel_start_frame) / self.sample_rate * 1000
-                self.adaptive_ctrl.update_speech(seg_len)
-                self.last_state_change_offset = self._offset + rel_end_frame
                 # logging.debug("❌ No valid speech segment detected, setting status to END")
         else:
             if self._status == 'START':
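
Besides dropping AdaptiveSilenceController and its bookkeeping, the one behavioral change in this file is the end-frame computation: the max(0, ...) clamp is removed, so an end frame that falls before the current chunk's offset now yields a negative index rather than being silently clamped to 0. A small standalone check of the offset arithmetic (the free function name is illustrative, not part of the repository):

def relative_frames(start_frame, end_frame, offset):
    """Map absolute VAD frame indices onto indices within the current chunk."""
    relative_start_frame = relative_end_frame = None
    if start_frame:
        relative_start_frame = start_frame - offset
    if end_frame:
        relative_end_frame = end_frame - offset  # no max(0, ...) clamp any more
    return relative_start_frame, relative_end_frame

# For a chunk starting at absolute frame 16000 (1 s of audio at 16 kHz):
assert relative_frames(16512, 18048, 16000) == (512, 2048)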
transcribe/whisper_llm_serve.py CHANGED
@@ -14,12 +14,39 @@ from .utils import log_block, save_to_wave, TestDataWriter, filter_words
 from .translatepipes import TranslatePipes
 
 from transcribe.helpers.vadprocessor import VadProcessor
-# from transcribe.helpers.vad_dynamic import VadProcessor
-# from transcribe.helpers.vadprocessor import VadProcessor
 from transcribe.pipelines import MetaItem
+from dataclasses import dataclass, field
+
 
 logger = getLogger("TranscriptionService")
 
+@dataclass
+class FullSegment:
+    """A complete sentence of audio."""
+    audio_array: np.ndarray
+    created_time: float = field(default_factory=time.time)
+
+    @staticmethod
+    def merge(*audio_segments: "FullSegment"):
+        audio_segments_sorted = sorted(audio_segments, key=lambda item: item.created_time)
+        return FullSegment(
+            created_time=audio_segments_sorted[0].created_time,
+            audio_array=np.concatenate([i.audio_array for i in audio_segments_sorted], axis=0)
+        )
+
+    @property
+    def time_duration(self) -> float:
+        return len(self.audio_array) / config.SAMPLE_RATE
+
+    @property
+    def start_timestamp(self):
+        return self.created_time
+
+    @property
+    def end_timestamp(self):
+        return self.created_time + self.time_duration
+
+
 
 class WhisperTranscriptionService:
     """
@@ -51,21 +78,15 @@ class WhisperTranscriptionService:
         self._frame_queue = queue.Queue()
         # audio frame buffer
         self.frames_np = np.array([], dtype=np.float32)
+        self.frames_np_start_timestamp = None
         # queue of complete audio segments
-        self.segments_queue = collections.deque()
-        self._temp_string = ""
-
-        self._transcrible_analysis = None
+        self.full_segments_queue = collections.deque()
         # start the processing threads
         self._translate_thread_stop = threading.Event()
         self._frame_processing_thread_stop = threading.Event()
 
         self.translate_thread = self._start_thread(self._transcription_processing_loop)
         self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
-        # if language == "zh":
-        #     self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
-        # else:
-        #     self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
         self.row_number = 0
         # for test
         self._transcrible_time_cost = 0.
@@ -107,38 +128,57 @@
         speech_status = processed_audio.speech_status
         return speech_audio, speech_status
 
+
+
     def _frame_processing_loop(self) -> None:
         """Fetch audio frames from the queue and merge them into the buffer."""
         while not self._frame_processing_thread_stop.is_set():
             try:
                 frame_np = self._frame_queue.get(timeout=0.1)
                 frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
-                if frame_np is None or len(frame_np) == 0:
+
+                if frame_np is None:
                     continue
+
                 with self.lock:
+                    if speech_status == "START" and self.frames_np_start_timestamp is None:
+                        self.frames_np_start_timestamp = time.time()
+                    # append the audio to the buffer
                     self.frames_np = np.append(self.frames_np, frame_np)
-                    if speech_status == "END" and len(self.frames_np) > 0:
-                        self.segments_queue.appendleft(self.frames_np.copy())
-                        self.frames_np = np.array([], dtype=np.float32)
+                    if speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
+                        time_diff = time.time() - self.frames_np_start_timestamp
+                        if time_diff >= config.DESIGN_TIME_THREHOLD:
+                            audio_array = self.frames_np.copy()
+                            self.full_segments_queue.appendleft(audio_array)  # cut a full segment only once it spans the 3-second threshold
+                            self.frames_np_start_timestamp = None
+                            self.frames_np = np.array([], dtype=np.float32)
+                        else:
+                            logger.debug(f"🥳 gap since the previous sentence: {time_diff:.2f}s, keep filling the buffer")
+
             except queue.Empty:
                 pass
 
     def _transcription_processing_loop(self) -> None:
         """Main transcription processing loop."""
         frame_epoch = 1
+        loop_start_time = time.perf_counter()
+        # 1. audio spans shorter than 3 s are concatenated with what follows
+        # 2. actively break the sentence once it exceeds 25 s
+
         while not self._translate_thread_stop.is_set():
 
             if len(self.frames_np) == 0:
                 time.sleep(0.01)
                 continue
+
             with self.lock:
-                if len(self.segments_queue) > 0:
-                    audio_buffer = self.segments_queue.pop()
+                if len(self.full_segments_queue) > 0:
+                    audio_buffer = self.full_segments_queue.pop()
                     partial = False
                 else:
                     audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)].copy()  # take 1.5 s * epoch worth of audio
                     partial = True
-
+
             if len(audio_buffer) < int(self.sample_rate):
                 silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
                 silence_audio[-len(audio_buffer):] = audio_buffer
@@ -149,37 +189,30 @@
             segments = meta_item.segments
             logger.debug(f"Segments: {segments}")
             segments = filter_words(segments)
+
             if len(segments):
                 seg_text = self.text_separator.join(seg.text for seg in segments)
-                if self._temp_string:
-                    seg_text = self._temp_string + seg_text
-
-                if partial == False:
-                    # segment_length = len(seg_text.split(self.text_separator)) if self.text_separator else len(seg_text)
-                    if len(seg_text) < config.TEXT_THREHOLD:
-                        partial = True
-                        self._temp_string = seg_text
-                    else:
-                        self._temp_string = ""
-
-                result = TransResult(
-                    seg_id=self.row_number,
-                    context=seg_text,
-                    from_=self.source_language,
-                    to=self.target_language,
-                    tran_content=self._translate_text_large(seg_text),
-                    partial=partial
-                )
+                result = TransResult(
+                    seg_id=self.row_number,
+                    context=seg_text,
+                    from_=self.source_language,
+                    to=self.target_language,
+                    tran_content=self._translate_text_large(seg_text),
+                    partial=partial
+                )
                 if partial == False:
                     self.row_number += 1
-
-                self._send_result_to_client(result)
-
-                if partial == False:
                     frame_epoch = 1
                 else:
                     frame_epoch += 1
-
+                self._send_result_to_client(result)
+
+            # polling delay
+            loop_time_diff = 2 - (time.perf_counter() - loop_start_time)
+            if loop_time_diff > 0:
+                time.sleep(loop_time_diff)
+            loop_start_time = time.perf_counter()
+
 
     def _transcribe_audio(self, audio_buffer: np.ndarray) -> MetaItem:
         """Transcribe the audio and return its transcription segments."""