daihui.zhang commited on
Commit
d35ad3d
·
1 Parent(s): 96789f1

update code structures and some code

Browse files
config/settings.py CHANGED
@@ -60,8 +60,8 @@ WHISPER_MODEL_EN = 'large-v3-turbo-q5_0'
60
  WHISPER_MODEL_ZH = 'large-v3-turbo-q5_0'
61
  # LLM
62
  LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
63
- LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
64
- # LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-7b-instruct-q5_0-00001-of-00002.gguf").as_posix()
65
 
66
  # VAD
67
  VAD_MODEL_PATH = (MODEL_DIR / "silero-vad" / "silero_vad.onnx").as_posix()
 
60
  WHISPER_MODEL_ZH = 'large-v3-turbo-q5_0'
61
  # LLM
62
  LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
63
+ # LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
64
+ LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-7b-instruct-q5_0-00001-of-00002.gguf").as_posix()
65
 
66
  # VAD
67
  VAD_MODEL_PATH = (MODEL_DIR / "silero-vad" / "silero_vad.onnx").as_posix()
main.py CHANGED
@@ -1,6 +1,5 @@
1
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
2
- from urllib.parse import urlparse, parse_qsl
3
- from transcribe.whisper_llm_serve import WhisperTranscriptionService
4
  from uuid import uuid1
5
  from logging import getLogger
6
  import numpy as np
 
1
  from fastapi import FastAPI, WebSocket, WebSocketDisconnect
2
+ from transcribe.serve import WhisperTranscriptionService
 
3
  from uuid import uuid1
4
  from logging import getLogger
5
  import numpy as np
transcribe/pipelines/pipe_vad.py CHANGED
@@ -72,7 +72,7 @@ class VadPipe(BasePipe):
72
  logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
73
  else:
74
  self._status = 'END'
75
- target_audio = source_audio[rel_start_frame:rel_end_frame]
76
  logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
77
  # logging.debug("❌ No valid speech segment detected, setting status to END")
78
  else:
 
72
  logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
73
  else:
74
  self._status = 'END'
75
+ target_audio = source_audio[max(rel_start_frame-100, 0):rel_end_frame]
76
  logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
77
  # logging.debug("❌ No valid speech segment detected, setting status to END")
78
  else:
transcribe/{whisper_llm_serve.py → serve.py} RENAMED
@@ -7,9 +7,9 @@ import asyncio
7
  import numpy as np
8
  import config
9
  import collections
10
- from api_model import TransResult, Message, DebugResult
11
 
12
- from .utils import log_block, save_to_wave, TestDataWriter, filter_words
13
  from .translatepipes import TranslatePipes
14
 
15
  from transcribe.pipelines import MetaItem
@@ -18,26 +18,12 @@ from transcribe.pipelines import MetaItem
18
  logger = getLogger("TranscriptionService")
19
 
20
 
21
- def _get_text_separator(language: str) -> str:
22
- """根据语言返回适当的文本分隔符"""
23
- return "" if language == "zh" else " "
24
-
25
-
26
- def _start_thread(target_function) -> threading.Thread:
27
- """启动守护线程执行指定函数"""
28
- thread = threading.Thread(target=target_function)
29
- thread.daemon = True
30
- thread.start()
31
- return thread
32
-
33
 
34
  class WhisperTranscriptionService:
35
  """
36
  Whisper语音转录服务类,处理音频流转录和翻译
37
  """
38
 
39
- SERVER_READY = "SERVER_READY"
40
- DISCONNECT = "DISCONNECT"
41
 
42
  def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
43
  print('>>>>>>>>>>>>>>>> init service >>>>>>>>>>>>>>>>>>>>>>')
@@ -47,52 +33,37 @@ class WhisperTranscriptionService:
47
  self.client_uid = client_uid
48
  # 转录结果稳定性管理
49
  self.websocket = websocket
50
- self._translate_pipe = pipe
51
-
52
  # 音频处理相关
53
  self.sample_rate = config.SAMPLE_RATE
54
 
55
  self.lock = threading.Lock()
56
  # 文本分隔符,根据语言设置
57
- self.text_separator = _get_text_separator(language)
58
  self.loop = asyncio.get_event_loop()
59
  # 原始音频队列
60
- self._frame_queue = queue.Queue()
61
  # 音频队列缓冲区
62
  self.frames_np = np.array([], dtype=np.float32)
 
63
  self.frames_np_start_timestamp = None
64
  # 完整音频队列
65
  self.full_segments_queue = collections.deque()
66
  # 启动处理线程
67
- self._translate_thread_stop = threading.Event()
68
- self._frame_processing_thread_stop = threading.Event()
69
-
70
- self.translate_thread = _start_thread(self._transcription_processing_loop)
71
- self.frame_processing_thread = _start_thread(self._read_frame_processing_loop)
72
  self.row_number = 0
73
- # for test
74
- self._transcribe_time_cost = 0.
75
- self._translate_time_cost = 0.
76
-
77
- if config.SAVE_DATA_SAVE:
78
- self._save_task_stop = threading.Event()
79
- self._save_queue = queue.Queue()
80
- self._save_thread = _start_thread(self.save_data_loop)
81
-
82
 
83
- def save_data_loop(self):
84
- writer = TestDataWriter()
85
- while not self._save_task_stop.is_set():
86
- test_data = self._save_queue.get()
87
- writer.write(test_data) # Save test_data to CSV
88
 
89
  def add_frames(self, frame_np: np.ndarray) -> None:
90
  """添加音频帧到处理队列"""
91
- self._frame_queue.put(frame_np)
92
 
93
  def _apply_voice_activity_detection(self, frame_np:np.array):
94
  """应用语音活动检测来优化音频缓冲区"""
95
- processed_audio = self._translate_pipe.voice_detect(frame_np.tobytes())
96
  speech_audio = np.frombuffer(processed_audio.audio, dtype=np.float32)
97
  speech_status = processed_audio.speech_status
98
  return speech_audio, speech_status
@@ -100,9 +71,9 @@ class WhisperTranscriptionService:
100
 
101
  def _read_frame_processing_loop(self) -> None:
102
  """从队列获取音频帧并合并到缓冲区"""
103
- while not self._frame_processing_thread_stop.is_set():
104
  try:
105
- frame_np = self._frame_queue.get(timeout=0.1)
106
  frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
107
 
108
  if frame_np is None:
@@ -142,7 +113,7 @@ class WhisperTranscriptionService:
142
  """主转录处理循环"""
143
  frame_epoch = 1
144
 
145
- while not self._translate_thread_stop.is_set():
146
 
147
  if len(self.frames_np) ==0:
148
  time.sleep(0.1)
@@ -187,54 +158,35 @@ class WhisperTranscriptionService:
187
  partial=partial
188
  )
189
  self._send_result_to_client(result)
190
-
191
-
192
 
193
  def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
194
  """转录音频并返回转录片段"""
195
  log_block("Audio buffer length", f"{audio_buffer.shape[0]/self.sample_rate:.2f}", "s")
196
- start_time = time.perf_counter()
197
-
198
- result = self._translate_pipe.transcribe(audio_buffer.tobytes(), self.source_language)
199
- segments = result.segments
200
- time_diff = (time.perf_counter() - start_time)
201
- logger.debug(f"📝 transcribe Segments: {segments} ")
202
- log_block("📝 transcribe output", f"{self.text_separator.join(seg.text for seg in segments)}", "")
203
- log_block("📝 transcribe time", f"{time_diff:.3f}", "s")
204
- self._transcribe_time_cost = round(time_diff, 3)
205
  return result
206
 
207
  def _translate_text(self, text: str) -> str:
208
  """将文本翻译为目标语言"""
209
  if not text.strip():
210
  return ""
211
-
212
  log_block("🐧 Translation input ", f"{text}")
213
- start_time = time.perf_counter()
214
 
215
- result = self._translate_pipe.translate(text, self.source_language, self.target_language)
216
  translated_text = result.translate_content
217
- time_diff = (time.perf_counter() - start_time)
218
- log_block("🐧 Translation time ", f"{time_diff:.3f}", "s")
219
  log_block("🐧 Translation out ", f"{translated_text}")
220
- self._translate_time_cost = round(time_diff, 3)
221
  return translated_text
222
 
223
  def _translate_text_large(self, text: str) -> str:
224
  """将文本翻译为目标语言"""
225
  if not text.strip():
226
  return ""
227
-
228
  log_block("Translation input", f"{text}")
229
- start_time = time.perf_counter()
230
-
231
- result = self._translate_pipe.translate_large(text, self.source_language, self.target_language)
232
  translated_text = result.translate_content
233
-
234
- time_diff = (time.perf_counter() - start_time)
235
- log_block("Translation large model time ", f"{time_diff:.3f}", "s")
236
  log_block("Translation large model output", f"{translated_text}")
237
- self._translate_time_cost = round(time_diff, 3)
238
  return translated_text
239
 
240
 
@@ -252,8 +204,5 @@ class WhisperTranscriptionService:
252
 
253
  def stop(self) -> None:
254
  """停止所有处理线程并清理资源"""
255
- self._translate_thread_stop.set()
256
- self._frame_processing_thread_stop.set()
257
- if config.SAVE_DATA_SAVE:
258
- self._save_task_stop.set()
259
  logger.info(f"Stopping transcription service for client: {self.client_uid}")
 
7
  import numpy as np
8
  import config
9
  import collections
10
+ from api_model import TransResult, Message
11
 
12
+ from .utils import log_block, start_thread, get_text_separator, filter_words
13
  from .translatepipes import TranslatePipes
14
 
15
  from transcribe.pipelines import MetaItem
 
18
  logger = getLogger("TranscriptionService")
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  class WhisperTranscriptionService:
23
  """
24
  Whisper语音转录服务类,处理音频流转录和翻译
25
  """
26
 
 
 
27
 
28
  def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
29
  print('>>>>>>>>>>>>>>>> init service >>>>>>>>>>>>>>>>>>>>>>')
 
33
  self.client_uid = client_uid
34
  # 转录结果稳定性管理
35
  self.websocket = websocket
36
+ self.translate_pipe = pipe
 
37
  # 音频处理相关
38
  self.sample_rate = config.SAMPLE_RATE
39
 
40
  self.lock = threading.Lock()
41
  # 文本分隔符,根据语言设置
42
+ self.text_separator = get_text_separator(language)
43
  self.loop = asyncio.get_event_loop()
44
  # 原始音频队列
45
+ self.frame_queue = queue.Queue()
46
  # 音频队列缓冲区
47
  self.frames_np = np.array([], dtype=np.float32)
48
+ # 音频开始的时间点 用于约束最小断句时间
49
  self.frames_np_start_timestamp = None
50
  # 完整音频队列
51
  self.full_segments_queue = collections.deque()
52
  # 启动处理线程
53
+ self._stop = threading.Event()
54
+ self.translate_thread = start_thread(self._transcription_processing_loop)
55
+ self.frame_processing_thread = start_thread(self._read_frame_processing_loop)
56
+ # 行号
 
57
  self.row_number = 0
 
 
 
 
 
 
 
 
 
58
 
 
 
 
 
 
59
 
60
  def add_frames(self, frame_np: np.ndarray) -> None:
61
  """添加音频帧到处理队列"""
62
+ self.frame_queue.put(frame_np)
63
 
64
  def _apply_voice_activity_detection(self, frame_np:np.array):
65
  """应用语音活动检测来优化音频缓冲区"""
66
+ processed_audio = self.translate_pipe.voice_detect(frame_np.tobytes())
67
  speech_audio = np.frombuffer(processed_audio.audio, dtype=np.float32)
68
  speech_status = processed_audio.speech_status
69
  return speech_audio, speech_status
 
71
 
72
  def _read_frame_processing_loop(self) -> None:
73
  """从队列获取音频帧并合并到缓冲区"""
74
+ while not self._stop.is_set():
75
  try:
76
+ frame_np = self.frame_queue.get(timeout=0.1)
77
  frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
78
 
79
  if frame_np is None:
 
113
  """主转录处理循环"""
114
  frame_epoch = 1
115
 
116
+ while not self._stop.is_set():
117
 
118
  if len(self.frames_np) ==0:
119
  time.sleep(0.1)
 
158
  partial=partial
159
  )
160
  self._send_result_to_client(result)
 
 
161
 
162
  def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
163
  """转录音频并返回转录片段"""
164
  log_block("Audio buffer length", f"{audio_buffer.shape[0]/self.sample_rate:.2f}", "s")
165
+ result = self.translate_pipe.transcribe(audio_buffer.tobytes(), self.source_language)
166
+ log_block("📝 transcribe output", f"{self.text_separator.join(seg.text for seg in result.segments)}", "")
167
+
 
 
 
 
 
 
168
  return result
169
 
170
  def _translate_text(self, text: str) -> str:
171
  """将文本翻译为目标语言"""
172
  if not text.strip():
173
  return ""
 
174
  log_block("🐧 Translation input ", f"{text}")
 
175
 
176
+ result = self.translate_pipe.translate(text, self.source_language, self.target_language)
177
  translated_text = result.translate_content
178
+
 
179
  log_block("🐧 Translation out ", f"{translated_text}")
 
180
  return translated_text
181
 
182
  def _translate_text_large(self, text: str) -> str:
183
  """将文本翻译为目标语言"""
184
  if not text.strip():
185
  return ""
 
186
  log_block("Translation input", f"{text}")
187
+ result = self.translate_pipe.translate_large(text, self.source_language, self.target_language)
 
 
188
  translated_text = result.translate_content
 
 
 
189
  log_block("Translation large model output", f"{translated_text}")
 
190
  return translated_text
191
 
192
 
 
204
 
205
  def stop(self) -> None:
206
  """停止所有处理线程并清理资源"""
207
+ self._stop.set()
 
 
 
208
  logger.info(f"Stopping transcription service for client: {self.client_uid}")
transcribe/translatepipes.py CHANGED
@@ -1,5 +1,5 @@
1
- from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe, VadPipe, TranslatePipe
2
-
3
 
4
  class TranslatePipes:
5
  def __init__(self) -> None:
@@ -26,6 +26,7 @@ class TranslatePipes:
26
  for p in self._process:
27
  p.wait()
28
 
 
29
  def translate(self, text, src_lang, dst_lang) -> MetaItem:
30
  item = MetaItem(
31
  transcribe_content=text,
@@ -34,6 +35,7 @@ class TranslatePipes:
34
  self._translate_pipe.input_queue.put(item)
35
  return self._translate_pipe.output_queue.get()
36
 
 
37
  def translate_large(self, text, src_lang, dst_lang) -> MetaItem:
38
  item = MetaItem(
39
  transcribe_content=text,
@@ -47,6 +49,7 @@ class TranslatePipes:
47
  return self._funasr_pipe
48
  return self._whisper_pipe_en
49
 
 
50
  def transcribe(self, audio_buffer: bytes, src_lang: str) -> MetaItem:
51
  transcription_model = self.get_transcription_model(src_lang)
52
  item = MetaItem(audio=audio_buffer, source_language=src_lang)
 
1
+ from .pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe, VadPipe, TranslatePipe
2
+ from .utils import timer
3
 
4
  class TranslatePipes:
5
  def __init__(self) -> None:
 
26
  for p in self._process:
27
  p.wait()
28
 
29
+ @timer(name="🐧 Translate")
30
  def translate(self, text, src_lang, dst_lang) -> MetaItem:
31
  item = MetaItem(
32
  transcribe_content=text,
 
35
  self._translate_pipe.input_queue.put(item)
36
  return self._translate_pipe.output_queue.get()
37
 
38
+ @timer(name="🐧 Translate-large")
39
  def translate_large(self, text, src_lang, dst_lang) -> MetaItem:
40
  item = MetaItem(
41
  transcribe_content=text,
 
49
  return self._funasr_pipe
50
  return self._whisper_pipe_en
51
 
52
+ @timer(name="📝 transcribe")
53
  def transcribe(self, audio_buffer: bytes, src_lang: str) -> MetaItem:
54
  transcription_model = self.get_transcription_model(src_lang)
55
  item = MetaItem(audio=audio_buffer, source_language=src_lang)
transcribe/utils.py CHANGED
@@ -8,8 +8,9 @@ import config
8
  import csv
9
  import av
10
  import re
11
- import json
12
-
 
13
  # Compile regex patterns once outside the loop for better performance
14
  p_pattern = re.compile(r"(\s*\[.*?\])")
15
  p_start_pattern = re.compile(r"(\s*\[.*)")
@@ -178,6 +179,32 @@ def pcm_bytes_to_np_array(pcm_bytes: bytes, dtype=np.float32, channels=1):
178
  audio_np = audio_np.reshape(-1, channels)
179
  return audio_np
180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  class TestDataWriter:
182
  def __init__(self, file_path='test_data.csv'):
183
  self.file_path = file_path
 
8
  import csv
9
  import av
10
  import re
11
+ from functools import wraps
12
+ import time
13
+ import threading
14
  # Compile regex patterns once outside the loop for better performance
15
  p_pattern = re.compile(r"(\s*\[.*?\])")
16
  p_start_pattern = re.compile(r"(\s*\[.*)")
 
179
  audio_np = audio_np.reshape(-1, channels)
180
  return audio_np
181
 
182
+ def timer(name: str):
183
+ def decorator(func):
184
+ @wraps(func)
185
+ def wrapper(*args, **kwargs):
186
+ start_time = time.perf_counter()
187
+ result = func(*args, **kwargs)
188
+ end_time = time.perf_counter()
189
+ duration = end_time - start_time
190
+ log_block(f"{name} cost:", f"{duration:.2f} s")
191
+ return result
192
+ return wrapper
193
+ return decorator
194
+
195
+ def get_text_separator(language: str) -> str:
196
+ """根据语言返回适当的文本分隔符"""
197
+ return "" if language == "zh" else " "
198
+
199
+
200
+ def start_thread(target_function) -> threading.Thread:
201
+ """启动守护线程执行指定函数"""
202
+ thread = threading.Thread(target=target_function)
203
+ thread.daemon = True
204
+ thread.start()
205
+ return thread
206
+
207
+
208
  class TestDataWriter:
209
  def __init__(self, file_path='test_data.csv'):
210
  self.file_path = file_path