daihui.zhang
commited on
Commit
·
d35ad3d
1
Parent(s):
96789f1
update code structures and some code
Browse files- config/settings.py +2 -2
- main.py +1 -2
- transcribe/pipelines/pipe_vad.py +1 -1
- transcribe/{whisper_llm_serve.py → serve.py} +22 -73
- transcribe/translatepipes.py +5 -2
- transcribe/utils.py +29 -2
config/settings.py
CHANGED
@@ -60,8 +60,8 @@ WHISPER_MODEL_EN = 'large-v3-turbo-q5_0'
|
|
60 |
WHISPER_MODEL_ZH = 'large-v3-turbo-q5_0'
|
61 |
# LLM
|
62 |
LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
63 |
-
LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
64 |
-
|
65 |
|
66 |
# VAD
|
67 |
VAD_MODEL_PATH = (MODEL_DIR / "silero-vad" / "silero_vad.onnx").as_posix()
|
|
|
60 |
WHISPER_MODEL_ZH = 'large-v3-turbo-q5_0'
|
61 |
# LLM
|
62 |
LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
63 |
+
# LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
64 |
+
LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-7b-instruct-q5_0-00001-of-00002.gguf").as_posix()
|
65 |
|
66 |
# VAD
|
67 |
VAD_MODEL_PATH = (MODEL_DIR / "silero-vad" / "silero_vad.onnx").as_posix()
|
main.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
2 |
-
from
|
3 |
-
from transcribe.whisper_llm_serve import WhisperTranscriptionService
|
4 |
from uuid import uuid1
|
5 |
from logging import getLogger
|
6 |
import numpy as np
|
|
|
1 |
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
2 |
+
from transcribe.serve import WhisperTranscriptionService
|
|
|
3 |
from uuid import uuid1
|
4 |
from logging import getLogger
|
5 |
import numpy as np
|
transcribe/pipelines/pipe_vad.py
CHANGED
@@ -72,7 +72,7 @@ class VadPipe(BasePipe):
|
|
72 |
logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
|
73 |
else:
|
74 |
self._status = 'END'
|
75 |
-
target_audio = source_audio[rel_start_frame:rel_end_frame]
|
76 |
logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
|
77 |
# logging.debug("❌ No valid speech segment detected, setting status to END")
|
78 |
else:
|
|
|
72 |
logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
|
73 |
else:
|
74 |
self._status = 'END'
|
75 |
+
target_audio = source_audio[max(rel_start_frame-100, 0):rel_end_frame]
|
76 |
logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
|
77 |
# logging.debug("❌ No valid speech segment detected, setting status to END")
|
78 |
else:
|
transcribe/{whisper_llm_serve.py → serve.py}
RENAMED
@@ -7,9 +7,9 @@ import asyncio
|
|
7 |
import numpy as np
|
8 |
import config
|
9 |
import collections
|
10 |
-
from api_model import TransResult, Message
|
11 |
|
12 |
-
from .utils import log_block,
|
13 |
from .translatepipes import TranslatePipes
|
14 |
|
15 |
from transcribe.pipelines import MetaItem
|
@@ -18,26 +18,12 @@ from transcribe.pipelines import MetaItem
|
|
18 |
logger = getLogger("TranscriptionService")
|
19 |
|
20 |
|
21 |
-
def _get_text_separator(language: str) -> str:
|
22 |
-
"""根据语言返回适当的文本分隔符"""
|
23 |
-
return "" if language == "zh" else " "
|
24 |
-
|
25 |
-
|
26 |
-
def _start_thread(target_function) -> threading.Thread:
|
27 |
-
"""启动守护线程执行指定函数"""
|
28 |
-
thread = threading.Thread(target=target_function)
|
29 |
-
thread.daemon = True
|
30 |
-
thread.start()
|
31 |
-
return thread
|
32 |
-
|
33 |
|
34 |
class WhisperTranscriptionService:
|
35 |
"""
|
36 |
Whisper语音转录服务类,处理音频流转录和翻译
|
37 |
"""
|
38 |
|
39 |
-
SERVER_READY = "SERVER_READY"
|
40 |
-
DISCONNECT = "DISCONNECT"
|
41 |
|
42 |
def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
|
43 |
print('>>>>>>>>>>>>>>>> init service >>>>>>>>>>>>>>>>>>>>>>')
|
@@ -47,52 +33,37 @@ class WhisperTranscriptionService:
|
|
47 |
self.client_uid = client_uid
|
48 |
# 转录结果稳定性管理
|
49 |
self.websocket = websocket
|
50 |
-
self.
|
51 |
-
|
52 |
# 音频处理相关
|
53 |
self.sample_rate = config.SAMPLE_RATE
|
54 |
|
55 |
self.lock = threading.Lock()
|
56 |
# 文本分隔符,根据语言设置
|
57 |
-
self.text_separator =
|
58 |
self.loop = asyncio.get_event_loop()
|
59 |
# 原始音频队列
|
60 |
-
self.
|
61 |
# 音频队列缓冲区
|
62 |
self.frames_np = np.array([], dtype=np.float32)
|
|
|
63 |
self.frames_np_start_timestamp = None
|
64 |
# 完整音频队列
|
65 |
self.full_segments_queue = collections.deque()
|
66 |
# 启动处理线程
|
67 |
-
self.
|
68 |
-
self.
|
69 |
-
|
70 |
-
|
71 |
-
self.frame_processing_thread = _start_thread(self._read_frame_processing_loop)
|
72 |
self.row_number = 0
|
73 |
-
# for test
|
74 |
-
self._transcribe_time_cost = 0.
|
75 |
-
self._translate_time_cost = 0.
|
76 |
-
|
77 |
-
if config.SAVE_DATA_SAVE:
|
78 |
-
self._save_task_stop = threading.Event()
|
79 |
-
self._save_queue = queue.Queue()
|
80 |
-
self._save_thread = _start_thread(self.save_data_loop)
|
81 |
-
|
82 |
|
83 |
-
def save_data_loop(self):
|
84 |
-
writer = TestDataWriter()
|
85 |
-
while not self._save_task_stop.is_set():
|
86 |
-
test_data = self._save_queue.get()
|
87 |
-
writer.write(test_data) # Save test_data to CSV
|
88 |
|
89 |
def add_frames(self, frame_np: np.ndarray) -> None:
|
90 |
"""添加音频帧到处理队列"""
|
91 |
-
self.
|
92 |
|
93 |
def _apply_voice_activity_detection(self, frame_np:np.array):
|
94 |
"""应用语音活动检测来优化音频缓冲区"""
|
95 |
-
processed_audio = self.
|
96 |
speech_audio = np.frombuffer(processed_audio.audio, dtype=np.float32)
|
97 |
speech_status = processed_audio.speech_status
|
98 |
return speech_audio, speech_status
|
@@ -100,9 +71,9 @@ class WhisperTranscriptionService:
|
|
100 |
|
101 |
def _read_frame_processing_loop(self) -> None:
|
102 |
"""从队列获取音频帧并合并到缓冲区"""
|
103 |
-
while not self.
|
104 |
try:
|
105 |
-
frame_np = self.
|
106 |
frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
|
107 |
|
108 |
if frame_np is None:
|
@@ -142,7 +113,7 @@ class WhisperTranscriptionService:
|
|
142 |
"""主转录处理循环"""
|
143 |
frame_epoch = 1
|
144 |
|
145 |
-
while not self.
|
146 |
|
147 |
if len(self.frames_np) ==0:
|
148 |
time.sleep(0.1)
|
@@ -187,54 +158,35 @@ class WhisperTranscriptionService:
|
|
187 |
partial=partial
|
188 |
)
|
189 |
self._send_result_to_client(result)
|
190 |
-
|
191 |
-
|
192 |
|
193 |
def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
|
194 |
"""转录音频并返回转录片段"""
|
195 |
log_block("Audio buffer length", f"{audio_buffer.shape[0]/self.sample_rate:.2f}", "s")
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
segments = result.segments
|
200 |
-
time_diff = (time.perf_counter() - start_time)
|
201 |
-
logger.debug(f"📝 transcribe Segments: {segments} ")
|
202 |
-
log_block("📝 transcribe output", f"{self.text_separator.join(seg.text for seg in segments)}", "")
|
203 |
-
log_block("📝 transcribe time", f"{time_diff:.3f}", "s")
|
204 |
-
self._transcribe_time_cost = round(time_diff, 3)
|
205 |
return result
|
206 |
|
207 |
def _translate_text(self, text: str) -> str:
|
208 |
"""将文本翻译为目标语言"""
|
209 |
if not text.strip():
|
210 |
return ""
|
211 |
-
|
212 |
log_block("🐧 Translation input ", f"{text}")
|
213 |
-
start_time = time.perf_counter()
|
214 |
|
215 |
-
result = self.
|
216 |
translated_text = result.translate_content
|
217 |
-
|
218 |
-
log_block("🐧 Translation time ", f"{time_diff:.3f}", "s")
|
219 |
log_block("🐧 Translation out ", f"{translated_text}")
|
220 |
-
self._translate_time_cost = round(time_diff, 3)
|
221 |
return translated_text
|
222 |
|
223 |
def _translate_text_large(self, text: str) -> str:
|
224 |
"""将文本翻译为目标语言"""
|
225 |
if not text.strip():
|
226 |
return ""
|
227 |
-
|
228 |
log_block("Translation input", f"{text}")
|
229 |
-
|
230 |
-
|
231 |
-
result = self._translate_pipe.translate_large(text, self.source_language, self.target_language)
|
232 |
translated_text = result.translate_content
|
233 |
-
|
234 |
-
time_diff = (time.perf_counter() - start_time)
|
235 |
-
log_block("Translation large model time ", f"{time_diff:.3f}", "s")
|
236 |
log_block("Translation large model output", f"{translated_text}")
|
237 |
-
self._translate_time_cost = round(time_diff, 3)
|
238 |
return translated_text
|
239 |
|
240 |
|
@@ -252,8 +204,5 @@ class WhisperTranscriptionService:
|
|
252 |
|
253 |
def stop(self) -> None:
|
254 |
"""停止所有处理线程并清理资源"""
|
255 |
-
self.
|
256 |
-
self._frame_processing_thread_stop.set()
|
257 |
-
if config.SAVE_DATA_SAVE:
|
258 |
-
self._save_task_stop.set()
|
259 |
logger.info(f"Stopping transcription service for client: {self.client_uid}")
|
|
|
7 |
import numpy as np
|
8 |
import config
|
9 |
import collections
|
10 |
+
from api_model import TransResult, Message
|
11 |
|
12 |
+
from .utils import log_block, start_thread, get_text_separator, filter_words
|
13 |
from .translatepipes import TranslatePipes
|
14 |
|
15 |
from transcribe.pipelines import MetaItem
|
|
|
18 |
logger = getLogger("TranscriptionService")
|
19 |
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
class WhisperTranscriptionService:
|
23 |
"""
|
24 |
Whisper语音转录服务类,处理音频流转录和翻译
|
25 |
"""
|
26 |
|
|
|
|
|
27 |
|
28 |
def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
|
29 |
print('>>>>>>>>>>>>>>>> init service >>>>>>>>>>>>>>>>>>>>>>')
|
|
|
33 |
self.client_uid = client_uid
|
34 |
# 转录结果稳定性管理
|
35 |
self.websocket = websocket
|
36 |
+
self.translate_pipe = pipe
|
|
|
37 |
# 音频处理相关
|
38 |
self.sample_rate = config.SAMPLE_RATE
|
39 |
|
40 |
self.lock = threading.Lock()
|
41 |
# 文本分隔符,根据语言设置
|
42 |
+
self.text_separator = get_text_separator(language)
|
43 |
self.loop = asyncio.get_event_loop()
|
44 |
# 原始音频队列
|
45 |
+
self.frame_queue = queue.Queue()
|
46 |
# 音频队列缓冲区
|
47 |
self.frames_np = np.array([], dtype=np.float32)
|
48 |
+
# 音频开始的时间点 用于约束最小断句时间
|
49 |
self.frames_np_start_timestamp = None
|
50 |
# 完整音频队列
|
51 |
self.full_segments_queue = collections.deque()
|
52 |
# 启动处理线程
|
53 |
+
self._stop = threading.Event()
|
54 |
+
self.translate_thread = start_thread(self._transcription_processing_loop)
|
55 |
+
self.frame_processing_thread = start_thread(self._read_frame_processing_loop)
|
56 |
+
# 行号
|
|
|
57 |
self.row_number = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
def add_frames(self, frame_np: np.ndarray) -> None:
|
61 |
"""添加音频帧到处理队列"""
|
62 |
+
self.frame_queue.put(frame_np)
|
63 |
|
64 |
def _apply_voice_activity_detection(self, frame_np:np.array):
|
65 |
"""应用语音活动检测来优化音频缓冲区"""
|
66 |
+
processed_audio = self.translate_pipe.voice_detect(frame_np.tobytes())
|
67 |
speech_audio = np.frombuffer(processed_audio.audio, dtype=np.float32)
|
68 |
speech_status = processed_audio.speech_status
|
69 |
return speech_audio, speech_status
|
|
|
71 |
|
72 |
def _read_frame_processing_loop(self) -> None:
|
73 |
"""从队列获取音频帧并合并到缓冲区"""
|
74 |
+
while not self._stop.is_set():
|
75 |
try:
|
76 |
+
frame_np = self.frame_queue.get(timeout=0.1)
|
77 |
frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
|
78 |
|
79 |
if frame_np is None:
|
|
|
113 |
"""主转录处理循环"""
|
114 |
frame_epoch = 1
|
115 |
|
116 |
+
while not self._stop.is_set():
|
117 |
|
118 |
if len(self.frames_np) ==0:
|
119 |
time.sleep(0.1)
|
|
|
158 |
partial=partial
|
159 |
)
|
160 |
self._send_result_to_client(result)
|
|
|
|
|
161 |
|
162 |
def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
|
163 |
"""转录音频并返回转录片段"""
|
164 |
log_block("Audio buffer length", f"{audio_buffer.shape[0]/self.sample_rate:.2f}", "s")
|
165 |
+
result = self.translate_pipe.transcribe(audio_buffer.tobytes(), self.source_language)
|
166 |
+
log_block("📝 transcribe output", f"{self.text_separator.join(seg.text for seg in result.segments)}", "")
|
167 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
return result
|
169 |
|
170 |
def _translate_text(self, text: str) -> str:
|
171 |
"""将文本翻译为目标语言"""
|
172 |
if not text.strip():
|
173 |
return ""
|
|
|
174 |
log_block("🐧 Translation input ", f"{text}")
|
|
|
175 |
|
176 |
+
result = self.translate_pipe.translate(text, self.source_language, self.target_language)
|
177 |
translated_text = result.translate_content
|
178 |
+
|
|
|
179 |
log_block("🐧 Translation out ", f"{translated_text}")
|
|
|
180 |
return translated_text
|
181 |
|
182 |
def _translate_text_large(self, text: str) -> str:
|
183 |
"""将文本翻译为目标语言"""
|
184 |
if not text.strip():
|
185 |
return ""
|
|
|
186 |
log_block("Translation input", f"{text}")
|
187 |
+
result = self.translate_pipe.translate_large(text, self.source_language, self.target_language)
|
|
|
|
|
188 |
translated_text = result.translate_content
|
|
|
|
|
|
|
189 |
log_block("Translation large model output", f"{translated_text}")
|
|
|
190 |
return translated_text
|
191 |
|
192 |
|
|
|
204 |
|
205 |
def stop(self) -> None:
|
206 |
"""停止所有处理线程并清理资源"""
|
207 |
+
self._stop.set()
|
|
|
|
|
|
|
208 |
logger.info(f"Stopping transcription service for client: {self.client_uid}")
|
transcribe/translatepipes.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
-
from
|
2 |
-
|
3 |
|
4 |
class TranslatePipes:
|
5 |
def __init__(self) -> None:
|
@@ -26,6 +26,7 @@ class TranslatePipes:
|
|
26 |
for p in self._process:
|
27 |
p.wait()
|
28 |
|
|
|
29 |
def translate(self, text, src_lang, dst_lang) -> MetaItem:
|
30 |
item = MetaItem(
|
31 |
transcribe_content=text,
|
@@ -34,6 +35,7 @@ class TranslatePipes:
|
|
34 |
self._translate_pipe.input_queue.put(item)
|
35 |
return self._translate_pipe.output_queue.get()
|
36 |
|
|
|
37 |
def translate_large(self, text, src_lang, dst_lang) -> MetaItem:
|
38 |
item = MetaItem(
|
39 |
transcribe_content=text,
|
@@ -47,6 +49,7 @@ class TranslatePipes:
|
|
47 |
return self._funasr_pipe
|
48 |
return self._whisper_pipe_en
|
49 |
|
|
|
50 |
def transcribe(self, audio_buffer: bytes, src_lang: str) -> MetaItem:
|
51 |
transcription_model = self.get_transcription_model(src_lang)
|
52 |
item = MetaItem(audio=audio_buffer, source_language=src_lang)
|
|
|
1 |
+
from .pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe, VadPipe, TranslatePipe
|
2 |
+
from .utils import timer
|
3 |
|
4 |
class TranslatePipes:
|
5 |
def __init__(self) -> None:
|
|
|
26 |
for p in self._process:
|
27 |
p.wait()
|
28 |
|
29 |
+
@timer(name="🐧 Translate")
|
30 |
def translate(self, text, src_lang, dst_lang) -> MetaItem:
|
31 |
item = MetaItem(
|
32 |
transcribe_content=text,
|
|
|
35 |
self._translate_pipe.input_queue.put(item)
|
36 |
return self._translate_pipe.output_queue.get()
|
37 |
|
38 |
+
@timer(name="🐧 Translate-large")
|
39 |
def translate_large(self, text, src_lang, dst_lang) -> MetaItem:
|
40 |
item = MetaItem(
|
41 |
transcribe_content=text,
|
|
|
49 |
return self._funasr_pipe
|
50 |
return self._whisper_pipe_en
|
51 |
|
52 |
+
@timer(name="📝 transcribe")
|
53 |
def transcribe(self, audio_buffer: bytes, src_lang: str) -> MetaItem:
|
54 |
transcription_model = self.get_transcription_model(src_lang)
|
55 |
item = MetaItem(audio=audio_buffer, source_language=src_lang)
|
transcribe/utils.py
CHANGED
@@ -8,8 +8,9 @@ import config
|
|
8 |
import csv
|
9 |
import av
|
10 |
import re
|
11 |
-
import
|
12 |
-
|
|
|
13 |
# Compile regex patterns once outside the loop for better performance
|
14 |
p_pattern = re.compile(r"(\s*\[.*?\])")
|
15 |
p_start_pattern = re.compile(r"(\s*\[.*)")
|
@@ -178,6 +179,32 @@ def pcm_bytes_to_np_array(pcm_bytes: bytes, dtype=np.float32, channels=1):
|
|
178 |
audio_np = audio_np.reshape(-1, channels)
|
179 |
return audio_np
|
180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
class TestDataWriter:
|
182 |
def __init__(self, file_path='test_data.csv'):
|
183 |
self.file_path = file_path
|
|
|
8 |
import csv
|
9 |
import av
|
10 |
import re
|
11 |
+
from functools import wraps
|
12 |
+
import time
|
13 |
+
import threading
|
14 |
# Compile regex patterns once outside the loop for better performance
|
15 |
p_pattern = re.compile(r"(\s*\[.*?\])")
|
16 |
p_start_pattern = re.compile(r"(\s*\[.*)")
|
|
|
179 |
audio_np = audio_np.reshape(-1, channels)
|
180 |
return audio_np
|
181 |
|
182 |
+
def timer(name: str):
|
183 |
+
def decorator(func):
|
184 |
+
@wraps(func)
|
185 |
+
def wrapper(*args, **kwargs):
|
186 |
+
start_time = time.perf_counter()
|
187 |
+
result = func(*args, **kwargs)
|
188 |
+
end_time = time.perf_counter()
|
189 |
+
duration = end_time - start_time
|
190 |
+
log_block(f"{name} cost:", f"{duration:.2f} s")
|
191 |
+
return result
|
192 |
+
return wrapper
|
193 |
+
return decorator
|
194 |
+
|
195 |
+
def get_text_separator(language: str) -> str:
|
196 |
+
"""根据语言返回适当的文本分隔符"""
|
197 |
+
return "" if language == "zh" else " "
|
198 |
+
|
199 |
+
|
200 |
+
def start_thread(target_function) -> threading.Thread:
|
201 |
+
"""启动守护线程执行指定函数"""
|
202 |
+
thread = threading.Thread(target=target_function)
|
203 |
+
thread.daemon = True
|
204 |
+
thread.start()
|
205 |
+
return thread
|
206 |
+
|
207 |
+
|
208 |
class TestDataWriter:
|
209 |
def __init__(self, file_path='test_data.csv'):
|
210 |
self.file_path = file_path
|