daihui.zhang committed · Commit ac3675c · 1 Parent(s): 5029a39

fix max speech duration bug

Browse files:
- config.py +2 -0
- tests/audio_utils.py +54 -0
- tests/test_vad.ipynb +129 -0
- transcribe/helpers/vadprocessor.py +7 -7
- transcribe/pipelines/pipe_vad.py +0 -1
- transcribe/whisper_llm_serve.py +8 -5
config.py CHANGED
@@ -25,6 +25,8 @@ logging.getLogger().addHandler(console_handler)
 TEXT_THREHOLD = 6
 # Decision time for an audio segment
 DESIGN_TIME_THREHOLD = 3
+# Maximum speech duration
+MAX_SPEECH_DURATION_S = 15
 
 BASE_DIR = pathlib.Path(__file__).parent
 MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
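For reference, at the 16 kHz sampling rate the VAD and tests use, the new cap corresponds to a fixed sample budget. A one-line sketch of the conversion; the variable names here are illustrative, not from the codebase:

SAMPLE_RATE = 16000          # Hz, as used throughout this repo
MAX_SPEECH_DURATION_S = 15   # new constant from config.py

max_speech_samples = SAMPLE_RATE * MAX_SPEECH_DURATION_S
print(max_speech_samples)    # 240000 samples of float32 audio ≈ 15 s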
tests/audio_utils.py ADDED
@@ -0,0 +1,54 @@
+import numpy as np
+import soundfile as sf
+import time
+
+def audio_stream_generator(audio_file_path, chunk_size=4096, simulate_realtime=True):
+    """
+    Audio stream generator: reads data from an audio file and yields it as a stream.
+
+    Args:
+        audio_file_path: path to the audio file
+        chunk_size: size of each chunk (in samples)
+        simulate_realtime: whether to simulate real-time streaming speed
+
+    Yields:
+        numpy.ndarray: one np.float32 chunk of chunk_size samples at a time
+    """
+    # Load the audio file
+    audio_data, sample_rate = sf.read(audio_file_path)
+
+    # Make sure the audio data is float32
+    if audio_data.dtype != np.float32:
+        audio_data = audio_data.astype(np.float32)
+
+    # Convert stereo to mono
+    if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
+        audio_data = audio_data.mean(axis=1)
+
+    print(f"Loaded audio file: {audio_file_path}")
+    print(f"Sample rate: {sample_rate} Hz")
+    print(f"Audio length: {len(audio_data)/sample_rate:.2f} s")
+
+    # Duration of each chunk in seconds
+    chunk_duration = chunk_size / sample_rate if simulate_realtime else 0
+
+    # Yield the data chunk by chunk
+    audio_len = len(audio_data)
+    for pos in range(0, audio_len, chunk_size):
+        # Slice out the current chunk
+        end_pos = min(pos + chunk_size, audio_len)
+        chunk = audio_data[pos:end_pos]
+
+        # Zero-pad the final chunk if it is shorter than chunk_size
+        if len(chunk) < chunk_size:
+            padded_chunk = np.zeros(chunk_size, dtype=np.float32)
+            padded_chunk[:len(chunk)] = chunk
+            chunk = padded_chunk
+
+        # Simulate real-time processing latency
+        if simulate_realtime:
+            time.sleep(chunk_duration)
+
+        yield chunk
+
+    print("Audio stream finished")
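For the 64 s, 16 kHz file used in the notebook below, the generator yields 64 * 16000 / 4096 = 250 chunks, and the division happens to be exact, so no zero-padding is needed. A quick sketch of that arithmetic; it is illustrative, not part of the commit:

import math

SAMPLE_RATE = 16000
AUDIO_SECONDS = 64                            # length reported for the test file below
CHUNK_SIZE = 4096

total_samples = SAMPLE_RATE * AUDIO_SECONDS   # 1_024_000 samples
print(math.ceil(total_samples / CHUNK_SIZE))  # 250 chunks, last one exactly full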
tests/test_vad.ipynb ADDED
@@ -0,0 +1,129 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from audio_utils import audio_stream_generator\n",
+    "import IPython.display as ipd\n",
+    "import sys\n",
+    "sys.path.append(\"..\")\n",
+    "from transcribe.helpers.vadprocessor import FixedVADIterator\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vac = FixedVADIterator(\n",
+    "    threshold=0.5,\n",
+    "    sampling_rate=16000,\n",
+    "    # speech_pad_ms=10\n",
+    "    min_silence_duration_ms = 100,\n",
+    "    # speech_pad_ms = 30,\n",
+    "    max_speech_duration_s=5.0,\n",
+    "    )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SAMPLE_FILE_PATH = \"/Users/david/Samples/Audio/zh/liyongle.wav\"\n",
+    "SAMPLING_RATE = 16000\n",
+    "\n",
+    "chunks_generator = audio_stream_generator(SAMPLE_FILE_PATH, chunk_size=4096)\n",
+    "vac.reset_states()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded audio file: /Users/david/Samples/Audio/zh/liyongle.wav\n",
+      "Sample rate: 16000 Hz\n",
+      "Audio length: 64.00 s\n",
+      "{'start': 3616}\n",
+      "{'end': 83968}\n",
+      "{'end': 164352}\n",
+      "{'end': 244736}\n",
+      "{'end': 325120}\n",
+      "{'end': 405504}\n",
+      "{'end': 485888}\n",
+      "{'end': 566272}\n",
+      "{'end': 624608}\n",
+      "{'start': 631328}\n",
+      "{'end': 691168}\n",
+      "{'start': 698912}\n",
+      "{'end': 779264}\n",
+      "{'end': 800736}\n",
+      "{'start': 805920}\n",
+      "{'end': 846816}\n",
+      "{'start': 855072}\n",
+      "{'end': 862176}\n",
+      "{'start': 864288}\n",
+      "{'end': 890336}\n",
+      "{'start': 893984}\n",
+      "{'end': 912352}\n",
+      "{'start': 917536}\n",
+      "{'end': 932320}\n",
+      "{'start': 939040}\n",
+      "{'end': 966112}\n",
+      "{'start': 970784}\n",
+      "{'end': 1015264}\n",
+      "{'start': 1019424}\n",
+      "Audio stream finished\n"
+     ]
+    }
+   ],
+   "source": [
+    "for chunk in chunks_generator:\n",
+    "    # vad_iterator.reset_states()\n",
+    "    # audio_buffer = np.append(audio_buffer, chunk)\n",
+    "    \n",
+    "    speech_dict = vac(chunk, return_seconds=False)\n",
+    "    if speech_dict:\n",
+    "        print(speech_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
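The offsets printed above are sample indices at 16 kHz, so dividing by the sampling rate converts them to seconds; the long run of 'end' events near the top is spaced roughly 80384 samples (~5.02 s) apart, matching max_speech_duration_s=5.0. A small sketch of the conversion, using the numbers from the output above (illustrative, not part of the commit):

SAMPLING_RATE = 16000

print(3616 / SAMPLING_RATE)               # 0.226 -> speech starts at ~0.23 s
print((83968 - 3616) / SAMPLING_RATE)     # 5.022 -> first forced split after ~5 s
print((164352 - 83968) / SAMPLING_RATE)   # 5.024 -> and every ~5 s thereafter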
transcribe/helpers/vadprocessor.py CHANGED
@@ -155,7 +155,7 @@ class VADIteratorOnnx:
             raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
 
         self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-        self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
+        # self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
         self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
         self.reset_states()
@@ -184,7 +184,7 @@ class VADIteratorOnnx:
        self.current_sample += window_size_samples
 
        speech_prob = self.model(x, self.sampling_rate)[0,0]
-
+
 
        if (speech_prob >= self.threshold) and self.temp_end:
            self.temp_end = 0
@@ -196,11 +196,11 @@ class VADIteratorOnnx:
            self.start = speech_start
            return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
 
-       if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
-           if self.temp_end:
-               self.temp_end = 0
-           self.start = self.current_sample
-           return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
+       # if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
+       #     if self.temp_end:
+       #         self.temp_end = 0
+       #     self.start = self.current_sample
+       #     return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
 
        if (speech_prob < self.threshold - 0.15) and self.triggered:
            if not self.temp_end:
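With the in-iterator cap commented out, VADIteratorOnnx goes back to emitting alternating 'start'/'end' events. The notebook run above shows why the old cap was troublesome: it produced runs of 'end' events with no matching 'start', which breaks any consumer that pairs events into segments. A minimal sketch of such a consumer; the function name and pairing policy are illustrative, not from the codebase:

def collect_segments(events):
    """Pair {'start': s} / {'end': e} dicts into (s, e) segments.

    Assumes events alternate; an unpaired 'end', as the old max-speech
    cap could emit, is silently dropped here.
    """
    segments, start = [], None
    for ev in events:
        if 'start' in ev:
            start = ev['start']
        elif 'end' in ev and start is not None:
            segments.append((start, ev['end']))
            start = None
    return segments

print(collect_segments([{'start': 3616}, {'end': 83968}, {'end': 164352}]))
# [(3616, 83968)] -- the second 'end' has no matching 'start' and is lost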
transcribe/pipelines/pipe_vad.py CHANGED
@@ -33,7 +33,6 @@ class VadPipe(BasePipe):
             # speech_pad_ms=10
             min_silence_duration_ms = 100,
             # speech_pad_ms = 30,
-            max_speech_duration_s=20.0,
         )
         cls.vac.reset_states()
 
transcribe/whisper_llm_serve.py CHANGED
@@ -145,7 +145,13 @@ class WhisperTranscriptionService:
             self.frames_np_start_timestamp = time.time()
         # Append the incoming audio to the audio buffer
         self.frames_np = np.append(self.frames_np, frame_np)
-        if speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
+        if len(self.frames_np) >= self.sample_rate * config.MAX_SPEECH_DURATION_S:
+            audio_array=self.frames_np.copy()
+            self.full_segments_queue.appendleft(audio_array) # merge audio chunks based on whether the three-second duration is met
+            self.frames_np_start_timestamp = time.time()
+            self.frames_np = np.array([], dtype=np.float32)
+
+        elif speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
             time_diff = time.time() - self.frames_np_start_timestamp
             if time_diff >= config.DESIGN_TIME_THREHOLD:
                 audio_array=self.frames_np.copy()
@@ -160,10 +166,7 @@ class WhisperTranscriptionService:
 
     def _transcription_processing_loop(self) -> None:
         """Main transcription processing loop"""
-
-        # loop_start_time = time.perf_counter()
-        # 1. Splice audio shorter than 3 s together with the audio that follows
-        # 2. Actively break the sentence once it exceeds 25 s
+        frame_epoch = 1
 
         while not self._translate_thread_stop.is_set():
 
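The fix thus enforces the cap where the buffer actually lives: once the buffered audio reaches MAX_SPEECH_DURATION_S, the whole buffer is queued as a segment and cleared, independent of VAD events. A self-contained sketch of that rule under the same assumptions (16 kHz audio, 15 s cap); the function name is illustrative, not the repo's API:

import numpy as np

SAMPLE_RATE = 16000
MAX_SPEECH_DURATION_S = 15  # mirrors the new config constant

def flush_if_too_long(buffer, queue):
    """Move the buffer into the segment queue once it hits the duration cap."""
    if len(buffer) >= SAMPLE_RATE * MAX_SPEECH_DURATION_S:
        queue.append(buffer.copy())            # hand off a full segment
        return np.array([], dtype=np.float32)  # start a fresh buffer
    return buffer

segments = []
buf = np.zeros(SAMPLE_RATE * 16, dtype=np.float32)  # 16 s of buffered audio
buf = flush_if_too_long(buf, segments)
print(len(segments), len(buf))                      # 1 0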