daihui.zhang committed on
Commit ac3675c · 1 Parent(s): 5029a39

fix max speech duration bug

config.py CHANGED
@@ -25,6 +25,8 @@ logging.getLogger().addHandler(console_handler)
 TEXT_THREHOLD = 6
 # Decision time for an audio segment
 DESIGN_TIME_THREHOLD = 3
+# Maximum speech duration
+MAX_SPEECH_DURATION_S = 15
 
 BASE_DIR = pathlib.Path(__file__).parent
 MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
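At the 16 kHz sampling rate assumed elsewhere in this commit, the new constant translates into a fixed sample budget. A quick sketch of the arithmetic (SAMPLE_RATE is an assumption here, taken from the sampling_rate=16000 used in the test notebook below):

SAMPLE_RATE = 16000            # assumed; matches the VAD settings in this commit
MAX_SPEECH_DURATION_S = 15     # the new constant from config.py

# whisper_llm_serve.py (below) flushes its buffer once it holds this many samples:
print(SAMPLE_RATE * MAX_SPEECH_DURATION_S)   # 240000 samples == 15 s of audio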
tests/audio_utils.py ADDED
@@ -0,0 +1,54 @@
+import numpy as np
+import soundfile as sf
+import time
+
+def audio_stream_generator(audio_file_path, chunk_size=4096, simulate_realtime=True):
+    """
+    Audio stream generator: reads data from an audio file and yields it as a stream.
+
+    Args:
+        audio_file_path: path to the audio file
+        chunk_size: size of each chunk (in samples)
+        simulate_realtime: whether to simulate real-time streaming speed
+
+    Yields:
+        numpy.ndarray: one np.float32 block of chunk_size samples at a time
+    """
+    # Load the audio file
+    audio_data, sample_rate = sf.read(audio_file_path)
+
+    # Make sure the audio data is float32
+    if audio_data.dtype != np.float32:
+        audio_data = audio_data.astype(np.float32)
+
+    # Down-mix stereo to mono
+    if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
+        audio_data = audio_data.mean(axis=1)
+
+    print(f"Loaded audio file: {audio_file_path}")
+    print(f"Sample rate: {sample_rate} Hz")
+    print(f"Audio length: {len(audio_data)/sample_rate:.2f} seconds")
+
+    # Duration of each chunk in seconds
+    chunk_duration = chunk_size / sample_rate if simulate_realtime else 0
+
+    # Yield the data chunk by chunk
+    audio_len = len(audio_data)
+    for pos in range(0, audio_len, chunk_size):
+        # Slice out the current chunk
+        end_pos = min(pos + chunk_size, audio_len)
+        chunk = audio_data[pos:end_pos]
+
+        # Zero-pad the final chunk if it is short
+        if len(chunk) < chunk_size:
+            padded_chunk = np.zeros(chunk_size, dtype=np.float32)
+            padded_chunk[:len(chunk)] = chunk
+            chunk = padded_chunk
+
+        # Simulate real-time processing latency
+        if simulate_realtime:
+            time.sleep(chunk_duration)
+
+        yield chunk
+
+    print("Audio stream finished")
tests/test_vad.ipynb ADDED
@@ -0,0 +1,129 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from audio_utils import audio_stream_generator\n",
+    "import IPython.display as ipd\n",
+    "import sys\n",
+    "sys.path.append(\"..\")\n",
+    "from transcribe.helpers.vadprocessor import FixedVADIterator\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vac = FixedVADIterator(\n",
+    "    threshold=0.5,\n",
+    "    sampling_rate=16000,\n",
+    "    # speech_pad_ms=10\n",
+    "    min_silence_duration_ms = 100,\n",
+    "    # speech_pad_ms = 30,\n",
+    "    max_speech_duration_s=5.0,\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SAMPLE_FILE_PATH = \"/Users/david/Samples/Audio/zh/liyongle.wav\"\n",
+    "SAMPLING_RATE = 16000\n",
+    "\n",
+    "chunks_generator = audio_stream_generator(SAMPLE_FILE_PATH, chunk_size=4096)\n",
+    "vac.reset_states()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded audio file: /Users/david/Samples/Audio/zh/liyongle.wav\n",
+      "Sample rate: 16000 Hz\n",
+      "Audio length: 64.00 seconds\n",
+      "{'start': 3616}\n",
+      "{'end': 83968}\n",
+      "{'end': 164352}\n",
+      "{'end': 244736}\n",
+      "{'end': 325120}\n",
+      "{'end': 405504}\n",
+      "{'end': 485888}\n",
+      "{'end': 566272}\n",
+      "{'end': 624608}\n",
+      "{'start': 631328}\n",
+      "{'end': 691168}\n",
+      "{'start': 698912}\n",
+      "{'end': 779264}\n",
+      "{'end': 800736}\n",
+      "{'start': 805920}\n",
+      "{'end': 846816}\n",
+      "{'start': 855072}\n",
+      "{'end': 862176}\n",
+      "{'start': 864288}\n",
+      "{'end': 890336}\n",
+      "{'start': 893984}\n",
+      "{'end': 912352}\n",
+      "{'start': 917536}\n",
+      "{'end': 932320}\n",
+      "{'start': 939040}\n",
+      "{'end': 966112}\n",
+      "{'start': 970784}\n",
+      "{'end': 1015264}\n",
+      "{'start': 1019424}\n",
+      "Audio stream finished\n"
+     ]
+    }
+   ],
+   "source": [
+    "for chunk in chunks_generator:\n",
+    "    # vad_iterator.reset_states()\n",
+    "    # audio_buffer = np.append(audio_buffer, chunk)\n",
+    "    \n",
+    "    speech_dict = vac(chunk, return_seconds=False)\n",
+    "    if speech_dict:\n",
+    "        print(speech_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
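The trace above reproduces the bug this commit fixes: with max_speech_duration_s=5.0, the iterator keeps emitting 'end' events 80384 samples apart (about 5.02 s at 16 kHz) with no intervening 'start', so the events no longer pair up. A small check over the first few events copied verbatim from the output above makes the mismatch explicit:

# Events copied from the notebook output above.
events = [
    {"start": 3616}, {"end": 83968}, {"end": 164352}, {"end": 244736},
    {"end": 325120}, {"end": 405504}, {"end": 485888}, {"end": 566272},
]

prev_kind = None
for event in events:
    kind, sample = next(iter(event.items()))
    if kind == prev_kind:
        # Fires for every repeated 'end': an end with no matching start.
        print(f"unpaired {kind!r} at sample {sample}")
    prev_kind = kind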
transcribe/helpers/vadprocessor.py CHANGED
@@ -155,7 +155,7 @@ class VADIteratorOnnx:
             raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
 
         self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-        self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
+        # self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
         self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
         self.reset_states()
 
@@ -184,7 +184,7 @@ class VADIteratorOnnx:
         self.current_sample += window_size_samples
 
         speech_prob = self.model(x, self.sampling_rate)[0,0]
-        # print(f"{self.current_sample/self.sampling_rate:.2f}: {speech_prob}")
+
 
         if (speech_prob >= self.threshold) and self.temp_end:
             self.temp_end = 0
 
@@ -196,11 +196,11 @@ class VADIteratorOnnx:
             self.start = speech_start
             return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
 
-        if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
-            if self.temp_end:
-                self.temp_end = 0
-            self.start = self.current_sample
-            return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
+        # if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
+        #     if self.temp_end:
+        #         self.temp_end = 0
+        #     self.start = self.current_sample
+        #     return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
 
         if (speech_prob < self.threshold - 0.15) and self.triggered:
             if not self.temp_end:
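The commented-out branch is the source of the unpaired events seen in the notebook: once speech ran past max_speech_samples it returned an 'end' and reset self.start to the current sample, but never emitted a matching 'start'. A minimal simulation of that branch (assuming continuously voiced audio and the 512-sample windows the trace spacing implies) shows why an 'end' fires every 80384 samples:

SR = 16000
MAX_SPEECH_SAMPLES = int(SR * 5.0)  # the notebook's max_speech_duration_s=5.0
WINDOW = 512                        # assumed window size, inferred from the trace

start, current, events = 0, 0, []
for _ in range(500):                # 500 windows of uninterrupted speech
    current += WINDOW
    if current - start >= MAX_SPEECH_SAMPLES:
        start = current             # the old branch reset `start` but emitted no 'start'
        events.append({"end": current})

print(events[:3])  # ends 80384 samples apart, matching the notebook trace

Rather than patching the branch, this commit removes the cap from the iterator entirely and enforces it on the buffered audio in WhisperTranscriptionService (last file below).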
transcribe/pipelines/pipe_vad.py CHANGED
@@ -33,7 +33,6 @@ class VadPipe(BasePipe):
             # speech_pad_ms=10
             min_silence_duration_ms = 100,
             # speech_pad_ms = 30,
-            max_speech_duration_s=20.0,
         )
         cls.vac.reset_states()
transcribe/whisper_llm_serve.py CHANGED
@@ -145,7 +145,13 @@ class WhisperTranscriptionService:
             self.frames_np_start_timestamp = time.time()
         # Append the audio to the audio buffer
         self.frames_np = np.append(self.frames_np, frame_np)
-        if speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
+        if len(self.frames_np) >= self.sample_rate * config.MAX_SPEECH_DURATION_S:
+            audio_array = self.frames_np.copy()
+            self.full_segments_queue.appendleft(audio_array)  # Assemble audio chunks based on whether the three-second duration is met
+            self.frames_np_start_timestamp = time.time()
+            self.frames_np = np.array([], dtype=np.float32)
+
+        elif speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
             time_diff = time.time() - self.frames_np_start_timestamp
             if time_diff >= config.DESIGN_TIME_THREHOLD:
                 audio_array = self.frames_np.copy()
 
@@ -160,10 +166,7 @@ class WhisperTranscriptionService:
 
     def _transcription_processing_loop(self) -> None:
         """Main transcription processing loop."""
-
-        # loop_start_time = time.perf_counter()
-        # 1. Splice audio spans shorter than 3 s onto the audio that follows
-        # 2. Force a segment break once a span exceeds 25 s
+        frame_epoch = 1
 
        while not self._translate_thread_stop.is_set():
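Condensed, the new buffering behavior can be sketched as follows. This is an illustrative reduction of the diff above, not the class itself; the deque type for full_segments_queue is an assumption suggested by the appendleft call, and the constants stand in for config.py and the service's sample_rate:

import time
from collections import deque

import numpy as np

SAMPLE_RATE = 16000          # assumed, as elsewhere in this commit
MAX_SPEECH_DURATION_S = 15   # config.MAX_SPEECH_DURATION_S
DESIGN_TIME_THREHOLD = 3     # config.DESIGN_TIME_THREHOLD (identifier kept as-is)

full_segments_queue: deque = deque()
frames_np = np.array([], dtype=np.float32)
frames_np_start_timestamp = time.time()

def buffer_frames(frame_np: np.ndarray, speech_status: str) -> None:
    """Accumulate audio; flush on the hard duration cap or on a VAD end."""
    global frames_np, frames_np_start_timestamp
    frames_np = np.append(frames_np, frame_np)

    if len(frames_np) >= SAMPLE_RATE * MAX_SPEECH_DURATION_S:
        # Hard cap: flush after 15 s even if the VAD never reports an end.
        full_segments_queue.appendleft(frames_np.copy())
        frames_np = np.array([], dtype=np.float32)
        frames_np_start_timestamp = time.time()
    elif speech_status == "END" and len(frames_np) > 0:
        # VAD end: flush only segments at least DESIGN_TIME_THREHOLD seconds old.
        if time.time() - frames_np_start_timestamp >= DESIGN_TIME_THREHOLD:
            full_segments_queue.appendleft(frames_np.copy())
            frames_np = np.array([], dtype=np.float32)
            frames_np_start_timestamp = time.time()

With the per-utterance cap handled at this layer, the VAD iterator goes back to reporting only genuine speech boundaries, which keeps its start and end events paired.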