Upload folder using huggingface_hub
Browse files- README.md +1 -0
- model_repo_sense_voice_small/encoder/1/.gitkeep +0 -0
- model_repo_sense_voice_small/encoder/1/model.onnx +3 -0
- model_repo_sense_voice_small/encoder/config.pbtxt +71 -0
- model_repo_sense_voice_small/feature_extractor/1/__pycache__/model.cpython-310.pyc +0 -0
- model_repo_sense_voice_small/feature_extractor/1/model.py +325 -0
- model_repo_sense_voice_small/feature_extractor/am.mvn +8 -0
- model_repo_sense_voice_small/feature_extractor/config.pbtxt +81 -0
- model_repo_sense_voice_small/feature_extractor/config.yaml +97 -0
- model_repo_sense_voice_small/scoring/1/__pycache__/model.cpython-310.pyc +0 -0
- model_repo_sense_voice_small/scoring/1/model.py +136 -0
- model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model +3 -0
- model_repo_sense_voice_small/scoring/config.pbtxt +59 -0
- model_repo_sense_voice_small/sensevoice/1/.gitkeep +0 -0
- model_repo_sense_voice_small/sensevoice/config.pbtxt +117 -0
- run.sh +3 -0
README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
See https://github.com/modelscope/FunASR/tree/main/runtime/triton_gpu
|
model_repo_sense_voice_small/encoder/1/.gitkeep
ADDED
File without changes
|
model_repo_sense_voice_small/encoder/1/model.onnx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:07fe6dd7a4765c64dd63e01d0bba340d3c1eefa3b591553060fb231e2d7cd874
|
3 |
+
size 937424191
|
model_repo_sense_voice_small/encoder/config.pbtxt
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
name: "encoder"
|
16 |
+
backend: "onnxruntime"
|
17 |
+
default_model_filename: "model.onnx"
|
18 |
+
|
19 |
+
max_batch_size: 16
|
20 |
+
|
21 |
+
input [
|
22 |
+
{
|
23 |
+
name: "speech"
|
24 |
+
data_type: TYPE_FP32
|
25 |
+
dims: [-1, 560]
|
26 |
+
},
|
27 |
+
{
|
28 |
+
name: "speech_lengths"
|
29 |
+
data_type: TYPE_INT32
|
30 |
+
dims: [1]
|
31 |
+
reshape: { shape: [ ] }
|
32 |
+
},
|
33 |
+
{
|
34 |
+
name: "language"
|
35 |
+
data_type: TYPE_INT32
|
36 |
+
dims: [1]
|
37 |
+
reshape: { shape: [ ] }
|
38 |
+
},
|
39 |
+
{
|
40 |
+
name: "textnorm"
|
41 |
+
data_type: TYPE_INT32
|
42 |
+
dims: [1]
|
43 |
+
reshape: { shape: [ ] }
|
44 |
+
}
|
45 |
+
]
|
46 |
+
|
47 |
+
output [
|
48 |
+
{
|
49 |
+
name: "ctc_logits"
|
50 |
+
data_type: TYPE_FP32
|
51 |
+
dims: [-1, 25055]
|
52 |
+
},
|
53 |
+
{
|
54 |
+
name: "encoder_out_lens"
|
55 |
+
data_type: TYPE_INT32
|
56 |
+
dims: [1]
|
57 |
+
reshape: { shape: [ ] }
|
58 |
+
}
|
59 |
+
]
|
60 |
+
|
61 |
+
dynamic_batching {
|
62 |
+
}
|
63 |
+
parameters { key: "cudnn_conv_algo_search" value: { string_value: "2" } }
|
64 |
+
|
65 |
+
instance_group [
|
66 |
+
{
|
67 |
+
count: 1
|
68 |
+
kind: KIND_GPU
|
69 |
+
}
|
70 |
+
]
|
71 |
+
|
model_repo_sense_voice_small/feature_extractor/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (9.94 kB). View file
|
|
model_repo_sense_voice_small/feature_extractor/1/model.py
ADDED
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
#
|
3 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
import math
|
17 |
+
import triton_python_backend_utils as pb_utils
|
18 |
+
from torch.utils.dlpack import to_dlpack
|
19 |
+
import torch
|
20 |
+
import numpy as np
|
21 |
+
import kaldifeat
|
22 |
+
import _kaldifeat
|
23 |
+
from typing import List
|
24 |
+
import json
|
25 |
+
import yaml
|
26 |
+
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
|
27 |
+
|
28 |
+
|
29 |
+
class LFR(torch.nn.Module):
    """Batched low-frame-rate (LFR) stacking of acoustic feature frames.

    Reference implementation:
    https://github.com/Mddct/devil-asr/blob/main/patch/lfr.py
    """

    def __init__(self, m: int = 7, n: int = 6) -> None:
        """
        Actually, this implements stacking frames and skipping frames.
        if m = 1 and n = 1, just return the origin features.
        if m = 1 and n > 1, it works like skipping.
        if m > 1 and n = 1, it works like stacking but only support right frames.
        if m > 1 and n > 1, it works like LFR.

        Args:
            m: number of consecutive frames concatenated into one output frame.
            n: hop, in input frames, between two consecutive output frames.
        """
        super().__init__()

        self.m = m
        self.n = n

        # Number of copies of the first frame placed in front of every
        # utterance.  `(m - 1) // 2` is already an integer, so no rounding
        # helper is needed.
        self.left_padding_nums = (self.m - 1) // 2

    def forward(
        self, input_tensor: torch.Tensor, input_lens: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Stack frames for a zero-padded batch of utterances.

        Args:
            input_tensor: features indexed as [B, T, D].
            input_lens: [B] number of valid frames of each utterance.

        Returns:
            A tuple ``(stacked, new_len)`` where ``stacked`` is
            [B, T_out, D * m] and ``new_len`` is the [B] count of valid
            stacked frames (floating dtype, because it is derived from a true
            division of ``input_lens``).  Rows of ``stacked`` at or beyond
            ``new_len`` are padding.
        """
        B, _, D = input_tensor.size()
        device = input_tensor.device

        # The length arithmetic is kept in floating point so `new_len` has the
        # same dtype as before.
        n_lfr = torch.ceil(input_lens / self.n)
        prepad_nums = input_lens + self.left_padding_nums

        # Frames available to the last window; if fewer than `m`, the last
        # valid frame is replicated to fill the gap.
        last_window = prepad_nums - self.n * (n_lfr - 1)
        right_padding_nums = torch.clamp(self.m - last_window, min=0)

        T_all = prepad_nums + right_padding_nums
        new_len = T_all // self.n
        T_all_max = int(T_all.max().item())

        # Map every position of the padded timeline back to a source frame of
        # the SAME utterance: positions before the signal read frame 0,
        # positions after it read that utterance's last valid frame.  Clamping
        # per utterance is what keeps a short utterance from picking up the
        # zero padding it carries when batched with a longer one.
        position = torch.arange(T_all_max, device=device, dtype=torch.long).unsqueeze(0)  # [1, T_all_max]
        last_valid = (
            (input_lens.to(device=device, dtype=torch.long) - 1).clamp(min=0).unsqueeze(1)
        )  # [B, 1]
        source = torch.minimum(
            (position - self.left_padding_nums).clamp(min=0), last_valid
        )  # [B, T_all_max]

        padded = torch.gather(input_tensor, 1, source.unsqueeze(2).expand(B, T_all_max, D))

        # [B, T_out, D, m] -> [B, T_out, m, D] so each output row is the
        # concatenation of m consecutive D-dimensional frames.
        stacked = padded.unfold(1, self.m, step=self.n).transpose(2, 3)

        return stacked.reshape(B, -1, D * self.m), new_len
|
95 |
+
|
96 |
+
|
97 |
+
class WavFrontend:
    """Conventional frontend structure for ASR.

    Holds the frontend settings, an ``LFR`` stacker built from
    ``lfr_m``/``lfr_n`` and, when ``cmvn_file`` is given, the shift/scale
    statistics parsed from it.
    """

    def __init__(
        self,
        cmvn_file: str = None,
        fs: int = 16000,
        window: str = "hamming",
        n_mels: int = 80,
        frame_length: int = 25,
        frame_shift: int = 10,
        filter_length_min: int = -1,
        filter_length_max: float = -1,
        lfr_m: int = 7,
        lfr_n: int = 6,
        dither: float = 1.0,
    ) -> None:
        """Store the settings and load the CMVN statistics if a file is given."""
        self.fs = fs
        self.window = window
        self.n_mels = n_mels
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.filter_length_min = filter_length_min
        self.filter_length_max = filter_length_max
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.lfr = LFR(lfr_m, lfr_n)
        self.cmvn_file = cmvn_file
        self.dither = dither

        # Always define the attribute so apply_cmvn_batch can test for it.
        self.cmvn = self.load_cmvn() if self.cmvn_file else None

    def apply_cmvn_batch(self, inputs: torch.Tensor) -> torch.Tensor:
        """
        Apply CMVN with mvn data

        Computes ``(inputs + shift) * scale`` with the statistics broadcast
        over all leading dimensions.

        Args:
            inputs: feature tensor whose last dimension is the feature
                dimension (the caller in this file passes [B, T, D]).

        Returns:
            The normalised tensor.  The statistics are float64, so the result
            follows torch type promotion.  ``inputs`` is returned unchanged
            when no CMVN statistics were loaded.
        """
        if getattr(self, "cmvn", None) is None:
            return inputs

        dim = inputs.shape[-1]
        stats = torch.from_numpy(np.ascontiguousarray(self.cmvn[:, :dim])).to(inputs.device)
        shift, scale = stats[0], stats[1]
        # Broadcasting replaces tiling the statistics once per frame.
        return (inputs + shift) * scale

    def load_cmvn(
        self,
    ) -> np.ndarray:
        """Parse ``self.cmvn_file`` into a ``(2, dim)`` float64 array.

        The line following an ``<AddShift>`` marker supplies row 0 and the
        line following a ``<Rescale>`` marker supplies row 1.  Such a line
        must start with ``<LearnRateCoef>``; its first three tokens and its
        last token are dropped and the rest are parsed as floats.

        Raises:
            ValueError: if either vector is missing or their lengths differ.
        """
        with open(self.cmvn_file, "r", encoding="utf-8") as f:
            rows = [line.split() for line in f]

        means_list = []
        vars_list = []
        # Pair each line with its successor; blank lines and a marker on the
        # very last line are skipped instead of raising IndexError.
        for header, values in zip(rows, rows[1:]):
            if not header or not values or values[0] != "<LearnRateCoef>":
                continue
            if header[0] == "<AddShift>":
                means_list = values[3:-1]
            elif header[0] == "<Rescale>":
                vars_list = values[3:-1]

        if not means_list or len(means_list) != len(vars_list):
            raise ValueError(
                f"Could not read matching <AddShift>/<Rescale> vectors from {self.cmvn_file}"
            )

        means = [float(v) for v in means_list]
        vars = [float(v) for v in vars_list]
        cmvn = np.array([means, vars], dtype=np.float64)
        return cmvn
|
172 |
+
|
173 |
+
|
174 |
+
class Fbank(torch.nn.Module):
    """Thin ``torch.nn.Module`` wrapper around ``kaldifeat.Fbank``."""

    def __init__(self, opts):
        """Create the wrapped extractor; ``opts`` is passed straight to ``kaldifeat.Fbank``."""
        super().__init__()
        self.fbank = kaldifeat.Fbank(opts)

    def forward(self, waves: List[torch.Tensor]):
        """Run the wrapped extractor on ``waves`` and return its result unchanged."""
        extractor = self.fbank
        return extractor(waves)
|
181 |
+
|
182 |
+
|
183 |
+
class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.

    This model turns the "wav" input into the "speech" / "speech_lengths"
    outputs: fbank extraction, LFR frame stacking, then CMVN.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name

        Raises
        ------
        pb_utils.TritonModelException
          If the model configuration has no "config_path" parameter.
        """
        self.model_config = model_config = json.loads(args["model_config"])
        self.max_batch_size = max(model_config["max_batch_size"], 1)
        # NOTE(review): the device is fixed to "cuda" and does not look at
        # args["model_instance_device_id"] -- confirm this is intended for
        # multi-GPU deployments.
        self.device = "cuda"

        # Get OUTPUT0 configuration
        output0_config = pb_utils.get_output_config_by_name(model_config, "speech")
        # Convert Triton types to numpy types
        output0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])
        # Anything other than float32 is treated as float16.
        self.output0_dtype = torch.float32 if output0_dtype == np.float32 else torch.float16

        # Get OUTPUT1 configuration
        output1_config = pb_utils.get_output_config_by_name(model_config, "speech_lengths")
        # Convert Triton types to numpy types
        self.output1_dtype = pb_utils.triton_string_to_numpy(output1_config["data_type"])

        # Flatten {key: {"string_value": v}} to {key: v} first, so the lookups
        # below do not depend on the order the parameters are listed in.
        params = {
            key: value["string_value"] for key, value in self.model_config["parameters"].items()
        }

        if "config_path" not in params:
            raise pb_utils.TritonModelException(
                'feature_extractor: missing required "config_path" parameter in model config'
            )
        with open(str(params["config_path"]), "rb") as f:
            # NOTE(review): yaml.Loader can construct arbitrary Python
            # objects; this is only acceptable while the config file is
            # operator-controlled.  Consider yaml.safe_load.
            config = yaml.load(f, Loader=yaml.Loader)

        if "cmvn_path" in params:
            config["frontend_conf"]["cmvn_file"] = str(params["cmvn_path"])

        frontend_conf = config["frontend_conf"]

        opts = kaldifeat.FbankOptions()
        opts.frame_opts.dither = 1.0  # TODO: 0.0 or 1.0
        opts.frame_opts.window_type = frontend_conf["window"]
        opts.mel_opts.num_bins = int(frontend_conf["n_mels"])
        opts.frame_opts.frame_shift_ms = float(frontend_conf["frame_shift"])
        opts.frame_opts.frame_length_ms = float(frontend_conf["frame_length"])
        opts.frame_opts.samp_freq = int(frontend_conf["fs"])
        opts.device = torch.device(self.device)
        self.opts = opts
        self.feature_extractor = Fbank(self.opts)
        self.feature_size = opts.mel_opts.num_bins

        self.frontend = WavFrontend(**frontend_conf)

    def extract_feat(self, waveform_list: List[np.ndarray]) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute stacked, normalised features for a list of waveforms.

        Parameters
        ----------
        waveform_list : list of np.ndarray
          One array per utterance; each is squeezed before feature extraction.

        Returns
        -------
        tuple
          ``(feats, feats_len)``: ``feats`` cast to the configured "speech"
          output dtype, and ``feats_len`` as an int32 tensor of valid frame
          counts after LFR.
        """
        wavs = [
            torch.from_numpy(waveform).float().squeeze().to(self.device)
            for waveform in waveform_list
        ]

        features = self.feature_extractor(wavs)
        features_len = [feature.shape[0] for feature in features]

        # Zero-padded batch holding every utterance's fbank frames.
        speech = torch.zeros(
            (len(features), max(features_len), self.opts.mel_opts.num_bins),
            dtype=self.output0_dtype,
            device=self.device,
        )
        for i, feature in enumerate(features):
            speech[i, : int(features_len[i])] = feature
        speech_lens = torch.tensor(features_len, dtype=torch.int64).to(self.device)

        feats, feats_len = self.frontend.lfr(speech, speech_lens)
        feats_len = feats_len.type(torch.int32)

        feats = self.frontend.apply_cmvn_batch(feats)
        feats = feats.type(self.output0_dtype)

        return feats, feats_len

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        if not requests:
            return []

        batch_count = []
        total_waves = []
        for request in requests:
            input0 = pb_utils.get_input_tensor_by_name(request, "wav")

            # Scale by 2**15 -- presumably to move [-1, 1] float audio into
            # the 16-bit integer range the fbank expects; confirm with the
            # client.
            cur_b_wav = input0.as_numpy() * (1 << 15)  # b x -1
            # The "wav_lens" input is deliberately not used to strip the
            # padding: with different lengths the encoder may not be able to
            # batch the requests.
            batch_count.append(cur_b_wav.shape[0])

            # convert the bx-1 numpy array into a 1x-1 list of arrays
            total_waves.extend(np.expand_dims(row, 0) for row in cur_b_wav)

        # One feature-extraction pass over the utterances of all requests.
        features, feats_len = self.extract_feat(total_waves)

        responses = []
        start = 0
        for batch in batch_count:
            stop = start + batch
            speech = features[start:stop].cpu()
            speech_lengths = feats_len[start:stop].unsqueeze(1).cpu()

            out0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech))
            out1 = pb_utils.Tensor.from_dlpack("speech_lengths", to_dlpack(speech_lengths))
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0, out1]))
            start = stop

        return responses
|
model_repo_sense_voice_small/feature_extractor/am.mvn
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<Nnet>
|
2 |
+
<Splice> 560 560
|
3 |
+
[ 0 ]
|
4 |
+
<AddShift> 560 560
|
5 |
+
<LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 
-8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
|
6 |
+
<Rescale> 560 560
|
7 |
+
<LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 
0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
|
8 |
+
</Nnet>
|
model_repo_sense_voice_small/feature_extractor/config.pbtxt
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
name: "feature_extractor"
|
16 |
+
backend: "python"
|
17 |
+
max_batch_size: 16
|
18 |
+
|
19 |
+
parameters [
|
20 |
+
{
|
21 |
+
key: "num_mel_bins",
|
22 |
+
value: { string_value: "80"}
|
23 |
+
},
|
24 |
+
{
|
25 |
+
key: "frame_shift_in_ms"
|
26 |
+
value: { string_value: "10"}
|
27 |
+
},
|
28 |
+
{
|
29 |
+
key: "frame_length_in_ms"
|
30 |
+
value: { string_value: "25"}
|
31 |
+
},
|
32 |
+
{
|
33 |
+
key: "sample_rate"
|
34 |
+
value: { string_value: "16000"}
|
35 |
+
},
|
36 |
+
{
|
37 |
+
key: "cmvn_path"
|
38 |
+
value: { string_value: "./model_repo_sense_voice_small/feature_extractor/am.mvn"}
|
39 |
+
},
|
40 |
+
{
|
41 |
+
key: "config_path"
|
42 |
+
value: { string_value: "./model_repo_sense_voice_small/feature_extractor/config.yaml"}
|
43 |
+
}
|
44 |
+
|
45 |
+
]
|
46 |
+
|
47 |
+
input [
|
48 |
+
{
|
49 |
+
name: "wav"
|
50 |
+
data_type: TYPE_FP32
|
51 |
+
dims: [-1]
|
52 |
+
},
|
53 |
+
{
|
54 |
+
name: "wav_lens"
|
55 |
+
data_type: TYPE_INT32
|
56 |
+
dims: [1]
|
57 |
+
}
|
58 |
+
]
|
59 |
+
|
60 |
+
output [
|
61 |
+
{
|
62 |
+
name: "speech"
|
63 |
+
data_type: TYPE_FP32
|
64 |
+
dims: [-1, 560] # 560 = 80 mel bins x 7 stacked frames (lfr_m)
|
65 |
+
},
|
66 |
+
{
|
67 |
+
name: "speech_lengths"
|
68 |
+
data_type: TYPE_INT32
|
69 |
+
dims: [1]
|
70 |
+
}
|
71 |
+
]
|
72 |
+
|
73 |
+
dynamic_batching {
|
74 |
+
}
|
75 |
+
|
76 |
+
instance_group [
|
77 |
+
{
|
78 |
+
count: 2
|
79 |
+
kind: KIND_GPU
|
80 |
+
}
|
81 |
+
]
|
model_repo_sense_voice_small/feature_extractor/config.yaml
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
encoder: SenseVoiceEncoderSmall
|
2 |
+
encoder_conf:
|
3 |
+
output_size: 512
|
4 |
+
attention_heads: 4
|
5 |
+
linear_units: 2048
|
6 |
+
num_blocks: 50
|
7 |
+
tp_blocks: 20
|
8 |
+
dropout_rate: 0.1
|
9 |
+
positional_dropout_rate: 0.1
|
10 |
+
attention_dropout_rate: 0.1
|
11 |
+
input_layer: pe
|
12 |
+
pos_enc_class: SinusoidalPositionEncoder
|
13 |
+
normalize_before: true
|
14 |
+
kernel_size: 11
|
15 |
+
sanm_shfit: 0
|
16 |
+
selfattention_layer_type: sanm
|
17 |
+
|
18 |
+
|
19 |
+
model: SenseVoiceSmall
|
20 |
+
model_conf:
|
21 |
+
length_normalized_loss: true
|
22 |
+
sos: 1
|
23 |
+
eos: 2
|
24 |
+
ignore_id: -1
|
25 |
+
|
26 |
+
tokenizer: SentencepiecesTokenizer
|
27 |
+
tokenizer_conf:
|
28 |
+
bpemodel: null
|
29 |
+
unk_symbol: <unk>
|
30 |
+
split_with_space: true
|
31 |
+
|
32 |
+
frontend: WavFrontend
|
33 |
+
frontend_conf:
|
34 |
+
fs: 16000
|
35 |
+
window: hamming
|
36 |
+
n_mels: 80
|
37 |
+
frame_length: 25
|
38 |
+
frame_shift: 10
|
39 |
+
lfr_m: 7
|
40 |
+
lfr_n: 6
|
41 |
+
cmvn_file: null
|
42 |
+
|
43 |
+
|
44 |
+
dataset: SenseVoiceCTCDataset
|
45 |
+
dataset_conf:
|
46 |
+
index_ds: IndexDSJsonl
|
47 |
+
batch_sampler: EspnetStyleBatchSampler
|
48 |
+
data_split_num: 32
|
49 |
+
batch_type: token
|
50 |
+
batch_size: 14000
|
51 |
+
max_token_length: 2000
|
52 |
+
min_token_length: 60
|
53 |
+
max_source_length: 2000
|
54 |
+
min_source_length: 60
|
55 |
+
max_target_length: 200
|
56 |
+
min_target_length: 0
|
57 |
+
shuffle: true
|
58 |
+
num_workers: 4
|
59 |
+
sos: ${model_conf.sos}
|
60 |
+
eos: ${model_conf.eos}
|
61 |
+
IndexDSJsonl: IndexDSJsonl
|
62 |
+
retry: 20
|
63 |
+
|
64 |
+
train_conf:
|
65 |
+
accum_grad: 1
|
66 |
+
grad_clip: 5
|
67 |
+
max_epoch: 20
|
68 |
+
keep_nbest_models: 10
|
69 |
+
avg_nbest_model: 10
|
70 |
+
log_interval: 100
|
71 |
+
resume: true
|
72 |
+
validate_interval: 10000
|
73 |
+
save_checkpoint_interval: 10000
|
74 |
+
|
75 |
+
optim: adamw
|
76 |
+
optim_conf:
|
77 |
+
lr: 0.00002
|
78 |
+
scheduler: warmuplr
|
79 |
+
scheduler_conf:
|
80 |
+
warmup_steps: 25000
|
81 |
+
|
82 |
+
specaug: SpecAugLFR
|
83 |
+
specaug_conf:
|
84 |
+
apply_time_warp: false
|
85 |
+
time_warp_window: 5
|
86 |
+
time_warp_mode: bicubic
|
87 |
+
apply_freq_mask: true
|
88 |
+
freq_mask_width_range:
|
89 |
+
- 0
|
90 |
+
- 30
|
91 |
+
lfr_rate: 6
|
92 |
+
num_freq_mask: 1
|
93 |
+
apply_time_mask: true
|
94 |
+
time_mask_width_range:
|
95 |
+
- 0
|
96 |
+
- 12
|
97 |
+
num_time_mask: 1
|
model_repo_sense_voice_small/scoring/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (4.32 kB). View file
|
|
model_repo_sense_voice_small/scoring/1/model.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
#
|
3 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
4 |
+
#
|
5 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6 |
+
# you may not use this file except in compliance with the License.
|
7 |
+
# You may obtain a copy of the License at
|
8 |
+
#
|
9 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10 |
+
#
|
11 |
+
# Unless required by applicable law or agreed to in writing, software
|
12 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14 |
+
# See the License for the specific language governing permissions and
|
15 |
+
# limitations under the License.
|
16 |
+
|
17 |
+
import triton_python_backend_utils as pb_utils
|
18 |
+
import numpy as np
|
19 |
+
import torch
|
20 |
+
from torch.utils.dlpack import from_dlpack
|
21 |
+
|
22 |
+
import json
|
23 |
+
import os
|
24 |
+
import yaml
|
25 |
+
|
26 |
+
import sentencepiece as spm
|
27 |
+
|
28 |
+
class TritonPythonModel:
    """Triton Python-backend model that turns CTC logits into transcripts.

    Triton requires every Python model to expose a class with exactly this
    name. The model performs greedy CTC decoding (per-frame argmax, collapse
    of consecutive repeats, removal of blank/EOS ids) and detokenizes the
    result with a SentencePiece model.
    """

    # Token ids dropped after collapsing repeats: 0 is treated as the CTC
    # blank and 2 as EOS (matches `eos: 2` in the model configuration).
    _SKIPPED_TOKEN_IDS = (0, 2)

    def initialize(self, args):
        """Load the model configuration and the tokenizer.

        Called once by Triton when the model is loaded.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        self.model_config = model_config = json.loads(args["model_config"])
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        # Look up OUTPUT0 and convert its Triton data type to a numpy dtype.
        output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0")
        self.out0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])

        self.init_tokenizer(self.model_config["parameters"])

    def init_tokenizer(self, parameters):
        """Create the SentencePiece tokenizer named by ``tokenizer_path``.

        Parameters
        ----------
        parameters : dict
          The ``parameters`` section of the model configuration, mapping each
          key to a dict that holds the value under ``"string_value"``.

        Raises
        ------
        ValueError
          If no non-empty ``tokenizer_path`` parameter is configured.
        """
        tokenizer_path = None
        for key, value in parameters.items():
            if key == "tokenizer_path":
                tokenizer_path = value["string_value"]

        # Fail with an actionable message instead of an unbound-variable error.
        if not tokenizer_path:
            raise ValueError(
                'model parameter "tokenizer_path" is missing or empty in config.pbtxt'
            )

        self.tokenizer = spm.SentencePieceProcessor()
        self.tokenizer.Load(tokenizer_path)

    @staticmethod
    def _greedy_ctc_decode(logits, lengths=None, skipped_ids=(0, 2)):
        """Greedy CTC decoding of a batch of logits, one row at a time.

        Parameters
        ----------
        logits : torch.Tensor
          Tensor indexed as (row, frame, vocabulary).
        lengths : sequence of int, optional
          Number of valid frames for each row. Rows without an entry, or all
          rows when this is None, are decoded over every frame.
        skipped_ids : tuple of int
          Token ids removed after consecutive repeats are collapsed.

        Returns
        -------
        list of list of int
          One token-id list per row.
        """
        best_ids = logits.argmax(dim=-1)

        decoded = []
        for row, frame_ids in enumerate(best_ids):
            if lengths is not None and row < len(lengths):
                # Ignore padded frames past the valid length of this row.
                frame_ids = frame_ids[: max(int(lengths[row]), 0)]
            if frame_ids.numel() == 0:
                decoded.append([])
                continue
            # Collapse repeats on the 1-D row. Passing `dim` on the 2-D batch
            # would de-duplicate whole columns instead of each row.
            collapsed = torch.unique_consecutive(frame_ids).tolist()
            decoded.append([tok for tok in collapsed if tok not in skipped_ids])
        return decoded

    def execute(self, requests):
        """Decode every request and return one response per request.

        Each request is decoded independently, so requests whose logits have
        different frame counts can be served in the same call.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list is
          the same as `requests`
        """
        responses = []
        for request in requests:
            in_0 = pb_utils.get_input_tensor_by_name(request, "ctc_logits")
            logits = from_dlpack(in_0.to_dlpack())

            # Valid frame counts, when supplied, keep padding out of the text.
            lengths = None
            lens_tensor = pb_utils.get_input_tensor_by_name(request, "encoder_out_lens")
            if lens_tensor is not None:
                lengths = from_dlpack(lens_tensor.to_dlpack()).reshape(-1).tolist()

            token_int_batch = self._greedy_ctc_decode(
                logits, lengths, self._SKIPPED_TOKEN_IDS
            )
            hyps = [self.tokenizer.DecodeIds(token_int) for token_int in token_int_batch]

            sents = np.array(hyps)
            out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype))
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))

        return responses

    def finalize(self):
        """Log that the model is being unloaded.

        Called once by Triton when the model is unloaded.
        """
        print("Cleaning up...")
|
model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
|
3 |
+
size 377341
|
model_repo_sense_voice_small/scoring/config.pbtxt
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
name: "scoring"
|
16 |
+
backend: "python"
|
17 |
+
max_batch_size: 16
|
18 |
+
|
19 |
+
parameters [
|
20 |
+
{
|
21 |
+
key: "tokenizer_path",
|
22 |
+
value: { string_value: "./model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model"}
|
23 |
+
},
|
24 |
+
{ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
|
25 |
+
value: {string_value:"no"}
|
26 |
+
}
|
27 |
+
]
|
28 |
+
|
29 |
+
|
30 |
+
input [
|
31 |
+
{
|
32 |
+
name: "ctc_logits"
|
33 |
+
data_type: TYPE_FP32
|
34 |
+
dims: [-1, 25055]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
name: "encoder_out_lens"
|
38 |
+
data_type: TYPE_INT32
|
39 |
+
dims: [1]
|
40 |
+
reshape: { shape: [ ] }
|
41 |
+
}
|
42 |
+
]
|
43 |
+
|
44 |
+
output [
|
45 |
+
{
|
46 |
+
name: "OUTPUT0"
|
47 |
+
data_type: TYPE_STRING
|
48 |
+
dims: [1]
|
49 |
+
}
|
50 |
+
]
|
51 |
+
|
52 |
+
dynamic_batching {
|
53 |
+
}
|
54 |
+
instance_group [
|
55 |
+
{
|
56 |
+
count: 2
|
57 |
+
kind: KIND_CPU
|
58 |
+
}
|
59 |
+
]
|
model_repo_sense_voice_small/sensevoice/1/.gitkeep
ADDED
File without changes
|
model_repo_sense_voice_small/sensevoice/config.pbtxt
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
name: "sensevoice"
|
16 |
+
platform: "ensemble"
|
17 |
+
max_batch_size: 16
|
18 |
+
|
19 |
+
input [
|
20 |
+
{
|
21 |
+
name: "WAV"
|
22 |
+
data_type: TYPE_FP32
|
23 |
+
dims: [-1]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
name: "WAV_LENS"
|
27 |
+
data_type: TYPE_INT32
|
28 |
+
dims: [1]
|
29 |
+
},
|
30 |
+
{
|
31 |
+
name: "LANGUAGE"
|
32 |
+
data_type: TYPE_INT32
|
33 |
+
dims: [1]
|
34 |
+
},
|
35 |
+
{
|
36 |
+
name: "TEXT_NORM"
|
37 |
+
data_type: TYPE_INT32
|
38 |
+
dims: [1]
|
39 |
+
}
|
40 |
+
]
|
41 |
+
|
42 |
+
output [
|
43 |
+
{
|
44 |
+
name: "TRANSCRIPTS"
|
45 |
+
data_type: TYPE_STRING
|
46 |
+
dims: [1]
|
47 |
+
}
|
48 |
+
]
|
49 |
+
|
50 |
+
ensemble_scheduling {
|
51 |
+
step [
|
52 |
+
{
|
53 |
+
model_name: "feature_extractor"
|
54 |
+
model_version: -1
|
55 |
+
input_map {
|
56 |
+
key: "wav"
|
57 |
+
value: "WAV"
|
58 |
+
}
|
59 |
+
input_map {
|
60 |
+
key: "wav_lens"
|
61 |
+
value: "WAV_LENS"
|
62 |
+
}
|
63 |
+
output_map {
|
64 |
+
key: "speech"
|
65 |
+
value: "SPEECH"
|
66 |
+
}
|
67 |
+
output_map {
|
68 |
+
key: "speech_lengths"
|
69 |
+
value: "SPEECH_LENGTHS"
|
70 |
+
}
|
71 |
+
},
|
72 |
+
{
|
73 |
+
model_name: "encoder"
|
74 |
+
model_version: -1
|
75 |
+
input_map {
|
76 |
+
key: "speech"
|
77 |
+
value: "SPEECH"
|
78 |
+
}
|
79 |
+
input_map {
|
80 |
+
key: "speech_lengths"
|
81 |
+
value: "SPEECH_LENGTHS"
|
82 |
+
}
|
83 |
+
input_map {
|
84 |
+
key: "language"
|
85 |
+
value: "LANGUAGE"
|
86 |
+
}
|
87 |
+
input_map {
|
88 |
+
key: "textnorm"
|
89 |
+
value: "TEXT_NORM"
|
90 |
+
}
|
91 |
+
output_map {
|
92 |
+
key: "ctc_logits"
|
93 |
+
value: "ctc_logits"
|
94 |
+
}
|
95 |
+
output_map {
|
96 |
+
key: "encoder_out_lens"
|
97 |
+
value: "encoder_out_lens"
|
98 |
+
}
|
99 |
+
},
|
100 |
+
{
|
101 |
+
model_name: "scoring"
|
102 |
+
model_version: -1
|
103 |
+
input_map {
|
104 |
+
key: "ctc_logits"
|
105 |
+
value: "ctc_logits"
|
106 |
+
}
|
107 |
+
input_map {
|
108 |
+
key: "encoder_out_lens"
|
109 |
+
value: "encoder_out_lens"
|
110 |
+
}
|
111 |
+
output_map {
|
112 |
+
key: "OUTPUT0"
|
113 |
+
value: "TRANSCRIPTS"
|
114 |
+
}
|
115 |
+
}
|
116 |
+
]
|
117 |
+
}
|
run.sh
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Expose only GPU 0 to the processes started below.
export CUDA_VISIBLE_DEVICES=0
|
2 |
+
# Start Triton on the local model repository; the pool sizes are given in
# bytes, and the CUDA pool is specified as <device id>:<bytes> (device 0).
tritonserver --model-repository=./model_repo_sense_voice_small \
|
3 |
+
--pinned-memory-pool-byte-size=2048000000 --cuda-memory-pool-byte-size=0:4096000000
|