yessenzhar committed
Commit: dd20dba
Parent(s): 2d64b65

update models for newer trt 0.6.1 version

Files changed:
- ensemble/config.pbtxt +121 -12
- postprocessing/1/model.py +38 -43
- postprocessing/config.pbtxt +35 -3
- preprocessing/1/model.py +117 -43
- preprocessing/config.pbtxt +31 -5
- tensorrt_llm/1/config.json +1 -0
- tensorrt_llm/config.pbtxt +97 -15
ensemble/config.pbtxt CHANGED

@@ -26,7 +26,7 @@
 
 name: "ensemble"
 platform: "ensemble"
-max_batch_size:
+max_batch_size: 64
 input [
   {
     name: "text_input"
@@ -35,34 +35,36 @@ input [
   },
   {
     name: "max_tokens"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ -1 ]
   },
   {
     name: "bad_words"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "stop_words"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "end_id"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
   {
     name: "pad_id"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
   {
     name: "top_k"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -92,7 +94,7 @@ input [
   },
   {
     name: "min_length"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -108,9 +110,15 @@ input [
     dims: [ 1 ]
     optional: true
   },
+  {
+    name: "return_log_probs"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "beam_width"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -119,18 +127,47 @@ input [
     data_type: TYPE_BOOL
     dims: [ 1 ]
     optional: true
+  },
+  {
+    name: "prompt_embedding_table"
+    data_type: TYPE_FP16
+    dims: [ -1, -1 ]
+    optional: true
+  },
+  {
+    name: "prompt_vocab_size"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "embedding_bias_words"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+    optional: true
+  },
+  {
+    name: "embedding_bias_weights"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+    optional: true
   }
 ]
 output [
   {
     name: "text_output"
     data_type: TYPE_STRING
-    dims: [ -1
+    dims: [ -1 ]
   },
   {
-    name: "
-    data_type:
+    name: "cum_log_probs"
+    data_type: TYPE_FP32
     dims: [ -1 ]
+  },
+  {
+    name: "output_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
   }
 ]
 ensemble_scheduling {
@@ -154,6 +191,14 @@ ensemble_scheduling {
        key: "STOP_WORDS_DICT"
        value: "stop_words"
      }
+      input_map {
+        key: "EMBEDDING_BIAS_WORDS"
+        value: "embedding_bias_words"
+      }
+      input_map {
+        key: "EMBEDDING_BIAS_WEIGHTS"
+        value: "embedding_bias_weights"
+      }
      output_map {
        key: "REQUEST_INPUT_LEN"
        value: "_REQUEST_INPUT_LEN"
@@ -166,6 +211,18 @@ ensemble_scheduling {
        key: "REQUEST_OUTPUT_LEN"
        value: "_REQUEST_OUTPUT_LEN"
      }
+      output_map {
+        key: "STOP_WORDS_IDS"
+        value: "_STOP_WORDS_IDS"
+      }
+      output_map {
+        key: "BAD_WORDS_IDS"
+        value: "_BAD_WORDS_IDS"
+      }
+      output_map {
+        key: "EMBEDDING_BIAS"
+        value: "_EMBEDDING_BIAS"
+      }
    },
    {
      model_name: "tensorrt_llm"
@@ -190,6 +247,10 @@ ensemble_scheduling {
        key: "pad_id"
        value: "pad_id"
      }
+      input_map {
+        key: "embedding_bias"
+        value: "_EMBEDDING_BIAS"
+      }
      input_map {
        key: "runtime_top_k"
        value: "top_k"
@@ -222,6 +283,10 @@ ensemble_scheduling {
        key: "random_seed"
        value: "random_seed"
      }
+      input_map {
+        key: "return_log_probs"
+        value: "return_log_probs"
+      }
      input_map {
        key: "beam_width"
        value: "beam_width"
@@ -230,10 +295,38 @@ ensemble_scheduling {
        key: "streaming"
        value: "stream"
      }
+      input_map {
+        key: "prompt_embedding_table"
+        value: "prompt_embedding_table"
+      }
+      input_map {
+        key: "prompt_vocab_size"
+        value: "prompt_vocab_size"
+      }
+      input_map {
+        key: "stop_words_list"
+        value: "_STOP_WORDS_IDS"
+      }
+      input_map {
+        key: "bad_words_list"
+        value: "_BAD_WORDS_IDS"
+      }
      output_map {
        key: "output_ids"
        value: "_TOKENS_BATCH"
      }
+      output_map {
+        key: "sequence_length"
+        value: "_SEQUENCE_LENGTH"
+      },
+      output_map {
+        key: "cum_log_probs"
+        value: "_CUM_LOG_PROBS"
+      }
+      output_map {
+        key: "output_log_probs"
+        value: "_OUTPUT_LOG_PROBS"
+      }
    },
    {
      model_name: "postprocessing"
@@ -242,13 +335,29 @@ ensemble_scheduling {
        key: "TOKENS_BATCH"
        value: "_TOKENS_BATCH"
      }
+      input_map {
+        key: "CUM_LOG_PROBS"
+        value: "_CUM_LOG_PROBS"
+      }
+      input_map {
+        key: "OUTPUT_LOG_PROBS"
+        value: "_OUTPUT_LOG_PROBS"
+      }
+      input_map {
+        key: "SEQUENCE_LENGTH"
+        value: "_SEQUENCE_LENGTH"
+      }
      output_map {
        key: "OUTPUT"
        value: "text_output"
      }
      output_map {
-        key: "
-        value: "
+        key: "OUT_OUTPUT_LOG_PROBS"
+        value: "output_log_probs"
+      }
+      output_map {
+        key: "OUT_CUM_LOG_PROBS"
+        value: "cum_log_probs"
      }
    }
  ]
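For illustration only (not part of the commit): with these ensemble changes in place, a client can request the new log-probability outputs alongside the generated text. The sketch below assumes a Triton server serving this repository at localhost:8001 with tritonclient installed; since tensorrt_llm/config.pbtxt sets decoupled: True, it uses the gRPC streaming API. Error handling is elided.

import queue
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient

def make_input(name, dtype, data):
    # Shapes include a batch dimension now that max_batch_size is 64.
    t = grpcclient.InferInput(name, list(data.shape), dtype)
    t.set_data_from_numpy(data)
    return t

results = queue.Queue()

def callback(q, result, error):
    # Streaming callback: either a result or an error arrives per response.
    q.put(error if error is not None else result)

client = grpcclient.InferenceServerClient(url="localhost:8001")
client.start_stream(callback=partial(callback, results))
client.async_stream_infer(
    "ensemble",
    inputs=[
        # TYPE_STRING maps to BYTES on the client side.
        make_input("text_input", "BYTES",
                   np.array([["What is machine learning?"]], dtype=object)),
        make_input("max_tokens", "INT32", np.array([[64]], dtype=np.int32)),
        make_input("return_log_probs", "BOOL", np.array([[True]], dtype=bool)),
        make_input("stream", "BOOL", np.array([[False]], dtype=bool)),
    ],
    outputs=[
        grpcclient.InferRequestedOutput("text_output"),
        grpcclient.InferRequestedOutput("cum_log_probs"),
    ])
result = results.get()
client.stop_stream()
print(result.as_numpy("text_output"), result.as_numpy("cum_log_probs"))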
postprocessing/1/model.py CHANGED

@@ -57,13 +57,18 @@ class TritonPythonModel:
             'string_value']
         tokenizer_type = model_config['parameters']['tokenizer_type'][
             'string_value']
+        self.skip_special_tokens = model_config['parameters'].get(
+            'skip_special_tokens',
+            {'string_value': "true"})['string_value'].lower() in [
+                'true', '1', 't', 'y', 'yes'
+            ]
 
         if tokenizer_type == 't5':
             self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
                                          padding_side='left')
         elif tokenizer_type == 'auto':
-            self.tokenizer = AutoTokenizer.from_pretrained(
-
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer_dir, padding_side='left', trust_remote_code=True)
         elif tokenizer_type == 'llama':
             self.tokenizer = LlamaTokenizer.from_pretrained(
                 tokenizer_dir, legacy=False, padding_side='left')
@@ -72,8 +77,6 @@ class TritonPythonModel:
                 f'Unexpected tokenizer type: {tokenizer_type}')
         self.tokenizer.pad_token = self.tokenizer.eos_token
 
-        self._init_token_map()
-
         # Parse model output configs
         output_config = pb_utils.get_output_config_by_name(
             model_config, "OUTPUT")
@@ -84,20 +87,6 @@ class TritonPythonModel:
         output_lens_config = pb_utils.get_output_config_by_name(
             model_config, "OUTPUT_LENS")
 
-        # Convert Triton types to numpy types
-        self.output_lens_dtype = pb_utils.triton_string_to_numpy(
-            output_lens_config['data_type'])
-
-    def _init_token_map(self):
-        v = self.tokenizer.get_vocab()
-        self.token_map = [None] * len(v)
-        for k, val in v.items():
-            self.token_map[val] = k
-
-        for i in range(len(v)):
-            if self.token_map[i] is None:
-                print("error %s" % i)
-
     def execute(self, requests):
         """`execute` must be implemented in every Python model. `execute`
         function receives a list of pb_utils.InferenceRequest as the only
@@ -127,21 +116,37 @@ class TritonPythonModel:
             tokens_batch = pb_utils.get_input_tensor_by_name(
                 request, 'TOKENS_BATCH').as_numpy()
 
+            # Get sequence length
+            sequence_lengths = pb_utils.get_input_tensor_by_name(
+                request, 'SEQUENCE_LENGTH').as_numpy()
+
+            # Get cum log probs
+            cum_log_probs = pb_utils.get_input_tensor_by_name(
+                request, 'CUM_LOG_PROBS').as_numpy()
+
+            # Get output log probs
+            output_log_probs = pb_utils.get_input_tensor_by_name(
+                request, 'OUTPUT_LOG_PROBS').as_numpy()
+
             # Reshape Input
             # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
             # tokens_batch = tokens_batch.T
 
             # Postprocessing output data.
-            outputs, output_lens = self._postprocessing(tokens_batch)
+            outputs = self._postprocessing(tokens_batch, sequence_lengths)
+
 
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
             output_tensor = pb_utils.Tensor(
                 'OUTPUT',
                 np.array(outputs).astype(self.output_dtype))
-
-
-
+
+            out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
+                                                cum_log_probs)
+
+            out_output_log_probs = pb_utils.Tensor('OUT_OUTPUT_LOG_PROBS',
+                                                   output_log_probs)
 
             # Create InferenceResponse. You can set an error here in case
             # there was a problem with handling this inference request.
@@ -150,8 +155,9 @@ class TritonPythonModel:
             #
             # pb_utils.InferenceResponse(
             #     output_tensors=..., TritonError("An error occurred"))
-            inference_response = pb_utils.InferenceResponse(
-
+            inference_response = pb_utils.InferenceResponse(output_tensors=[
+                output_tensor, out_cum_log_probs, out_output_log_probs
+            ])
             responses.append(inference_response)
 
             # You should return a list of pb_utils.InferenceResponse. Length
@@ -165,24 +171,13 @@ class TritonPythonModel:
         """
         print('Cleaning up...')
 
-    def
-        st = self.token_map[token]
-        if st[0] == '▁':
-            return " " + st[1:]
-        else:
-            return self.tokenizer.decode([token])
-
-    def _postprocessing(self, tokens_batch):
+    def _postprocessing(self, tokens_batch, sequence_lengths):
         outputs = []
-
-
-
-
-
-
-            else:
-                output = self.tokenizer.decode(tokens)
+        for batch_idx, beam_tokens in enumerate(tokens_batch):
+            for beam_idx, tokens in enumerate(beam_tokens):
+                seq_len = sequence_lengths[batch_idx][beam_idx]
+                output = self.tokenizer.decode(
+                    tokens[:seq_len],
+                    skip_special_tokens=self.skip_special_tokens)
                 outputs.append(output.encode('utf8'))
-
-        output_lens.append(total_len)
-        return outputs, output_lens
+        return outputs
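The rewritten _postprocessing above replaces the old vocabulary token map with a trim-then-decode loop over a [batch, beam, token] array, cutting each beam at its sequence length before decoding. A standalone sketch of that loop (illustrative only; a stub stands in for the HF tokenizer):

import numpy as np

class StubTokenizer:
    # Stand-in for the real HF tokenizer used by the model.
    def decode(self, ids, skip_special_tokens=True):
        return " ".join(f"<tok{i}>" for i in ids)

tokenizer = StubTokenizer()
tokens_batch = np.array([[[5, 7, 9, 0, 0]]])  # [batch=1, beam=1, len=5], right-padded
sequence_lengths = np.array([[3]])            # only the first 3 tokens are real

outputs = []
for batch_idx, beam_tokens in enumerate(tokens_batch):
    for beam_idx, tokens in enumerate(beam_tokens):
        seq_len = sequence_lengths[batch_idx][beam_idx]
        # Trimming to seq_len drops the padding before decoding.
        outputs.append(tokenizer.decode(tokens[:seq_len]).encode('utf8'))

print(outputs)  # [b'<tok5> <tok7> <tok9>']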
postprocessing/config.pbtxt CHANGED

@@ -32,17 +32,42 @@ input [
     name: "TOKENS_BATCH"
     data_type: TYPE_INT32
     dims: [ -1, -1 ]
+  },
+  {
+    name: "SEQUENCE_LENGTH"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
+  {
+    name: "CUM_LOG_PROBS"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "OUTPUT_LOG_PROBS"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
   }
 ]
 output [
   {
     name: "OUTPUT"
     data_type: TYPE_STRING
+    dims: [ -1 ]
+  },
+  {
+    name: "OUT_CUM_LOG_PROBS"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "OUT_OUTPUT_LOG_PROBS"
+    data_type: TYPE_FP32
     dims: [ -1, -1 ]
   },
   {
     name: "OUTPUT_LENS"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ -1 ]
   }
 ]
@@ -50,7 +75,7 @@ output [
 parameters {
   key: "tokenizer_dir"
   value: {
-    string_value: "
+    string_value: "/data/llama/Llama-2-70b-chat-hf/"
   }
 }
 
@@ -61,9 +86,16 @@ parameters {
   }
 }
 
+parameters {
+  key: "skip_special_tokens"
+  value: {
+    string_value: "True"
+  }
+}
+
 instance_group [
   {
-    count:
+    count: 4
     kind: KIND_CPU
   }
 ]
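The new skip_special_tokens parameter reaches model.py as a string, and the model treats any of 'true', '1', 't', 'y', 'yes' (case-insensitive) as true, so the "True" above parses as expected. A minimal illustration of that convention (mirrors the model.py logic; not part of the commit):

def parse_bool_param(params: dict, key: str, default: str) -> bool:
    # Same lookup-and-compare pattern the model uses on Triton parameters.
    return params.get(key, {'string_value': default})['string_value'].lower() in [
        'true', '1', 't', 'y', 'yes'
    ]

params = {'skip_special_tokens': {'string_value': "True"}}
print(parse_bool_param(params, 'skip_special_tokens', "true"))  # True
print(parse_bool_param({}, 'skip_special_tokens', "false"))     # False (default)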
preprocessing/1/model.py CHANGED

@@ -24,14 +24,11 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-import csv
 import json
 from typing import List
 
 import numpy as np
-import torch
 import triton_python_backend_utils as pb_utils
-from torch.nn.utils.rnn import pad_sequence
 from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
 
 
@@ -61,13 +58,18 @@ class TritonPythonModel:
             'string_value']
         tokenizer_type = model_config['parameters']['tokenizer_type'][
             'string_value']
+        self.add_special_tokens = model_config['parameters'].get(
+            'add_special_tokens',
+            {'string_value': "false"})['string_value'].lower() in [
+                'true', '1', 't', 'y', 'yes'
+            ]
 
         if tokenizer_type == 't5':
             self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
                                          padding_side='left')
         elif tokenizer_type == 'auto':
-            self.tokenizer = AutoTokenizer.from_pretrained(
-
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer_dir, padding_side='left', trust_remote_code=True)
         elif tokenizer_type == 'llama':
             self.tokenizer = LlamaTokenizer.from_pretrained(
                 tokenizer_dir, legacy=False, padding_side='left')
@@ -80,17 +82,26 @@ class TritonPythonModel:
                 add_special_tokens=False)[0]
 
         # Parse model output configs and convert Triton types to numpy types
-
+        output_names = [
             "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS"
         ]
+        input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
         for input_name in input_names:
             setattr(
                 self,
                 input_name.lower() + "_dtype",
                 pb_utils.triton_string_to_numpy(
-                    pb_utils.
+                    pb_utils.get_input_config_by_name(
                         model_config, input_name)['data_type']))
 
+        for output_name in output_names:
+            setattr(
+                self,
+                output_name.lower() + "_dtype",
+                pb_utils.triton_string_to_numpy(
+                    pb_utils.get_output_config_by_name(
+                        model_config, output_name)['data_type']))
+
     def execute(self, requests):
         """`execute` must be implemented in every Python model. `execute`
         function receives a list of pb_utils.InferenceRequest as the only
@@ -115,48 +126,73 @@ class TritonPythonModel:
 
         # Every Python backend must iterate over everyone of the requests
         # and create a pb_utils.InferenceResponse for each of them.
+        logger = pb_utils.Logger
         for idx, request in enumerate(requests):
             # Get input tensors
             query = pb_utils.get_input_tensor_by_name(request,
                                                       'QUERY').as_numpy()
+            batch_dim = query.shape[0]
+            if batch_dim != 1:
+
+                err_str = "Inflight batching backend expects requests with batch size of 1."
+                logger.log_error(err_str)
+                responses.append(
+                    pb_utils.InferenceResponse(
+                        output_tensors=[],
+                        error=pb_utils.TritonError(err_str)))
+                continue
+
             request_output_len = pb_utils.get_input_tensor_by_name(
                 request, 'REQUEST_OUTPUT_LEN').as_numpy()
 
             bad_words_dict = pb_utils.get_input_tensor_by_name(
-                request, 'BAD_WORDS_DICT')
+                request, 'BAD_WORDS_DICT')
+            if bad_words_dict is not None:
+                bad_words_dict = bad_words_dict.as_numpy()
+
             stop_words_dict = pb_utils.get_input_tensor_by_name(
-                request, 'STOP_WORDS_DICT')
+                request, 'STOP_WORDS_DICT')
+            if stop_words_dict is not None:
+                stop_words_dict = stop_words_dict.as_numpy()
+
+            embedding_bias_words = pb_utils.get_input_tensor_by_name(
+                request, 'EMBEDDING_BIAS_WORDS')
+            if embedding_bias_words is not None:
+                embedding_bias_words = embedding_bias_words.as_numpy()
+
+            embedding_bias_weights = pb_utils.get_input_tensor_by_name(
+                request, 'EMBEDDING_BIAS_WEIGHTS')
+            if embedding_bias_weights is not None:
+                embedding_bias_weights = embedding_bias_weights.as_numpy()
 
             # Preprocessing input data.
             input_id, request_input_len = self._create_request(query)
             bad_words = self._to_word_list_format(bad_words_dict)
             stop_words = self._to_word_list_format(stop_words_dict)
 
+            embedding_bias = self._get_embedding_bias(
+                embedding_bias_words, embedding_bias_weights,
+                self.embedding_bias_weights_dtype)
+
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
             input_id_tensor = pb_utils.Tensor(
-                'INPUT_ID',
-                np.array(input_id).astype(self.input_id_dtype))
+                'INPUT_ID', input_id.astype(self.input_id_dtype))
             request_input_len_tensor = pb_utils.Tensor(
                 'REQUEST_INPUT_LEN',
-
-                    self.request_input_len_dtype))
+                request_input_len.astype(self.request_input_len_dtype))
             request_output_len_tensor = pb_utils.Tensor(
                 'REQUEST_OUTPUT_LEN', request_output_len)
             bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
             stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
                                                     stop_words)
+            embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
+                                                    embedding_bias)
 
-            # Create InferenceResponse. You can set an error here in case
-            # there was a problem with handling this inference request.
-            # Below is an example of how you can set errors in inference
-            # response:
-            #
-            # pb_utils.InferenceResponse(
-            #     output_tensors=..., TritonError("An error occurred"))
             inference_response = pb_utils.InferenceResponse(output_tensors=[
                 input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor,
-                request_input_len_tensor, request_output_len_tensor
+                request_input_len_tensor, request_output_len_tensor,
+                embedding_bias_tensor
             ])
             responses.append(inference_response)
 
@@ -176,44 +212,48 @@ class TritonPythonModel:
             query : batch string (2D numpy array)
         """
         start_ids = [
-
+            np.array(
+                self.tokenizer.encode(
+                    s[0].decode(),
+                    add_special_tokens=self.add_special_tokens)).astype(int)
             for s in query
         ]
-        start_lengths =
+        start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)
 
-
-
-
-
-
+        max_len = 0
+        for seq in start_ids:
+            max_len = max(max_len, seq.shape[0])
+        start_ids = np.stack([
+            np.pad(seq, (0, max_len - seq.shape[0]),
+                   'constant',
+                   constant_values=(0, self.pad_id)) for seq in start_ids
+        ])
 
         return start_ids, start_lengths
 
-    def _to_word_list_format(self,
+    def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
         '''
-        format
-        len(
-
-        len(word_dict[i]) must be 1, which means it only contains 1 string
-        This string can contains several sentences and split by ",".
-        For example, if word_dict[2] = " I am happy, I am sad", then this function will return
-        the ids for two short sentences " I am happy" and " I am sad".
+        word_lists format:
+            len(word_lists) == batch_size
+            word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
         '''
         assert self.tokenizer != None, "need to set tokenizer"
 
+        if word_lists is None:
+            # Return an empty array of shape (1,2,0)
+            return np.empty([1, 2, 0], dtype="int32")
+
         flat_ids = []
         offsets = []
-        for
+        for word_list in word_lists:
             item_flat_ids = []
             item_offsets = []
 
-
-
-
-            words = list(csv.reader(word_dict_item))[0]
-            for word in words:
-                ids = self.tokenizer.encode(word)
+            for word in word_list:
+                if isinstance(word, bytes):
+                    word = word.decode()
 
+                ids = self.tokenizer.encode(word, add_special_tokens=False)
                 if len(ids) == 0:
                     continue
 
@@ -233,3 +273,37 @@ class TritonPythonModel:
 
         return np.array([flat_ids, offsets], dtype="int32").transpose(
             (1, 0, 2))
+
+    def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
+                            bias_dtype):
+
+        assert self.tokenizer != None, "need to set tokenizer"
+
+        if embedding_bias_words is None or embedding_bias_weights is None:
+            return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype)
+
+        batch_embedding_bias = []
+        for words, weights in zip(embedding_bias_words,
+                                  embedding_bias_weights):
+
+            vocab_size = self.tokenizer.vocab_size
+            embedding_bias = [0.] * vocab_size
+
+            assert len(words) == len(
+                weights
+            ), "Embedding bias words must have same dimension as embedding bias weights"
+
+            for word, weight in zip(words, weights):
+                if isinstance(word, bytes):
+                    word = word.decode()
+                ids = self.tokenizer.encode(word)
+
+                if len(ids) == 0:
+                    continue
+
+                for id in ids:
+                    embedding_bias[id] += weight
+
+            batch_embedding_bias.append(np.array(embedding_bias))
+
+        return np.array(batch_embedding_bias, dtype=bias_dtype)
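For illustration (not part of the commit): following the upstream helper's convention, _to_word_list_format encodes each batch item as a [2, max_len] int32 pair of rows, row 0 holding the concatenated token ids and row 1 the cumulative end offsets padded with -1; the stacked result is [batch, 2, max_len], which is what the tensorrt_llm model's stop_words_list and bad_words_list inputs expect. A hand-worked example with made-up token ids:

import numpy as np

# Suppose batch item 0 has stop words whose (made-up) token ids are
# [17, 3] and [42]; real ids would come from the tokenizer.
item_flat_ids = [17, 3, 42]       # ids of word 0, then word 1, concatenated
item_offsets = np.cumsum([2, 1])  # word lengths 2 and 1 -> end offsets [2, 3]

pad_to = len(item_flat_ids)       # both rows are padded to a common length
offsets_row = np.pad(item_offsets, (0, pad_to - len(item_offsets)),
                     constant_values=-1)

word_list = np.array([[item_flat_ids, offsets_row]], dtype="int32")
print(word_list.shape)  # (1, 2, 3)
print(word_list)
# [[[17  3 42]
#   [ 2  3 -1]]]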
preprocessing/config.pbtxt CHANGED

@@ -33,20 +33,34 @@ input [
     data_type: TYPE_STRING
     dims: [ -1 ]
   },
+  {
+    name: "REQUEST_OUTPUT_LEN"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
   {
     name: "BAD_WORDS_DICT"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "STOP_WORDS_DICT"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
-    name: "
-    data_type:
+    name: "EMBEDDING_BIAS_WORDS"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+    optional: true
+  },
+  {
+    name: "EMBEDDING_BIAS_WEIGHTS"
+    data_type: TYPE_FP32
     dims: [ -1 ]
+    optional: true
   }
 ]
 output [
@@ -70,9 +84,14 @@ output [
     data_type: TYPE_INT32
     dims: [ 2, -1 ]
   },
+  {
+    name: "EMBEDDING_BIAS"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
   {
     name: "REQUEST_OUTPUT_LEN"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ -1 ]
   }
 ]
@@ -80,7 +99,7 @@ output [
 parameters {
   key: "tokenizer_dir"
   value: {
-    string_value: "
+    string_value: "/data/llama/Llama-2-70b-chat-hf/"
   }
 }
 
@@ -91,9 +110,16 @@ parameters {
   }
 }
 
+parameters {
+  key: "add_special_tokens"
+  value: {
+    string_value: "False"
+  }
+}
+
 instance_group [
   {
-    count:
+    count: 4
     kind: KIND_CPU
   }
 ]
tensorrt_llm/1/config.json CHANGED

@@ -47,6 +47,7 @@
     "tokens_per_block": 128,
     "use_custom_all_reduce": false,
     "use_paged_context_fmha": false,
+    "use_context_fmha_for_generation": false,
     "weight_only_groupwise_quant_matmul_plugin": false,
     "weight_only_quant_matmul_plugin": false
 }
tensorrt_llm/config.pbtxt CHANGED

@@ -26,17 +26,23 @@
 
 name: "tensorrt_llm"
 backend: "tensorrtllm"
-max_batch_size:
+max_batch_size: 64
 
 model_transaction_policy {
   decoupled: True
 }
 
+dynamic_batching {
+  preferred_batch_size: [ 1 ]
+  max_queue_delay_microseconds: 1000
+}
+
 input [
   {
     name: "input_ids"
     data_type: TYPE_INT32
     dims: [ -1 ]
+    allow_ragged_batch: true
   },
   {
     name: "input_lengths"
@@ -46,26 +52,54 @@ input [
   },
   {
     name: "request_output_len"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ 1 ]
   },
+  {
+    name: "draft_input_ids"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
   {
     name: "end_id"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
   },
   {
     name: "pad_id"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "stop_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  {
+    name: "bad_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  {
+    name: "embedding_bias"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
   {
     name: "beam_width"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
@@ -79,7 +113,7 @@ input [
   },
   {
     name: "runtime_top_k"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
@@ -107,7 +141,7 @@ input [
   },
   {
     name: "min_length"
-    data_type:
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
@@ -126,6 +160,13 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "return_log_probs"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "stop"
     data_type: TYPE_BOOL
@@ -137,6 +178,20 @@ input [
     data_type: TYPE_BOOL
     dims: [ 1 ]
     optional: true
+  },
+  {
+    name: "prompt_embedding_table"
+    data_type: TYPE_FP16
+    dims: [ -1, -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  {
+    name: "prompt_vocab_size"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
   }
 ]
 output [
@@ -144,6 +199,21 @@ output [
     name: "output_ids"
     data_type: TYPE_INT32
     dims: [ -1, -1 ]
+  },
+  {
+    name: "sequence_length"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
+  {
+    name: "cum_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "output_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
   }
 ]
 instance_group [
@@ -173,13 +243,19 @@ parameters: {
 parameters: {
   key: "gpt_model_path"
   value: {
-    string_value: "
+    string_value: "/data/tgi-data/yessen/Llama-2-70b-chat-hf-trt-fp8/tensorrt_llm/1"
   }
 }
 parameters: {
   key: "max_tokens_in_paged_kv_cache"
   value: {
-    string_value: "
+    string_value: "40000"
+  }
+}
+parameters: {
+  key: "max_attention_window_size"
+  value: {
+    string_value: "4096"
   }
 }
 parameters: {
@@ -195,20 +271,26 @@ parameters: {
   }
 }
 parameters: {
-  key: "
+  key: "max_num_sequences"
   value: {
-    string_value: "
+    string_value: "64"
   }
 }
 parameters: {
-  key: "
+  key: "enable_trt_overlap"
   value: {
-    string_value: "
+    string_value: "false"
   }
 }
 parameters: {
-  key: "
+  key: "exclude_input_in_output"
+  value: {
+    string_value: "true"
+  }
+}
+parameters: {
+  key: "enable_kv_cache_reuse"
   value: {
-    string_value: "${
+    string_value: "${enable_kv_cache_reuse}"
   }
 }
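One value above, ${enable_kv_cache_reuse}, is still an unexpanded template placeholder and must be substituted before Triton will load the model (the tensorrtllm_backend repository ships a fill_template.py tool for this). A dependency-free sketch of the same substitution, illustrative only:

from pathlib import Path
from string import Template

def fill_placeholders(pbtxt_path: str, **values: str) -> None:
    # safe_substitute fills known ${...} placeholders and leaves unknown ones untouched.
    text = Path(pbtxt_path).read_text()
    Path(pbtxt_path).write_text(Template(text).safe_substitute(**values))

fill_placeholders("tensorrt_llm/config.pbtxt", enable_kv_cache_reuse="false")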