yessenzhar committed on
Commit
dd20dba
1 Parent(s): 2d64b65

update models for newer trt 0.6.1 version

ensemble/config.pbtxt CHANGED
@@ -26,7 +26,7 @@

 name: "ensemble"
 platform: "ensemble"
-max_batch_size: 128
+max_batch_size: 64
 input [
   {
     name: "text_input"
@@ -35,34 +35,36 @@ input [
   },
   {
     name: "max_tokens"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ -1 ]
   },
   {
     name: "bad_words"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "stop_words"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "end_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
    dims: [ 1 ]
     optional: true
   },
   {
     name: "pad_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
   {
     name: "top_k"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -92,7 +94,7 @@ input [
   },
   {
     name: "min_length"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -108,9 +110,15 @@ input [
     dims: [ 1 ]
     optional: true
   },
+  {
+    name: "return_log_probs"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "beam_width"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -119,18 +127,47 @@ input [
     data_type: TYPE_BOOL
     dims: [ 1 ]
     optional: true
+  },
+  {
+    name: "prompt_embedding_table"
+    data_type: TYPE_FP16
+    dims: [ -1, -1 ]
+    optional: true
+  },
+  {
+    name: "prompt_vocab_size"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "embedding_bias_words"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+    optional: true
+  },
+  {
+    name: "embedding_bias_weights"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+    optional: true
   }
 ]
 output [
   {
     name: "text_output"
     data_type: TYPE_STRING
-    dims: [ -1, -1 ]
+    dims: [ -1 ]
   },
   {
-    name: "output_tokens"
-    data_type: TYPE_UINT32
+    name: "cum_log_probs"
+    data_type: TYPE_FP32
     dims: [ -1 ]
+  },
+  {
+    name: "output_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
   }
 ]
 ensemble_scheduling {
@@ -154,6 +191,14 @@ ensemble_scheduling {
         key: "STOP_WORDS_DICT"
         value: "stop_words"
       }
+      input_map {
+        key: "EMBEDDING_BIAS_WORDS"
+        value: "embedding_bias_words"
+      }
+      input_map {
+        key: "EMBEDDING_BIAS_WEIGHTS"
+        value: "embedding_bias_weights"
+      }
       output_map {
        key: "REQUEST_INPUT_LEN"
        value: "_REQUEST_INPUT_LEN"
@@ -166,6 +211,18 @@ ensemble_scheduling {
        key: "REQUEST_OUTPUT_LEN"
        value: "_REQUEST_OUTPUT_LEN"
      }
+      output_map {
+        key: "STOP_WORDS_IDS"
+        value: "_STOP_WORDS_IDS"
+      }
+      output_map {
+        key: "BAD_WORDS_IDS"
+        value: "_BAD_WORDS_IDS"
+      }
+      output_map {
+        key: "EMBEDDING_BIAS"
+        value: "_EMBEDDING_BIAS"
+      }
    },
    {
      model_name: "tensorrt_llm"
@@ -190,6 +247,10 @@ ensemble_scheduling {
        key: "pad_id"
        value: "pad_id"
      }
+      input_map {
+        key: "embedding_bias"
+        value: "_EMBEDDING_BIAS"
+      }
      input_map {
        key: "runtime_top_k"
        value: "top_k"
@@ -222,6 +283,10 @@ ensemble_scheduling {
        key: "random_seed"
        value: "random_seed"
      }
+      input_map {
+        key: "return_log_probs"
+        value: "return_log_probs"
+      }
      input_map {
        key: "beam_width"
        value: "beam_width"
@@ -230,10 +295,38 @@ ensemble_scheduling {
        key: "streaming"
        value: "stream"
      }
+      input_map {
+        key: "prompt_embedding_table"
+        value: "prompt_embedding_table"
+      }
+      input_map {
+        key: "prompt_vocab_size"
+        value: "prompt_vocab_size"
+      }
+      input_map {
+        key: "stop_words_list"
+        value: "_STOP_WORDS_IDS"
+      }
+      input_map {
+        key: "bad_words_list"
+        value: "_BAD_WORDS_IDS"
+      }
      output_map {
        key: "output_ids"
        value: "_TOKENS_BATCH"
      }
+      output_map {
+        key: "sequence_length"
+        value: "_SEQUENCE_LENGTH"
+      }
+      output_map {
+        key: "cum_log_probs"
+        value: "_CUM_LOG_PROBS"
+      }
+      output_map {
+        key: "output_log_probs"
+        value: "_OUTPUT_LOG_PROBS"
+      }
    },
    {
      model_name: "postprocessing"
@@ -242,13 +335,29 @@ ensemble_scheduling {
        key: "TOKENS_BATCH"
        value: "_TOKENS_BATCH"
      }
+      input_map {
+        key: "CUM_LOG_PROBS"
+        value: "_CUM_LOG_PROBS"
+      }
+      input_map {
+        key: "OUTPUT_LOG_PROBS"
+        value: "_OUTPUT_LOG_PROBS"
+      }
+      input_map {
+        key: "SEQUENCE_LENGTH"
+        value: "_SEQUENCE_LENGTH"
+      }
      output_map {
        key: "OUTPUT"
        value: "text_output"
      }
      output_map {
-        key: "OUTPUT_LENS"
-        value: "output_tokens"
+        key: "OUT_OUTPUT_LOG_PROBS"
+        value: "output_log_probs"
+      }
+      output_map {
+        key: "OUT_CUM_LOG_PROBS"
+        value: "cum_log_probs"
      }
    }
  ]
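
Note: the "tensorrt_llm" step stays decoupled, so the ensemble has to be called over the streaming gRPC API. A minimal client sketch (not part of this commit), assuming tritonclient is installed and the server listens on localhost:8001; it exercises the new optional return_log_probs input and the cum_log_probs / output_log_probs outputs declared above:

import numpy as np
import tritonclient.grpc as grpcclient

def prepare(name, dtype, data):
    # Wrap a numpy array as a Triton input tensor.
    t = grpcclient.InferInput(name, list(data.shape), dtype)
    t.set_data_from_numpy(data)
    return t

inputs = [
    prepare("text_input", "BYTES", np.array([["Hello"]], dtype=object)),
    prepare("max_tokens", "INT32", np.array([[64]], dtype=np.int32)),
    prepare("return_log_probs", "BOOL", np.array([[True]])),
    prepare("stream", "BOOL", np.array([[False]])),
]
outputs = [grpcclient.InferRequestedOutput(n)
           for n in ("text_output", "cum_log_probs", "output_log_probs")]

results = []
with grpcclient.InferenceServerClient("localhost:8001") as client:
    # Decoupled models must be invoked through a stream.
    client.start_stream(callback=lambda res, err: results.append((res, err)))
    client.async_stream_infer("ensemble", inputs, outputs=outputs)
for res, err in results:
    if err is None:
        print(res.as_numpy("text_output"), res.as_numpy("cum_log_probs"))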
postprocessing/1/model.py CHANGED
@@ -57,13 +57,18 @@ class TritonPythonModel:
             'string_value']
         tokenizer_type = model_config['parameters']['tokenizer_type'][
             'string_value']
+        self.skip_special_tokens = model_config['parameters'].get(
+            'skip_special_tokens',
+            {'string_value': "true"})['string_value'].lower() in [
+                'true', '1', 't', 'y', 'yes'
+            ]

         if tokenizer_type == 't5':
             self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
                                          padding_side='left')
         elif tokenizer_type == 'auto':
-            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
-                                                           padding_side='left')
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer_dir, padding_side='left', trust_remote_code=True)
         elif tokenizer_type == 'llama':
             self.tokenizer = LlamaTokenizer.from_pretrained(
                 tokenizer_dir, legacy=False, padding_side='left')
@@ -72,8 +77,6 @@ class TritonPythonModel:
                 f'Unexpected tokenizer type: {tokenizer_type}')
         self.tokenizer.pad_token = self.tokenizer.eos_token

-        self._init_token_map()
-
         # Parse model output configs
         output_config = pb_utils.get_output_config_by_name(
             model_config, "OUTPUT")
@@ -84,20 +87,6 @@ class TritonPythonModel:
         output_lens_config = pb_utils.get_output_config_by_name(
             model_config, "OUTPUT_LENS")

-        # Convert Triton types to numpy types
-        self.output_lens_dtype = pb_utils.triton_string_to_numpy(
-            output_lens_config['data_type'])
-
-    def _init_token_map(self):
-        v = self.tokenizer.get_vocab()
-        self.token_map = [None] * len(v)
-        for k, val in v.items():
-            self.token_map[val] = k
-
-        for i in range(len(v)):
-            if self.token_map[i] is None:
-                print("error %s" % i)
-
     def execute(self, requests):
         """`execute` must be implemented in every Python model. `execute`
         function receives a list of pb_utils.InferenceRequest as the only
@@ -127,21 +116,37 @@ class TritonPythonModel:
             tokens_batch = pb_utils.get_input_tensor_by_name(
                 request, 'TOKENS_BATCH').as_numpy()

+            # Get sequence lengths
+            sequence_lengths = pb_utils.get_input_tensor_by_name(
+                request, 'SEQUENCE_LENGTH').as_numpy()
+
+            # Get cum log probs
+            cum_log_probs = pb_utils.get_input_tensor_by_name(
+                request, 'CUM_LOG_PROBS').as_numpy()
+
+            # Get output log probs
+            output_log_probs = pb_utils.get_input_tensor_by_name(
+                request, 'OUTPUT_LOG_PROBS').as_numpy()
+
             # Reshape Input
             # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
             # tokens_batch = tokens_batch.T

             # Postprocessing output data.
-            outputs, output_lens = self._postprocessing(tokens_batch)
+            outputs = self._postprocessing(tokens_batch, sequence_lengths)

             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
             output_tensor = pb_utils.Tensor(
                 'OUTPUT',
                 np.array(outputs).astype(self.output_dtype))
-            output_lens_tensor = pb_utils.Tensor(
-                'OUTPUT_LENS',
-                np.array(output_lens).astype(self.output_lens_dtype))
+
+            out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
+                                                cum_log_probs)
+
+            out_output_log_probs = pb_utils.Tensor('OUT_OUTPUT_LOG_PROBS',
+                                                   output_log_probs)

             # Create InferenceResponse. You can set an error here in case
             # there was a problem with handling this inference request.
@@ -150,8 +155,9 @@ class TritonPythonModel:
             #
             # pb_utils.InferenceResponse(
             #     output_tensors=..., TritonError("An error occurred"))
-            inference_response = pb_utils.InferenceResponse(
-                output_tensors=[output_tensor, output_lens_tensor])
+            inference_response = pb_utils.InferenceResponse(output_tensors=[
+                output_tensor, out_cum_log_probs, out_output_log_probs
+            ])
             responses.append(inference_response)

         # You should return a list of pb_utils.InferenceResponse. Length
@@ -165,24 +171,13 @@ class TritonPythonModel:
         """
         print('Cleaning up...')

-    def _single_token_decode(self, token):
-        st = self.token_map[token]
-        if st[0] == '▁':
-            return " " + st[1:]
-        else:
-            return self.tokenizer.decode([token])
-
-    def _postprocessing(self, tokens_batch):
+    def _postprocessing(self, tokens_batch, sequence_lengths):
         outputs = []
-        output_lens = []
-        for beam_tokens in tokens_batch:
-            total_len = 0
-            for tokens in beam_tokens:
-                if len(tokens) == 1:
-                    output = self._single_token_decode(tokens[0])
-                else:
-                    output = self.tokenizer.decode(tokens)
+        for batch_idx, beam_tokens in enumerate(tokens_batch):
+            for beam_idx, tokens in enumerate(beam_tokens):
+                seq_len = sequence_lengths[batch_idx][beam_idx]
+                output = self.tokenizer.decode(
+                    tokens[:seq_len],
+                    skip_special_tokens=self.skip_special_tokens)
                 outputs.append(output.encode('utf8'))
-                total_len += len(tokens)
-            output_lens.append(total_len)
-        return outputs, output_lens
+        return outputs
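Note: the rewritten _postprocessing only decodes the first seq_len tokens of each beam, so right-padding no longer leaks into the decoded text. A standalone toy check of that trimming logic (token IDs made up for illustration):

import numpy as np

tokens_batch = np.array([[[12, 7, 99, 0, 0]]])  # [batch, beam, max_len]; trailing 0s are padding
sequence_lengths = np.array([[3]])              # [batch, beam]

for batch_idx, beam_tokens in enumerate(tokens_batch):
    for beam_idx, tokens in enumerate(beam_tokens):
        seq_len = sequence_lengths[batch_idx][beam_idx]
        print(tokens[:seq_len])                 # -> [12  7 99]; only these reach tokenizer.decode()
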
postprocessing/config.pbtxt CHANGED
@@ -32,17 +32,42 @@ input [
     name: "TOKENS_BATCH"
     data_type: TYPE_INT32
     dims: [ -1, -1 ]
+  },
+  {
+    name: "SEQUENCE_LENGTH"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
+  {
+    name: "CUM_LOG_PROBS"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "OUTPUT_LOG_PROBS"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
   }
 ]
 output [
   {
     name: "OUTPUT"
     data_type: TYPE_STRING
+    dims: [ -1 ]
+  },
+  {
+    name: "OUT_CUM_LOG_PROBS"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "OUT_OUTPUT_LOG_PROBS"
+    data_type: TYPE_FP32
     dims: [ -1, -1 ]
   },
   {
     name: "OUTPUT_LENS"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ -1 ]
   }
 ]
@@ -50,7 +75,7 @@ output [
 parameters {
   key: "tokenizer_dir"
   value: {
-    string_value: "${tokenizer_dir}"
+    string_value: "/data/llama/Llama-2-70b-chat-hf/"
   }
 }

@@ -61,9 +86,16 @@ parameters {
   }
 }

+parameters {
+  key: "skip_special_tokens"
+  value: {
+    string_value: "True"
+  }
+}
+
 instance_group [
   {
-    count: 1
+    count: 4
     kind: KIND_CPU
   }
 ]
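
Note: skip_special_tokens is declared as a plain string parameter; the model.py change above lower-cases it before testing membership, so the "True" written here is accepted. A one-line check of that parsing rule:

print("True".lower() in ['true', '1', 't', 'y', 'yes'])  # -> True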
preprocessing/1/model.py CHANGED
@@ -24,14 +24,11 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-import csv
 import json
 from typing import List

 import numpy as np
-import torch
 import triton_python_backend_utils as pb_utils
-from torch.nn.utils.rnn import pad_sequence
 from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer


@@ -61,13 +58,18 @@ class TritonPythonModel:
             'string_value']
         tokenizer_type = model_config['parameters']['tokenizer_type'][
             'string_value']
+        self.add_special_tokens = model_config['parameters'].get(
+            'add_special_tokens',
+            {'string_value': "false"})['string_value'].lower() in [
+                'true', '1', 't', 'y', 'yes'
+            ]

         if tokenizer_type == 't5':
             self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
                                          padding_side='left')
         elif tokenizer_type == 'auto':
-            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
-                                                           padding_side='left')
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer_dir, padding_side='left', trust_remote_code=True)
         elif tokenizer_type == 'llama':
             self.tokenizer = LlamaTokenizer.from_pretrained(
                 tokenizer_dir, legacy=False, padding_side='left')
@@ -80,17 +82,26 @@ class TritonPythonModel:
                 add_special_tokens=False)[0]

         # Parse model output configs and convert Triton types to numpy types
-        input_names = [
+        output_names = [
             "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS"
         ]
+        input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
         for input_name in input_names:
             setattr(
                 self,
                 input_name.lower() + "_dtype",
                 pb_utils.triton_string_to_numpy(
-                    pb_utils.get_output_config_by_name(
+                    pb_utils.get_input_config_by_name(
                         model_config, input_name)['data_type']))

+        for output_name in output_names:
+            setattr(
+                self,
+                output_name.lower() + "_dtype",
+                pb_utils.triton_string_to_numpy(
+                    pb_utils.get_output_config_by_name(
+                        model_config, output_name)['data_type']))
+
     def execute(self, requests):
         """`execute` must be implemented in every Python model. `execute`
         function receives a list of pb_utils.InferenceRequest as the only
@@ -115,48 +126,73 @@ class TritonPythonModel:

         # Every Python backend must iterate over everyone of the requests
         # and create a pb_utils.InferenceResponse for each of them.
+        logger = pb_utils.Logger
         for idx, request in enumerate(requests):
             # Get input tensors
             query = pb_utils.get_input_tensor_by_name(request,
                                                       'QUERY').as_numpy()
+            batch_dim = query.shape[0]
+            if batch_dim != 1:
+                err_str = "Inflight batching backend expects requests with batch size of 1."
+                logger.log_error(err_str)
+                responses.append(
+                    pb_utils.InferenceResponse(
+                        output_tensors=[],
+                        error=pb_utils.TritonError(err_str)))
+                continue
+
             request_output_len = pb_utils.get_input_tensor_by_name(
                 request, 'REQUEST_OUTPUT_LEN').as_numpy()

             bad_words_dict = pb_utils.get_input_tensor_by_name(
-                request, 'BAD_WORDS_DICT').as_numpy()
+                request, 'BAD_WORDS_DICT')
+            if bad_words_dict is not None:
+                bad_words_dict = bad_words_dict.as_numpy()
+
             stop_words_dict = pb_utils.get_input_tensor_by_name(
-                request, 'STOP_WORDS_DICT').as_numpy()
+                request, 'STOP_WORDS_DICT')
+            if stop_words_dict is not None:
+                stop_words_dict = stop_words_dict.as_numpy()
+
+            embedding_bias_words = pb_utils.get_input_tensor_by_name(
+                request, 'EMBEDDING_BIAS_WORDS')
+            if embedding_bias_words is not None:
+                embedding_bias_words = embedding_bias_words.as_numpy()
+
+            embedding_bias_weights = pb_utils.get_input_tensor_by_name(
+                request, 'EMBEDDING_BIAS_WEIGHTS')
+            if embedding_bias_weights is not None:
+                embedding_bias_weights = embedding_bias_weights.as_numpy()

             # Preprocessing input data.
             input_id, request_input_len = self._create_request(query)
             bad_words = self._to_word_list_format(bad_words_dict)
             stop_words = self._to_word_list_format(stop_words_dict)

+            embedding_bias = self._get_embedding_bias(
+                embedding_bias_words, embedding_bias_weights,
+                self.embedding_bias_weights_dtype)
+
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
             input_id_tensor = pb_utils.Tensor(
-                'INPUT_ID',
-                np.array(input_id).astype(self.input_id_dtype))
+                'INPUT_ID', input_id.astype(self.input_id_dtype))
             request_input_len_tensor = pb_utils.Tensor(
                 'REQUEST_INPUT_LEN',
-                np.array(request_input_len).astype(
-                    self.request_input_len_dtype))
+                request_input_len.astype(self.request_input_len_dtype))
             request_output_len_tensor = pb_utils.Tensor(
                 'REQUEST_OUTPUT_LEN', request_output_len)
             bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
             stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
                                                     stop_words)
+            embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
+                                                    embedding_bias)

-            # Create InferenceResponse. You can set an error here in case
-            # there was a problem with handling this inference request.
-            # Below is an example of how you can set errors in inference
-            # response:
-            #
-            # pb_utils.InferenceResponse(
-            #     output_tensors=..., TritonError("An error occurred"))
             inference_response = pb_utils.InferenceResponse(output_tensors=[
                 input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor,
-                request_input_len_tensor, request_output_len_tensor
+                request_input_len_tensor, request_output_len_tensor,
+                embedding_bias_tensor
             ])
             responses.append(inference_response)

@@ -176,44 +212,48 @@ class TritonPythonModel:
             query : batch string (2D numpy array)
         """
         start_ids = [
-            torch.IntTensor(self.tokenizer.encode(s[0].decode()))
+            np.array(
+                self.tokenizer.encode(
+                    s[0].decode(),
+                    add_special_tokens=self.add_special_tokens)).astype(int)
             for s in query
         ]
-        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
+        start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)

-        start_ids = pad_sequence(start_ids,
-                                 batch_first=True,
-                                 padding_value=self.pad_id)
-        # input_len = min(start_lengths)
-        #attn_mask = torch.ones((batch_size, input_len, input_len)).tril()
+        max_len = 0
+        for seq in start_ids:
+            max_len = max(max_len, seq.shape[0])
+        start_ids = np.stack([
+            np.pad(seq, (0, max_len - seq.shape[0]),
+                   'constant',
+                   constant_values=(0, self.pad_id)) for seq in start_ids
+        ])

         return start_ids, start_lengths

-    def _to_word_list_format(self, word_dict: List[List[str]]):
+    def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
         '''
-        format of word_dict
-            len(word_dict) should be same to batch_size
-            word_dict[i] means the words for batch i
-            len(word_dict[i]) must be 1, which means it only contains 1 string
-            This string can contains several sentences and split by ",".
-            For example, if word_dict[2] = " I am happy, I am sad", then this function will return
-            the ids for two short sentences " I am happy" and " I am sad".
+        word_lists format:
+            len(word_lists) == batch_size
+            word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
         '''
         assert self.tokenizer != None, "need to set tokenizer"

+        if word_lists is None:
+            # Return an empty array of shape (1,2,0)
+            return np.empty([1, 2, 0], dtype="int32")
+
         flat_ids = []
         offsets = []
-        for word_dict_item in word_dict:
+        for word_list in word_lists:
             item_flat_ids = []
             item_offsets = []

-            if isinstance(word_dict_item[0], bytes):
-                word_dict_item = [word_dict_item[0].decode()]
-
-            words = list(csv.reader(word_dict_item))[0]
-            for word in words:
-                ids = self.tokenizer.encode(word)
+            for word in word_list:
+                if isinstance(word, bytes):
+                    word = word.decode()

+                ids = self.tokenizer.encode(word, add_special_tokens=False)
                 if len(ids) == 0:
                     continue

@@ -233,3 +273,37 @@ class TritonPythonModel:

         return np.array([flat_ids, offsets], dtype="int32").transpose(
             (1, 0, 2))
+
+    def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
+                            bias_dtype):
+
+        assert self.tokenizer != None, "need to set tokenizer"
+
+        if embedding_bias_words is None or embedding_bias_weights is None:
+            return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype)
+
+        batch_embedding_bias = []
+        for words, weights in zip(embedding_bias_words,
+                                  embedding_bias_weights):
+
+            vocab_size = self.tokenizer.vocab_size
+            embedding_bias = [0.] * vocab_size
+
+            assert len(words) == len(
+                weights
+            ), "Embedding bias words must have same dimension as embedding bias weights"
+
+            for word, weight in zip(words, weights):
+                if isinstance(word, bytes):
+                    word = word.decode()
+                ids = self.tokenizer.encode(word)
+
+                if len(ids) == 0:
+                    continue
+
+                for id in ids:
+                    embedding_bias[id] += weight
+
+            batch_embedding_bias.append(np.array(embedding_bias))
+
+        return np.array(batch_embedding_bias, dtype=bias_dtype)
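
Note: _create_request now pads with plain numpy instead of torch's pad_sequence. A standalone sketch (made-up token IDs, pad_id assumed to be 2) showing the replacement still yields right-padded rows, matching pad_sequence(..., batch_first=True, padding_value=pad_id):

import numpy as np

pad_id = 2
start_ids = [np.array([5, 6, 7]), np.array([8])]

max_len = max(seq.shape[0] for seq in start_ids)
padded = np.stack([
    np.pad(seq, (0, max_len - seq.shape[0]),
           'constant', constant_values=(0, pad_id))
    for seq in start_ids
])
print(padded)  # [[5 6 7]
               #  [8 2 2]]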
preprocessing/config.pbtxt CHANGED
@@ -33,20 +33,34 @@ input [
     data_type: TYPE_STRING
     dims: [ -1 ]
   },
+  {
+    name: "REQUEST_OUTPUT_LEN"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
   {
     name: "BAD_WORDS_DICT"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "STOP_WORDS_DICT"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
-    name: "REQUEST_OUTPUT_LEN"
-    data_type: TYPE_UINT32
+    name: "EMBEDDING_BIAS_WORDS"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+    optional: true
+  },
+  {
+    name: "EMBEDDING_BIAS_WEIGHTS"
+    data_type: TYPE_FP32
     dims: [ -1 ]
+    optional: true
   }
 ]
 output [
@@ -70,9 +84,14 @@ output [
     data_type: TYPE_INT32
     dims: [ 2, -1 ]
   },
+  {
+    name: "EMBEDDING_BIAS"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
   {
     name: "REQUEST_OUTPUT_LEN"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ -1 ]
   }
 ]
@@ -80,7 +99,7 @@ output [
 parameters {
   key: "tokenizer_dir"
   value: {
-    string_value: "${tokenizer_dir}"
+    string_value: "/data/llama/Llama-2-70b-chat-hf/"
   }
 }

@@ -91,9 +110,16 @@ parameters {
   }
 }

+parameters {
+  key: "add_special_tokens"
+  value: {
+    string_value: "False"
+  }
+}
+
 instance_group [
   {
-    count: 1
+    count: 4
     kind: KIND_CPU
   }
 ]
tensorrt_llm/1/config.json CHANGED
@@ -47,6 +47,7 @@
     "tokens_per_block": 128,
     "use_custom_all_reduce": false,
     "use_paged_context_fmha": false,
+    "use_context_fmha_for_generation": false,
     "weight_only_groupwise_quant_matmul_plugin": false,
     "weight_only_quant_matmul_plugin": false
   }
tensorrt_llm/config.pbtxt CHANGED
@@ -26,17 +26,23 @@

 name: "tensorrt_llm"
 backend: "tensorrtllm"
-max_batch_size: 128
+max_batch_size: 64

 model_transaction_policy {
   decoupled: True
 }

+dynamic_batching {
+  preferred_batch_size: [ 1 ]
+  max_queue_delay_microseconds: 1000
+}
+
 input [
   {
     name: "input_ids"
     data_type: TYPE_INT32
     dims: [ -1 ]
+    allow_ragged_batch: true
   },
   {
     name: "input_lengths"
@@ -46,26 +52,54 @@ input [
   },
   {
     name: "request_output_len"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
   },
+  {
+    name: "draft_input_ids"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
   {
     name: "end_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
   },
   {
     name: "pad_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "stop_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  {
+    name: "bad_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  {
+    name: "embedding_bias"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
   {
     name: "beam_width"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
@@ -79,7 +113,7 @@ input [
   },
   {
     name: "runtime_top_k"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
@@ -107,7 +141,7 @@ input [
   },
   {
     name: "min_length"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
@@ -126,6 +160,13 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "return_log_probs"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "stop"
     data_type: TYPE_BOOL
@@ -137,6 +178,20 @@ input [
     data_type: TYPE_BOOL
     dims: [ 1 ]
     optional: true
+  },
+  {
+    name: "prompt_embedding_table"
+    data_type: TYPE_FP16
+    dims: [ -1, -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  {
+    name: "prompt_vocab_size"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
   }
 ]
 output [
@@ -144,6 +199,21 @@ output [
     name: "output_ids"
     data_type: TYPE_INT32
     dims: [ -1, -1 ]
+  },
+  {
+    name: "sequence_length"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
+  {
+    name: "cum_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "output_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
   }
 ]
 instance_group [
@@ -173,13 +243,19 @@ parameters: {
 parameters: {
   key: "gpt_model_path"
   value: {
-    string_value: "${gpt_model_path}"
+    string_value: "/data/tgi-data/yessen/Llama-2-70b-chat-hf-trt-fp8/tensorrt_llm/1"
   }
 }
 parameters: {
   key: "max_tokens_in_paged_kv_cache"
   value: {
-    string_value: "${max_tokens_in_paged_kv_cache}"
+    string_value: "40000"
+  }
+}
+parameters: {
+  key: "max_attention_window_size"
+  value: {
+    string_value: "4096"
   }
 }
 parameters: {
@@ -195,20 +271,26 @@ parameters: {
   }
 }
 parameters: {
-  key: "exclude_input_in_output"
+  key: "max_num_sequences"
   value: {
-    string_value: "true"
+    string_value: "64"
   }
 }
 parameters: {
-  key: "max_num_sequences"
+  key: "enable_trt_overlap"
   value: {
-    string_value: "${max_num_sequences}"
+    string_value: "false"
   }
 }
 parameters: {
-  key: "enable_trt_overlap"
+  key: "exclude_input_in_output"
+  value: {
+    string_value: "true"
+  }
+}
+parameters: {
+  key: "enable_kv_cache_reuse"
   value: {
-    string_value: "${enable_trt_overlap}"
+    string_value: "${enable_kv_cache_reuse}"
   }
 }
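
Note: enable_kv_cache_reuse is the one remaining ${...} placeholder, so it still has to be substituted before Triton can load this config. A minimal stand-in for the repo's template-filling step (the relative path and chosen value are assumptions); Python's string.Template understands the same ${name} syntax:

from pathlib import Path
from string import Template

path = Path("tensorrt_llm/config.pbtxt")
# safe_substitute fills enable_kv_cache_reuse and leaves any other ${...} untouched.
filled = Template(path.read_text()).safe_substitute(enable_kv_cache_reuse="false")
path.write_text(filled)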