yessenzhar
committed on
Commit
•
a83b588
1
Parent(s):
210dc9f
add smaller files
Browse files- .gitignore +1 -0
- ensemble/1/.tmp +0 -0
- ensemble/config.pbtxt +255 -0
- postprocessing/1/model.py +188 -0
- postprocessing/config.pbtxt +69 -0
- preprocessing/1/model.py +235 -0
- preprocessing/config.pbtxt +99 -0
- tensorrt_llm/1/config.json +47 -0
- tensorrt_llm/config.pbtxt +208 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*.pyc
|
ensemble/1/.tmp
ADDED
File without changes
|
ensemble/config.pbtxt
ADDED
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
name: "ensemble"
platform: "ensemble"
max_batch_size: 128
input [
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "max_tokens"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
    name: "bad_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "stop_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "end_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "length_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  }
]
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1, -1 ]
  },
  {
    name: "output_tokens"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  }
]
ensemble_scheduling {
  step [
    {
      model_name: "preprocessing"
      model_version: -1
      input_map {
        key: "QUERY"
        value: "text_input"
      }
      input_map {
        key: "REQUEST_OUTPUT_LEN"
        value: "max_tokens"
      }
      input_map {
        key: "BAD_WORDS_DICT"
        value: "bad_words"
      }
      input_map {
        key: "STOP_WORDS_DICT"
        value: "stop_words"
      }
      output_map {
        key: "REQUEST_INPUT_LEN"
        value: "_REQUEST_INPUT_LEN"
      }
      output_map {
        key: "INPUT_ID"
        value: "_INPUT_ID"
      }
      output_map {
        key: "REQUEST_OUTPUT_LEN"
        value: "_REQUEST_OUTPUT_LEN"
      }
    },
    {
      model_name: "tensorrt_llm"
      model_version: -1
      input_map {
        key: "input_ids"
        value: "_INPUT_ID"
      }
      input_map {
        key: "input_lengths"
        value: "_REQUEST_INPUT_LEN"
      }
      input_map {
        key: "request_output_len"
        value: "_REQUEST_OUTPUT_LEN"
      }
      input_map {
        key: "end_id"
        value: "end_id"
      }
      input_map {
        key: "pad_id"
        value: "pad_id"
      }
      input_map {
        key: "runtime_top_k"
        value: "top_k"
      }
      input_map {
        key: "runtime_top_p"
        value: "top_p"
      }
      input_map {
        key: "temperature"
        value: "temperature"
      }
      input_map {
        key: "len_penalty"
        value: "length_penalty"
      }
      input_map {
        key: "repetition_penalty"
        value: "repetition_penalty"
      }
      input_map {
        key: "min_length"
        value: "min_length"
      }
      input_map {
        key: "presence_penalty"
        value: "presence_penalty"
      }
      input_map {
        key: "random_seed"
        value: "random_seed"
      }
      input_map {
        key: "beam_width"
        value: "beam_width"
      }
      input_map {
        key: "streaming"
        value: "stream"
      }
      output_map {
        key: "output_ids"
        value: "_TOKENS_BATCH"
      }
    },
    {
      model_name: "postprocessing"
      model_version: -1
      input_map {
        key: "TOKENS_BATCH"
        value: "_TOKENS_BATCH"
      }
      output_map {
        key: "OUTPUT"
        value: "text_output"
      }
      output_map {
        key: "OUTPUT_LENS"
        value: "output_tokens"
      }
    }
  ]
}
|
postprocessing/1/model.py
ADDED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
import json
|
28 |
+
|
29 |
+
import numpy as np
|
30 |
+
import triton_python_backend_utils as pb_utils
|
31 |
+
from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
|
32 |
+
|
33 |
+
|
34 |
+
class TritonPythonModel:
    """Triton Python backend model that decodes batches of generated token
    ids back into UTF-8 text using a HuggingFace tokenizer.

    Every Python model must use exactly this class name.
    """

    def initialize(self, args):
        """Called once when the model is being loaded.

        Parameters
        ----------
        args : dict
            Both keys and values are strings. Relevant key:
            * model_config: a JSON string containing the model configuration,
              whose `parameters` section supplies `tokenizer_dir` and
              `tokenizer_type`.

        Raises
        ------
        AttributeError
            If `tokenizer_type` is not one of 't5', 'auto', 'llama'.
        """
        # Parse the model configuration for tokenizer location and type.
        model_config = json.loads(args['model_config'])
        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
            'string_value']
        tokenizer_type = model_config['parameters']['tokenizer_type'][
            'string_value']

        if tokenizer_type == 't5':
            self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
                                         padding_side='left')
        elif tokenizer_type == 'auto':
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                           padding_side='left')
        elif tokenizer_type == 'llama':
            self.tokenizer = LlamaTokenizer.from_pretrained(
                tokenizer_dir, legacy=False, padding_side='left')
        else:
            raise AttributeError(
                f'Unexpected tokenizer type: {tokenizer_type}')
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self._init_token_map()

        # Parse model output configs and convert Triton types to numpy types.
        output_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT")
        self.output_dtype = pb_utils.triton_string_to_numpy(
            output_config['data_type'])
        output_lens_config = pb_utils.get_output_config_by_name(
            model_config, "OUTPUT_LENS")
        self.output_lens_dtype = pb_utils.triton_string_to_numpy(
            output_lens_config['data_type'])

    def _init_token_map(self):
        """Build `self.token_map`, a list mapping token id -> vocab string,
        for fast single-token decoding in `_single_token_decode`."""
        vocab = self.tokenizer.get_vocab()
        self.token_map = [None] * len(vocab)
        for token, token_id in vocab.items():
            self.token_map[token_id] = token

        # Report any ids with no vocabulary entry (gaps in the vocab).
        for i in range(len(vocab)):
            if self.token_map[i] is None:
                print("error %s" % i)

    def execute(self, requests):
        """Decode each request's TOKENS_BATCH into OUTPUT strings and
        OUTPUT_LENS token counts.

        Parameters
        ----------
        requests : list
            A list of pb_utils.InferenceRequest.

        Returns
        -------
        list
            One pb_utils.InferenceResponse per request, in the same order.
        """
        responses = []

        # One pb_utils.InferenceResponse must be created for every request.
        for request in requests:
            # Get input tensors.
            tokens_batch = pb_utils.get_input_tensor_by_name(
                request, 'TOKENS_BATCH').as_numpy()

            # Postprocess (decode) the token batch.
            outputs, output_lens = self._postprocessing(tokens_batch)

            # Wrap results in pb_utils.Tensor objects using the dtypes
            # resolved from the model config in initialize().
            output_tensor = pb_utils.Tensor(
                'OUTPUT',
                np.array(outputs).astype(self.output_dtype))
            output_lens_tensor = pb_utils.Tensor(
                'OUTPUT_LENS',
                np.array(output_lens).astype(self.output_lens_dtype))

            # An error could be attached here via
            # pb_utils.InferenceResponse(output_tensors=..., TritonError(...)).
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[output_tensor, output_lens_tensor])
            responses.append(inference_response)

        # The returned list length must match len(requests).
        return responses

    def finalize(self):
        """Called once when the model is being unloaded; nothing to release."""
        print('Cleaning up...')

    def _single_token_decode(self, token):
        """Decode one token id via the vocab map, turning a leading
        SentencePiece '▁' marker back into a space."""
        st = self.token_map[token]
        if st[0] == '▁':
            return " " + st[1:]
        return st

    def _postprocessing(self, tokens_batch):
        """Decode a batch of beams of token-id sequences.

        Returns
        -------
        (list, list)
            A flat list of UTF-8 encoded strings (one per beam) and a list
            with the total decoded token count per batch item.
        """
        outputs = []
        output_lens = []
        for beam_tokens in tokens_batch:
            total_len = 0
            for tokens in beam_tokens:
                if len(tokens) == 1:
                    # Single tokens go through the vocab map directly so
                    # the leading-space marker is preserved.
                    output = self._single_token_decode(tokens[0])
                else:
                    output = self.tokenizer.decode(tokens)
                # NOTE: removed a leftover `print(output)` debug statement
                # that spammed stdout on every decoded beam.
                outputs.append(output.encode('utf8'))
                total_len += len(tokens)
            output_lens.append(total_len)
        return outputs, output_lens
|
postprocessing/config.pbtxt
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
name: "postprocessing"
backend: "python"
max_batch_size: 128
input [
  {
    name: "TOKENS_BATCH"
    data_type: TYPE_INT32
    dims: [ -1, -1 ]
  }
]
output [
  {
    name: "OUTPUT"
    data_type: TYPE_STRING
    dims: [ -1, -1 ]
  },
  {
    name: "OUTPUT_LENS"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  }
]

parameters {
  key: "tokenizer_dir"
  value: {
    string_value: "/data/tgi-data/orig_llama"
  }
}

parameters {
  key: "tokenizer_type"
  value: {
    string_value: "llama"
  }
}

instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
|
preprocessing/1/model.py
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
#
|
3 |
+
# Redistribution and use in source and binary forms, with or without
|
4 |
+
# modification, are permitted provided that the following conditions
|
5 |
+
# are met:
|
6 |
+
# * Redistributions of source code must retain the above copyright
|
7 |
+
# notice, this list of conditions and the following disclaimer.
|
8 |
+
# * Redistributions in binary form must reproduce the above copyright
|
9 |
+
# notice, this list of conditions and the following disclaimer in the
|
10 |
+
# documentation and/or other materials provided with the distribution.
|
11 |
+
# * Neither the name of NVIDIA CORPORATION nor the names of its
|
12 |
+
# contributors may be used to endorse or promote products derived
|
13 |
+
# from this software without specific prior written permission.
|
14 |
+
#
|
15 |
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
|
16 |
+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
17 |
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
18 |
+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
19 |
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20 |
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
21 |
+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
22 |
+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
23 |
+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
24 |
+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25 |
+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
26 |
+
|
27 |
+
import csv
|
28 |
+
import json
|
29 |
+
from typing import List
|
30 |
+
|
31 |
+
import numpy as np
|
32 |
+
import torch
|
33 |
+
import triton_python_backend_utils as pb_utils
|
34 |
+
from torch.nn.utils.rnn import pad_sequence
|
35 |
+
from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
|
36 |
+
|
37 |
+
|
38 |
+
class TritonPythonModel:
    """Triton Python backend model that tokenizes incoming text queries and
    builds the tensors consumed by the downstream tensorrt_llm model.

    Every Python model must use exactly this class name.
    """

    def initialize(self, args):
        """Called once when the model is being loaded.

        Parameters
        ----------
        args : dict
            Both keys and values are strings. Relevant key:
            * model_config: a JSON string containing the model configuration,
              whose `parameters` section supplies `tokenizer_dir` and
              `tokenizer_type`.

        Raises
        ------
        AttributeError
            If `tokenizer_type` is not one of 't5', 'auto', 'llama'.
        """
        # Parse the model configuration for tokenizer location and type.
        model_config = json.loads(args['model_config'])
        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
            'string_value']
        tokenizer_type = model_config['parameters']['tokenizer_type'][
            'string_value']

        if tokenizer_type == 't5':
            self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
                                         padding_side='left')
        elif tokenizer_type == 'auto':
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                           padding_side='left')
        elif tokenizer_type == 'llama':
            self.tokenizer = LlamaTokenizer.from_pretrained(
                tokenizer_dir, legacy=False, padding_side='left')
        else:
            raise AttributeError(
                f'Unexpected tokenizer type: {tokenizer_type}')
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Token id used to pad ragged batches in _create_request.
        self.pad_id = self.tokenizer.encode(self.tokenizer.pad_token,
                                            add_special_tokens=False)[0]

        # Parse model output configs and convert Triton types to numpy
        # types, stored as attributes like `self.input_id_dtype`.
        input_names = [
            "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS"
        ]
        for input_name in input_names:
            setattr(
                self,
                input_name.lower() + "_dtype",
                pb_utils.triton_string_to_numpy(
                    pb_utils.get_output_config_by_name(
                        model_config, input_name)['data_type']))

    def execute(self, requests):
        """Tokenize each request's QUERY and emit the tensors expected by
        the downstream model.

        Parameters
        ----------
        requests : list
            A list of pb_utils.InferenceRequest.

        Returns
        -------
        list
            One pb_utils.InferenceResponse per request, in the same order.
        """
        responses = []

        # One pb_utils.InferenceResponse must be created for every request.
        for request in requests:
            # Get input tensors.
            query = pb_utils.get_input_tensor_by_name(request,
                                                      'QUERY').as_numpy()
            request_output_len = pb_utils.get_input_tensor_by_name(
                request, 'REQUEST_OUTPUT_LEN').as_numpy()

            bad_words_dict = pb_utils.get_input_tensor_by_name(
                request, 'BAD_WORDS_DICT').as_numpy()
            stop_words_dict = pb_utils.get_input_tensor_by_name(
                request, 'STOP_WORDS_DICT').as_numpy()

            # Preprocess: tokenize the query and the word lists.
            input_id, request_input_len = self._create_request(query)
            bad_words = self._to_word_list_format(bad_words_dict)
            stop_words = self._to_word_list_format(stop_words_dict)

            # Wrap results in pb_utils.Tensor objects using the dtypes
            # resolved from the model config in initialize().
            input_id_tensor = pb_utils.Tensor(
                'INPUT_ID',
                np.array(input_id).astype(self.input_id_dtype))
            request_input_len_tensor = pb_utils.Tensor(
                'REQUEST_INPUT_LEN',
                np.array(request_input_len).astype(
                    self.request_input_len_dtype))
            request_output_len_tensor = pb_utils.Tensor(
                'REQUEST_OUTPUT_LEN', request_output_len)
            bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
            stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
                                                    stop_words)

            # An error could be attached here via
            # pb_utils.InferenceResponse(output_tensors=..., TritonError(...)).
            inference_response = pb_utils.InferenceResponse(output_tensors=[
                input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor,
                request_input_len_tensor, request_output_len_tensor
            ])
            responses.append(inference_response)

        # The returned list length must match len(requests).
        return responses

    def finalize(self):
        """Called once when the model is being unloaded; nothing to release."""
        print('Cleaning up...')

    def _create_request(self, query):
        """Tokenize a batch of query strings.

        Parameters
        ----------
        query : numpy.ndarray
            Batch of byte strings (2D); query[i][0] is the i-th prompt.

        Returns
        -------
        (torch.Tensor, torch.IntTensor)
            A (batch, max_len) token-id matrix padded at the end with
            `self.pad_id`, and the per-item unpadded lengths with
            shape (batch, 1).
        """
        start_ids = [
            torch.IntTensor(self.tokenizer.encode(s[0].decode()))
            for s in query
        ]
        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])

        start_ids = pad_sequence(start_ids,
                                 batch_first=True,
                                 padding_value=self.pad_id)

        return start_ids, start_lengths

    def _to_word_list_format(self, word_dict: List[List[str]]):
        '''
        Convert per-batch word lists into a (batch, 2, max_len) int32 array.

        len(word_dict) should equal the batch size; word_dict[i] holds a
        single (byte) string that may contain several phrases separated by
        ",". For example, if word_dict[2] = " I am happy, I am sad", this
        function returns the ids for the two short sentences " I am happy"
        and " I am sad". Row [i][0] holds the flattened token ids padded
        with 0; row [i][1] holds the cumulative phrase end offsets padded
        with -1.
        '''
        assert self.tokenizer is not None, "need to set tokenizer"

        flat_ids = []
        offsets = []
        for word_dict_item in word_dict:
            item_flat_ids = []
            item_offsets = []

            if isinstance(word_dict_item[0], bytes):
                word_dict_item = [word_dict_item[0].decode()]

            # csv.reader splits the single string on commas (quote-aware).
            words = list(csv.reader(word_dict_item))[0]
            for word in words:
                ids = self.tokenizer.encode(word)

                # Skip phrases that tokenize to nothing.
                if len(ids) == 0:
                    continue

                item_flat_ids += ids
                item_offsets.append(len(ids))

            flat_ids.append(np.array(item_flat_ids))
            offsets.append(np.cumsum(np.array(item_offsets)))

        if not flat_ids:
            # Empty batch: the original `max()` over an empty sequence
            # raised ValueError; keep the (batch, 2, max_len) contract.
            return np.empty((0, 2, 1), dtype="int32")

        pad_to = max(1, max(len(ids) for ids in flat_ids))

        for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
            flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)),
                                 constant_values=0)
            offsets[i] = np.pad(offs, (0, pad_to - len(offs)),
                                constant_values=-1)

        return np.array([flat_ids, offsets], dtype="int32").transpose(
            (1, 0, 2))
|
preprocessing/config.pbtxt
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Triton model configuration for the "preprocessing" stage: a Python
# backend model (preprocessing/1/model.py) that tokenizes the incoming
# query string and bad/stop word lists into the integer tensors consumed
# by the tensorrt_llm model.
name: "preprocessing"
backend: "python"
max_batch_size: 128

# String-typed request inputs, tokenized by the Python model.
input [
    {
        name: "QUERY"
        data_type: TYPE_STRING
        dims: [ -1 ]
    },
    {
        name: "BAD_WORDS_DICT"
        data_type: TYPE_STRING
        dims: [ -1 ]
    },
    {
        name: "STOP_WORDS_DICT"
        data_type: TYPE_STRING
        dims: [ -1 ]
    },
    {
        name: "REQUEST_OUTPUT_LEN"
        data_type: TYPE_UINT32
        dims: [ -1 ]
    }
]
# Tokenized outputs; the [2, -1] word-list tensors pack flattened token
# ids in row 0 and cumulative phrase offsets in row 1.
output [
    {
        name: "INPUT_ID"
        data_type: TYPE_INT32
        dims: [ -1 ]
    },
    {
        name: "REQUEST_INPUT_LEN"
        data_type: TYPE_INT32
        dims: [ 1 ]
    },
    {
        name: "BAD_WORDS_IDS"
        data_type: TYPE_INT32
        dims: [ 2, -1 ]
    },
    {
        name: "STOP_WORDS_IDS"
        data_type: TYPE_INT32
        dims: [ 2, -1 ]
    },
    {
        name: "REQUEST_OUTPUT_LEN"
        data_type: TYPE_UINT32
        dims: [ -1 ]
    }
]

# NOTE(review): hard-coded local path — presumably read by model.py to
# load the tokenizer; confirm it exists on the deployment host.
parameters {
  key: "tokenizer_dir"
  value: {
    string_value: "/data/tgi-data/orig_llama"
  }
}

parameters {
  key: "tokenizer_type"
  value: {
    string_value: "llama"
  }
}

# Tokenization runs on CPU; a single instance.
instance_group [
    {
        count: 1
        kind: KIND_CPU
    }
]
|
tensorrt_llm/1/config.json
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"builder_config": {
|
3 |
+
"fp8": true,
|
4 |
+
"hidden_act": "silu",
|
5 |
+
"hidden_size": 8192,
|
6 |
+
"int8": false,
|
7 |
+
"max_batch_size": 64,
|
8 |
+
"max_input_len": 4096,
|
9 |
+
"max_num_tokens": null,
|
10 |
+
"max_output_len": 4096,
|
11 |
+
"max_position_embeddings": 4096,
|
12 |
+
"name": "llama",
|
13 |
+
"num_heads": 64,
|
14 |
+
"num_kv_heads": 8,
|
15 |
+
"num_layers": 80,
|
16 |
+
"parallel_build": true,
|
17 |
+
"pipeline_parallel": 1,
|
18 |
+
"precision": "float16",
|
19 |
+
"quant_mode": 384,
|
20 |
+
"tensor_parallel": 4,
|
21 |
+
"use_refit": false,
|
22 |
+
"vocab_size": 32000
|
23 |
+
},
|
24 |
+
"plugin_config": {
|
25 |
+
"attention_qk_half_accumulation": false,
|
26 |
+
"bert_attention_plugin": false,
|
27 |
+
"context_fmha_type": 1,
|
28 |
+
"gemm_plugin": "float16",
|
29 |
+
"gpt_attention_plugin": "float16",
|
30 |
+
"identity_plugin": false,
|
31 |
+
"layernorm_plugin": false,
|
32 |
+
"layernorm_quantization_plugin": false,
|
33 |
+
"lookup_plugin": false,
|
34 |
+
"nccl_plugin": "float16",
|
35 |
+
"paged_kv_cache": true,
|
36 |
+
"quantize_per_token_plugin": false,
|
37 |
+
"quantize_tensor_plugin": false,
|
38 |
+
"remove_input_padding": true,
|
39 |
+
"rmsnorm_plugin": false,
|
40 |
+
"rmsnorm_quantization_plugin": false,
|
41 |
+
"smooth_quant_gemm_plugin": false,
|
42 |
+
"tokens_per_block": 64,
|
43 |
+
"use_custom_all_reduce": false,
|
44 |
+
"weight_only_groupwise_quant_matmul_plugin": false,
|
45 |
+
"weight_only_quant_matmul_plugin": false
|
46 |
+
}
|
47 |
+
}
|
tensorrt_llm/config.pbtxt
ADDED
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Triton model configuration for the TensorRT-LLM engine itself
# (backend "tensorrtllm"), driven by the ensemble with the tokenized
# tensors produced by the preprocessing model.
name: "tensorrt_llm"
backend: "tensorrtllm"
max_batch_size: 128

# Decoupled transaction policy: allows the backend to stream multiple
# responses per request (required for the "streaming" input below).
model_transaction_policy {
  decoupled: True
}

input [
  {
    name: "input_ids"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "input_lengths"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
  },
  {
    name: "request_output_len"
    data_type: TYPE_UINT32
    dims: [ 1 ]
  },
  # Optional per-request sampling / decoding controls; each scalar is
  # declared as dims [1] and reshaped to a scalar for the backend.
  {
    name: "end_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_k"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "len_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  # Request-control flags (not reshaped to scalar).
  {
    name: "stop"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "streaming"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  }
]
output [
  {
    name: "output_ids"
    data_type: TYPE_INT32
    dims: [ -1, -1 ]
  }
]
# KIND_CPU here controls only the Triton instance thread; the engine
# itself runs on GPU under the tensorrtllm backend.
instance_group [
  {
    count: 1
    kind : KIND_CPU
  }
]
parameters: {
  key: "max_beam_width"
  value: {
    string_value: "1"
  }
}
parameters: {
  key: "FORCE_CPU_ONLY_INPUT_TENSORS"
  value: {
    string_value: "no"
  }
}
parameters: {
  key: "gpt_model_type"
  value: {
    string_value: "inflight_fused_batching"
  }
}
# NOTE(review): hard-coded engine path — confirm it matches the mounted
# model repository on the deployment host.
parameters: {
  key: "gpt_model_path"
  value: {
    string_value: "/data/tgi-data/triton_model_repo_70_fp8/tensorrt_llm/1"
  }
}
# NOTE(review): the "${...}" values below are fill_template placeholders
# left unsubstituted — verify the template step runs before serving.
parameters: {
  key: "max_tokens_in_paged_kv_cache"
  value: {
    string_value: "${max_tokens_in_paged_kv_cache}"
  }
}
parameters: {
  key: "batch_scheduler_policy"
  value: {
    string_value: "max_utilization"
  }
}
parameters: {
  key: "kv_cache_free_gpu_mem_fraction"
  value: {
    string_value: "0.9"
  }
}
parameters: {
  key: "max_num_sequences"
  value: {
    string_value: "${max_num_sequences}"
  }
}
parameters: {
  key: "enable_trt_overlap"
  value: {
    string_value: "${enable_trt_overlap}"
  }
}
|