yessenzhar committed on
Commit
dd20dba
1 Parent(s): 2d64b65

update models for newer trt 0.6.1 version

ensemble/config.pbtxt CHANGED
@@ -26,7 +26,7 @@

 name: "ensemble"
 platform: "ensemble"
-max_batch_size: 128
+max_batch_size: 64
 input [
   {
     name: "text_input"
@@ -35,34 +35,36 @@ input [
   },
   {
     name: "max_tokens"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ -1 ]
   },
   {
     name: "bad_words"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "stop_words"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "end_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
    dims: [ 1 ]
     optional: true
   },
   {
     name: "pad_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
   {
     name: "top_k"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -92,7 +94,7 @@ input [
   },
   {
     name: "min_length"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -108,9 +110,15 @@ input [
     dims: [ 1 ]
     optional: true
   },
+  {
+    name: "return_log_probs"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "beam_width"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -119,18 +127,47 @@ input [
     data_type: TYPE_BOOL
     dims: [ 1 ]
     optional: true
+  },
+  {
+    name: "prompt_embedding_table"
+    data_type: TYPE_FP16
+    dims: [ -1, -1 ]
+    optional: true
+  },
+  {
+    name: "prompt_vocab_size"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    optional: true
+  },
+  {
+    name: "embedding_bias_words"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+    optional: true
+  },
+  {
+    name: "embedding_bias_weights"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+    optional: true
   }
 ]
 output [
   {
     name: "text_output"
     data_type: TYPE_STRING
-    dims: [ -1, -1 ]
+    dims: [ -1 ]
   },
   {
-    name: "output_tokens"
-    data_type: TYPE_UINT32
+    name: "cum_log_probs"
+    data_type: TYPE_FP32
     dims: [ -1 ]
+  },
+  {
+    name: "output_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
   }
 ]
 ensemble_scheduling {
@@ -154,6 +191,14 @@ ensemble_scheduling {
         key: "STOP_WORDS_DICT"
         value: "stop_words"
       }
+      input_map {
+        key: "EMBEDDING_BIAS_WORDS"
+        value: "embedding_bias_words"
+      }
+      input_map {
+        key: "EMBEDDING_BIAS_WEIGHTS"
+        value: "embedding_bias_weights"
+      }
       output_map {
        key: "REQUEST_INPUT_LEN"
        value: "_REQUEST_INPUT_LEN"
@@ -166,6 +211,18 @@ ensemble_scheduling {
        key: "REQUEST_OUTPUT_LEN"
        value: "_REQUEST_OUTPUT_LEN"
      }
+      output_map {
+        key: "STOP_WORDS_IDS"
+        value: "_STOP_WORDS_IDS"
+      }
+      output_map {
+        key: "BAD_WORDS_IDS"
+        value: "_BAD_WORDS_IDS"
+      }
+      output_map {
+        key: "EMBEDDING_BIAS"
+        value: "_EMBEDDING_BIAS"
+      }
    },
    {
      model_name: "tensorrt_llm"
@@ -190,6 +247,10 @@ ensemble_scheduling {
        key: "pad_id"
        value: "pad_id"
      }
+      input_map {
+        key: "embedding_bias"
+        value: "_EMBEDDING_BIAS"
+      }
      input_map {
        key: "runtime_top_k"
        value: "top_k"
@@ -222,6 +283,10 @@ ensemble_scheduling {
        key: "random_seed"
        value: "random_seed"
      }
+      input_map {
+        key: "return_log_probs"
+        value: "return_log_probs"
+      }
      input_map {
        key: "beam_width"
        value: "beam_width"
@@ -230,10 +295,38 @@ ensemble_scheduling {
        key: "streaming"
        value: "stream"
      }
+      input_map {
+        key: "prompt_embedding_table"
+        value: "prompt_embedding_table"
+      }
+      input_map {
+        key: "prompt_vocab_size"
+        value: "prompt_vocab_size"
+      }
+      input_map {
+        key: "stop_words_list"
+        value: "_STOP_WORDS_IDS"
+      }
+      input_map {
+        key: "bad_words_list"
+        value: "_BAD_WORDS_IDS"
+      }
      output_map {
        key: "output_ids"
        value: "_TOKENS_BATCH"
      }
+      output_map {
+        key: "sequence_length"
+        value: "_SEQUENCE_LENGTH"
+      }
+      output_map {
+        key: "cum_log_probs"
+        value: "_CUM_LOG_PROBS"
+      }
+      output_map {
+        key: "output_log_probs"
+        value: "_OUTPUT_LOG_PROBS"
+      }
    },
    {
      model_name: "postprocessing"
@@ -242,13 +335,29 @@ ensemble_scheduling {
        key: "TOKENS_BATCH"
        value: "_TOKENS_BATCH"
      }
+      input_map {
+        key: "CUM_LOG_PROBS"
+        value: "_CUM_LOG_PROBS"
+      }
+      input_map {
+        key: "OUTPUT_LOG_PROBS"
+        value: "_OUTPUT_LOG_PROBS"
+      }
+      input_map {
+        key: "SEQUENCE_LENGTH"
+        value: "_SEQUENCE_LENGTH"
+      }
      output_map {
        key: "OUTPUT"
        value: "text_output"
      }
      output_map {
-        key: "OUTPUT_LENS"
-        value: "output_tokens"
+        key: "OUT_OUTPUT_LOG_PROBS"
+        value: "output_log_probs"
+      }
+      output_map {
+        key: "OUT_CUM_LOG_PROBS"
+        value: "cum_log_probs"
      }
    }
  ]
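
Note: the "tensorrt_llm" step stays decoupled, so the ensemble has to be called over the streaming gRPC API. A minimal client sketch (not part of this commit), assuming tritonclient is installed and the server listens on localhost:8001; it exercises the new optional return_log_probs input and the cum_log_probs / output_log_probs outputs declared above:

import numpy as np
import tritonclient.grpc as grpcclient

def prepare(name, dtype, data):
    # Wrap a numpy array as a Triton input tensor.
    t = grpcclient.InferInput(name, list(data.shape), dtype)
    t.set_data_from_numpy(data)
    return t

inputs = [
    prepare("text_input", "BYTES", np.array([["Hello"]], dtype=object)),
    prepare("max_tokens", "INT32", np.array([[64]], dtype=np.int32)),
    prepare("return_log_probs", "BOOL", np.array([[True]])),
    prepare("stream", "BOOL", np.array([[False]])),
]
outputs = [grpcclient.InferRequestedOutput(n)
           for n in ("text_output", "cum_log_probs", "output_log_probs")]

results = []
with grpcclient.InferenceServerClient("localhost:8001") as client:
    # Decoupled models must be invoked through a stream.
    client.start_stream(callback=lambda res, err: results.append((res, err)))
    client.async_stream_infer("ensemble", inputs, outputs=outputs)
for res, err in results:
    if err is None:
        print(res.as_numpy("text_output"), res.as_numpy("cum_log_probs"))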
postprocessing/1/model.py CHANGED
@@ -57,13 +57,18 @@ class TritonPythonModel:
             'string_value']
         tokenizer_type = model_config['parameters']['tokenizer_type'][
             'string_value']
+        self.skip_special_tokens = model_config['parameters'].get(
+            'skip_special_tokens',
+            {'string_value': "true"})['string_value'].lower() in [
+                'true', '1', 't', 'y', 'yes'
+            ]

         if tokenizer_type == 't5':
             self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
                                          padding_side='left')
         elif tokenizer_type == 'auto':
-            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
-                                                           padding_side='left')
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer_dir, padding_side='left', trust_remote_code=True)
         elif tokenizer_type == 'llama':
             self.tokenizer = LlamaTokenizer.from_pretrained(
                 tokenizer_dir, legacy=False, padding_side='left')
@@ -72,8 +77,6 @@ class TritonPythonModel:
                 f'Unexpected tokenizer type: {tokenizer_type}')
         self.tokenizer.pad_token = self.tokenizer.eos_token

-        self._init_token_map()
-
         # Parse model output configs
         output_config = pb_utils.get_output_config_by_name(
             model_config, "OUTPUT")
@@ -84,20 +87,6 @@ class TritonPythonModel:
         output_lens_config = pb_utils.get_output_config_by_name(
             model_config, "OUTPUT_LENS")

-        # Convert Triton types to numpy types
-        self.output_lens_dtype = pb_utils.triton_string_to_numpy(
-            output_lens_config['data_type'])
-
-    def _init_token_map(self):
-        v = self.tokenizer.get_vocab()
-        self.token_map = [None] * len(v)
-        for k, val in v.items():
-            self.token_map[val] = k
-
-        for i in range(len(v)):
-            if self.token_map[i] is None:
-                print("error %s" % i)
-
     def execute(self, requests):
         """`execute` must be implemented in every Python model. `execute`
         function receives a list of pb_utils.InferenceRequest as the only
@@ -127,21 +116,37 @@ class TritonPythonModel:
             tokens_batch = pb_utils.get_input_tensor_by_name(
                 request, 'TOKENS_BATCH').as_numpy()

+            # Get sequence lengths
+            sequence_lengths = pb_utils.get_input_tensor_by_name(
+                request, 'SEQUENCE_LENGTH').as_numpy()
+
+            # Get cum log probs
+            cum_log_probs = pb_utils.get_input_tensor_by_name(
+                request, 'CUM_LOG_PROBS').as_numpy()
+
+            # Get output log probs
+            output_log_probs = pb_utils.get_input_tensor_by_name(
+                request, 'OUTPUT_LOG_PROBS').as_numpy()
+
             # Reshape Input
             # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
             # tokens_batch = tokens_batch.T

             # Postprocessing output data.
-            outputs, output_lens = self._postprocessing(tokens_batch)
+            outputs = self._postprocessing(tokens_batch, sequence_lengths)

             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
             output_tensor = pb_utils.Tensor(
                 'OUTPUT',
                 np.array(outputs).astype(self.output_dtype))
-            output_lens_tensor = pb_utils.Tensor(
-                'OUTPUT_LENS',
-                np.array(output_lens).astype(self.output_lens_dtype))
+
+            out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
+                                                cum_log_probs)
+
+            out_output_log_probs = pb_utils.Tensor('OUT_OUTPUT_LOG_PROBS',
+                                                   output_log_probs)

             # Create InferenceResponse. You can set an error here in case
             # there was a problem with handling this inference request.
@@ -150,8 +155,9 @@ class TritonPythonModel:
             #
             # pb_utils.InferenceResponse(
             #     output_tensors=..., TritonError("An error occurred"))
-            inference_response = pb_utils.InferenceResponse(
-                output_tensors=[output_tensor, output_lens_tensor])
+            inference_response = pb_utils.InferenceResponse(output_tensors=[
+                output_tensor, out_cum_log_probs, out_output_log_probs
+            ])
             responses.append(inference_response)

         # You should return a list of pb_utils.InferenceResponse. Length
@@ -165,24 +171,13 @@ class TritonPythonModel:
         """
         print('Cleaning up...')

-    def _single_token_decode(self, token):
-        st = self.token_map[token]
-        if st[0] == '▁':
-            return " " + st[1:]
-        else:
-            return self.tokenizer.decode([token])
-
-    def _postprocessing(self, tokens_batch):
+    def _postprocessing(self, tokens_batch, sequence_lengths):
         outputs = []
-        output_lens = []
-        for beam_tokens in tokens_batch:
-            total_len = 0
-            for tokens in beam_tokens:
-                if len(tokens) == 1:
-                    output = self._single_token_decode(tokens[0])
-                else:
-                    output = self.tokenizer.decode(tokens)
+        for batch_idx, beam_tokens in enumerate(tokens_batch):
+            for beam_idx, tokens in enumerate(beam_tokens):
+                seq_len = sequence_lengths[batch_idx][beam_idx]
+                output = self.tokenizer.decode(
+                    tokens[:seq_len],
+                    skip_special_tokens=self.skip_special_tokens)
                 outputs.append(output.encode('utf8'))
-                total_len += len(tokens)
-            output_lens.append(total_len)
-        return outputs, output_lens
+        return outputs
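Note: the rewritten _postprocessing only decodes the first seq_len tokens of each beam, so right-padding no longer leaks into the decoded text. A standalone toy check of that trimming logic (token IDs made up for illustration):

import numpy as np

tokens_batch = np.array([[[12, 7, 99, 0, 0]]])  # [batch, beam, max_len]; trailing 0s are padding
sequence_lengths = np.array([[3]])              # [batch, beam]

for batch_idx, beam_tokens in enumerate(tokens_batch):
    for beam_idx, tokens in enumerate(beam_tokens):
        seq_len = sequence_lengths[batch_idx][beam_idx]
        print(tokens[:seq_len])                 # -> [12  7 99]; only these reach tokenizer.decode()
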
postprocessing/config.pbtxt CHANGED
@@ -32,17 +32,42 @@ input [
     name: "TOKENS_BATCH"
     data_type: TYPE_INT32
     dims: [ -1, -1 ]
+  },
+  {
+    name: "SEQUENCE_LENGTH"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
+  {
+    name: "CUM_LOG_PROBS"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "OUTPUT_LOG_PROBS"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
   }
 ]
 output [
   {
     name: "OUTPUT"
     data_type: TYPE_STRING
+    dims: [ -1 ]
+  },
+  {
+    name: "OUT_CUM_LOG_PROBS"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "OUT_OUTPUT_LOG_PROBS"
+    data_type: TYPE_FP32
     dims: [ -1, -1 ]
   },
   {
     name: "OUTPUT_LENS"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ -1 ]
   }
 ]
@@ -50,7 +75,7 @@ output [
 parameters {
   key: "tokenizer_dir"
   value: {
-    string_value: "${tokenizer_dir}"
+    string_value: "/data/llama/Llama-2-70b-chat-hf/"
   }
 }

@@ -61,9 +86,16 @@ parameters {
   }
 }

+parameters {
+  key: "skip_special_tokens"
+  value: {
+    string_value: "True"
+  }
+}
+
 instance_group [
   {
-    count: 1
+    count: 4
     kind: KIND_CPU
   }
 ]
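
Note: skip_special_tokens is declared as a plain string parameter; the model.py change above lower-cases it before testing membership, so the "True" written here is accepted. A one-line check of that parsing rule:

print("True".lower() in ['true', '1', 't', 'y', 'yes'])  # -> True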
preprocessing/1/model.py CHANGED
@@ -24,14 +24,11 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-import csv
 import json
 from typing import List

 import numpy as np
-import torch
 import triton_python_backend_utils as pb_utils
-from torch.nn.utils.rnn import pad_sequence
 from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer


@@ -61,13 +58,18 @@ class TritonPythonModel:
             'string_value']
         tokenizer_type = model_config['parameters']['tokenizer_type'][
             'string_value']
+        self.add_special_tokens = model_config['parameters'].get(
+            'add_special_tokens',
+            {'string_value': "false"})['string_value'].lower() in [
+                'true', '1', 't', 'y', 'yes'
+            ]

         if tokenizer_type == 't5':
             self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
                                          padding_side='left')
         elif tokenizer_type == 'auto':
-            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
-                                                           padding_side='left')
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer_dir, padding_side='left', trust_remote_code=True)
         elif tokenizer_type == 'llama':
             self.tokenizer = LlamaTokenizer.from_pretrained(
                 tokenizer_dir, legacy=False, padding_side='left')
@@ -80,17 +82,26 @@ class TritonPythonModel:
                 add_special_tokens=False)[0]

         # Parse model output configs and convert Triton types to numpy types
-        input_names = [
+        output_names = [
             "INPUT_ID", "REQUEST_INPUT_LEN", "BAD_WORDS_IDS", "STOP_WORDS_IDS"
         ]
+        input_names = ["EMBEDDING_BIAS_WORDS", "EMBEDDING_BIAS_WEIGHTS"]
         for input_name in input_names:
             setattr(
                 self,
                 input_name.lower() + "_dtype",
                 pb_utils.triton_string_to_numpy(
-                    pb_utils.get_output_config_by_name(
+                    pb_utils.get_input_config_by_name(
                         model_config, input_name)['data_type']))

+        for output_name in output_names:
+            setattr(
+                self,
+                output_name.lower() + "_dtype",
+                pb_utils.triton_string_to_numpy(
+                    pb_utils.get_output_config_by_name(
+                        model_config, output_name)['data_type']))
+
     def execute(self, requests):
         """`execute` must be implemented in every Python model. `execute`
         function receives a list of pb_utils.InferenceRequest as the only
@@ -115,48 +126,73 @@ class TritonPythonModel:

         # Every Python backend must iterate over everyone of the requests
         # and create a pb_utils.InferenceResponse for each of them.
+        logger = pb_utils.Logger
         for idx, request in enumerate(requests):
             # Get input tensors
             query = pb_utils.get_input_tensor_by_name(request,
                                                       'QUERY').as_numpy()
+            batch_dim = query.shape[0]
+            if batch_dim != 1:
+                err_str = "Inflight batching backend expects requests with batch size of 1."
+                logger.log_error(err_str)
+                responses.append(
+                    pb_utils.InferenceResponse(
+                        output_tensors=[],
+                        error=pb_utils.TritonError(err_str)))
+                continue
+
             request_output_len = pb_utils.get_input_tensor_by_name(
                 request, 'REQUEST_OUTPUT_LEN').as_numpy()

             bad_words_dict = pb_utils.get_input_tensor_by_name(
-                request, 'BAD_WORDS_DICT').as_numpy()
+                request, 'BAD_WORDS_DICT')
+            if bad_words_dict is not None:
+                bad_words_dict = bad_words_dict.as_numpy()
+
             stop_words_dict = pb_utils.get_input_tensor_by_name(
-                request, 'STOP_WORDS_DICT').as_numpy()
+                request, 'STOP_WORDS_DICT')
+            if stop_words_dict is not None:
+                stop_words_dict = stop_words_dict.as_numpy()
+
+            embedding_bias_words = pb_utils.get_input_tensor_by_name(
+                request, 'EMBEDDING_BIAS_WORDS')
+            if embedding_bias_words is not None:
+                embedding_bias_words = embedding_bias_words.as_numpy()
+
+            embedding_bias_weights = pb_utils.get_input_tensor_by_name(
+                request, 'EMBEDDING_BIAS_WEIGHTS')
+            if embedding_bias_weights is not None:
+                embedding_bias_weights = embedding_bias_weights.as_numpy()

             # Preprocessing input data.
             input_id, request_input_len = self._create_request(query)
             bad_words = self._to_word_list_format(bad_words_dict)
             stop_words = self._to_word_list_format(stop_words_dict)

+            embedding_bias = self._get_embedding_bias(
+                embedding_bias_words, embedding_bias_weights,
+                self.embedding_bias_weights_dtype)
+
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
             input_id_tensor = pb_utils.Tensor(
-                'INPUT_ID',
-                np.array(input_id).astype(self.input_id_dtype))
+                'INPUT_ID', input_id.astype(self.input_id_dtype))
             request_input_len_tensor = pb_utils.Tensor(
                 'REQUEST_INPUT_LEN',
-                np.array(request_input_len).astype(
-                    self.request_input_len_dtype))
+                request_input_len.astype(self.request_input_len_dtype))
             request_output_len_tensor = pb_utils.Tensor(
                 'REQUEST_OUTPUT_LEN', request_output_len)
             bad_words_ids_tensor = pb_utils.Tensor('BAD_WORDS_IDS', bad_words)
             stop_words_ids_tensor = pb_utils.Tensor('STOP_WORDS_IDS',
                                                     stop_words)
+            embedding_bias_tensor = pb_utils.Tensor('EMBEDDING_BIAS',
+                                                    embedding_bias)

-            # Create InferenceResponse. You can set an error here in case
-            # there was a problem with handling this inference request.
-            # Below is an example of how you can set errors in inference
-            # response:
-            #
-            # pb_utils.InferenceResponse(
-            #     output_tensors=..., TritonError("An error occurred"))
             inference_response = pb_utils.InferenceResponse(output_tensors=[
                 input_id_tensor, bad_words_ids_tensor, stop_words_ids_tensor,
-                request_input_len_tensor, request_output_len_tensor
+                request_input_len_tensor, request_output_len_tensor,
+                embedding_bias_tensor
             ])
             responses.append(inference_response)

@@ -176,44 +212,48 @@ class TritonPythonModel:
             query : batch string (2D numpy array)
         """
         start_ids = [
-            torch.IntTensor(self.tokenizer.encode(s[0].decode()))
+            np.array(
+                self.tokenizer.encode(
+                    s[0].decode(),
+                    add_special_tokens=self.add_special_tokens)).astype(int)
             for s in query
         ]
-        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
+        start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)

-        start_ids = pad_sequence(start_ids,
-                                 batch_first=True,
-                                 padding_value=self.pad_id)
-        # input_len = min(start_lengths)
-        #attn_mask = torch.ones((batch_size, input_len, input_len)).tril()
+        max_len = 0
+        for seq in start_ids:
+            max_len = max(max_len, seq.shape[0])
+        start_ids = np.stack([
+            np.pad(seq, (0, max_len - seq.shape[0]),
+                   'constant',
+                   constant_values=(0, self.pad_id)) for seq in start_ids
+        ])

         return start_ids, start_lengths

-    def _to_word_list_format(self, word_dict: List[List[str]]):
+    def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
         '''
-        format of word_dict
-            len(word_dict) should be same to batch_size
-            word_dict[i] means the words for batch i
-            len(word_dict[i]) must be 1, which means it only contains 1 string
-            This string can contains several sentences and split by ",".
-            For example, if word_dict[2] = " I am happy, I am sad", then this function will return
-            the ids for two short sentences " I am happy" and " I am sad".
+        word_lists format:
+            len(word_lists) == batch_size
+            word_lists[i] means the words associated to batch item i. A "word" may actually be any string. Like "lorem" or "lorem ipsum".
         '''
         assert self.tokenizer != None, "need to set tokenizer"

+        if word_lists is None:
+            # Return an empty array of shape (1,2,0)
+            return np.empty([1, 2, 0], dtype="int32")
+
         flat_ids = []
         offsets = []
-        for word_dict_item in word_dict:
+        for word_list in word_lists:
             item_flat_ids = []
             item_offsets = []

-            if isinstance(word_dict_item[0], bytes):
-                word_dict_item = [word_dict_item[0].decode()]
-
-            words = list(csv.reader(word_dict_item))[0]
-            for word in words:
-                ids = self.tokenizer.encode(word)
+            for word in word_list:
+                if isinstance(word, bytes):
+                    word = word.decode()

+                ids = self.tokenizer.encode(word, add_special_tokens=False)
                 if len(ids) == 0:
                     continue

@@ -233,3 +273,37 @@ class TritonPythonModel:

         return np.array([flat_ids, offsets], dtype="int32").transpose(
             (1, 0, 2))
+
+    def _get_embedding_bias(self, embedding_bias_words, embedding_bias_weights,
+                            bias_dtype):
+
+        assert self.tokenizer != None, "need to set tokenizer"
+
+        if embedding_bias_words is None or embedding_bias_weights is None:
+            return np.empty([1, 0], dtype=self.embedding_bias_weights_dtype)
+
+        batch_embedding_bias = []
+        for words, weights in zip(embedding_bias_words,
+                                  embedding_bias_weights):
+
+            vocab_size = self.tokenizer.vocab_size
+            embedding_bias = [0.] * vocab_size
+
+            assert len(words) == len(
+                weights
+            ), "Embedding bias words must have same dimension as embedding bias weights"
+
+            for word, weight in zip(words, weights):
+                if isinstance(word, bytes):
+                    word = word.decode()
+                ids = self.tokenizer.encode(word)
+
+                if len(ids) == 0:
+                    continue
+
+                for id in ids:
+                    embedding_bias[id] += weight
+
+            batch_embedding_bias.append(np.array(embedding_bias))
+
+        return np.array(batch_embedding_bias, dtype=bias_dtype)
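
Note: _create_request now pads with plain numpy instead of torch's pad_sequence. A standalone sketch (made-up token IDs, pad_id assumed to be 2) showing the replacement still yields right-padded rows, matching pad_sequence(..., batch_first=True, padding_value=pad_id):

import numpy as np

pad_id = 2
start_ids = [np.array([5, 6, 7]), np.array([8])]

max_len = max(seq.shape[0] for seq in start_ids)
padded = np.stack([
    np.pad(seq, (0, max_len - seq.shape[0]),
           'constant', constant_values=(0, pad_id))
    for seq in start_ids
])
print(padded)  # [[5 6 7]
               #  [8 2 2]]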
preprocessing/config.pbtxt CHANGED
@@ -33,20 +33,34 @@ input [
     data_type: TYPE_STRING
     dims: [ -1 ]
   },
+  {
+    name: "REQUEST_OUTPUT_LEN"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
   {
     name: "BAD_WORDS_DICT"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "STOP_WORDS_DICT"
     data_type: TYPE_STRING
     dims: [ -1 ]
+    optional: true
   },
   {
-    name: "REQUEST_OUTPUT_LEN"
-    data_type: TYPE_UINT32
+    name: "EMBEDDING_BIAS_WORDS"
+    data_type: TYPE_STRING
+    dims: [ -1 ]
+    optional: true
+  },
+  {
+    name: "EMBEDDING_BIAS_WEIGHTS"
+    data_type: TYPE_FP32
     dims: [ -1 ]
+    optional: true
   }
 ]
 output [
@@ -70,9 +84,14 @@ output [
     data_type: TYPE_INT32
     dims: [ 2, -1 ]
   },
+  {
+    name: "EMBEDDING_BIAS"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
   {
     name: "REQUEST_OUTPUT_LEN"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ -1 ]
   }
 ]
@@ -80,7 +99,7 @@ output [
 parameters {
   key: "tokenizer_dir"
   value: {
-    string_value: "${tokenizer_dir}"
+    string_value: "/data/llama/Llama-2-70b-chat-hf/"
   }
 }

@@ -91,9 +110,16 @@ parameters {
   }
 }

+parameters {
+  key: "add_special_tokens"
+  value: {
+    string_value: "False"
+  }
+}
+
 instance_group [
   {
-    count: 1
+    count: 4
     kind: KIND_CPU
   }
 ]
tensorrt_llm/1/config.json CHANGED
@@ -47,6 +47,7 @@
     "tokens_per_block": 128,
     "use_custom_all_reduce": false,
     "use_paged_context_fmha": false,
+    "use_context_fmha_for_generation": false,
     "weight_only_groupwise_quant_matmul_plugin": false,
     "weight_only_quant_matmul_plugin": false
   }
tensorrt_llm/config.pbtxt CHANGED
@@ -26,17 +26,23 @@

 name: "tensorrt_llm"
 backend: "tensorrtllm"
-max_batch_size: 128
+max_batch_size: 64

 model_transaction_policy {
   decoupled: True
 }

+dynamic_batching {
+  preferred_batch_size: [ 1 ]
+  max_queue_delay_microseconds: 1000
+}
+
 input [
   {
     name: "input_ids"
     data_type: TYPE_INT32
     dims: [ -1 ]
+    allow_ragged_batch: true
   },
   {
     name: "input_lengths"
@@ -46,26 +52,54 @@ input [
   },
   {
     name: "request_output_len"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
   },
+  {
+    name: "draft_input_ids"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
   {
     name: "end_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
   },
   {
     name: "pad_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "stop_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  {
+    name: "bad_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  {
+    name: "embedding_bias"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
   {
     name: "beam_width"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
@@ -79,7 +113,7 @@ input [
   },
   {
     name: "runtime_top_k"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
@@ -107,7 +141,7 @@ input [
   },
   {
     name: "min_length"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
@@ -126,6 +160,13 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "return_log_probs"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "stop"
     data_type: TYPE_BOOL
@@ -137,6 +178,20 @@ input [
     data_type: TYPE_BOOL
     dims: [ 1 ]
     optional: true
+  },
+  {
+    name: "prompt_embedding_table"
+    data_type: TYPE_FP16
+    dims: [ -1, -1 ]
+    optional: true
+    allow_ragged_batch: true
+  },
+  {
+    name: "prompt_vocab_size"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
   }
 ]
 output [
@@ -144,6 +199,21 @@ output [
     name: "output_ids"
     data_type: TYPE_INT32
     dims: [ -1, -1 ]
+  },
+  {
+    name: "sequence_length"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
+  {
+    name: "cum_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "output_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
   }
 ]
 instance_group [
@@ -173,13 +243,19 @@ parameters: {
 parameters: {
   key: "gpt_model_path"
   value: {
-    string_value: "${gpt_model_path}"
+    string_value: "/data/tgi-data/yessen/Llama-2-70b-chat-hf-trt-fp8/tensorrt_llm/1"
   }
 }
 parameters: {
   key: "max_tokens_in_paged_kv_cache"
   value: {
-    string_value: "${max_tokens_in_paged_kv_cache}"
+    string_value: "40000"
+  }
+}
+parameters: {
+  key: "max_attention_window_size"
+  value: {
+    string_value: "4096"
   }
 }
 parameters: {
@@ -195,20 +271,26 @@ parameters: {
   }
 }
 parameters: {
-  key: "exclude_input_in_output"
+  key: "max_num_sequences"
   value: {
-    string_value: "true"
+    string_value: "64"
   }
 }
 parameters: {
-  key: "max_num_sequences"
+  key: "enable_trt_overlap"
   value: {
-    string_value: "${max_num_sequences}"
+    string_value: "false"
   }
 }
 parameters: {
-  key: "enable_trt_overlap"
+  key: "exclude_input_in_output"
+  value: {
+    string_value: "true"
+  }
+}
+parameters: {
+  key: "enable_kv_cache_reuse"
   value: {
-    string_value: "${enable_trt_overlap}"
+    string_value: "${enable_kv_cache_reuse}"
   }
 }
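
Note: enable_kv_cache_reuse is the one remaining ${...} placeholder, so it still has to be substituted before Triton can load this config. A minimal stand-in for the repo's template-filling step (the relative path and chosen value are assumptions); Python's string.Template understands the same ${name} syntax:

from pathlib import Path
from string import Template

path = Path("tensorrt_llm/config.pbtxt")
# safe_substitute fills enable_kv_cache_reuse and leaves any other ${...} untouched.
filled = Template(path.read_text()).safe_substitute(enable_kv_cache_reuse="false")
path.write_text(filled)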