#!/usr/bin/env bash
#
# Runs `trtllm-build` twice: once for the encoder checkpoint and once for the
# decoder checkpoint.
#
# Usage: <script> CHECKPOINT_DIR OUTPUT_DIR
#   CHECKPOINT_DIR  directory containing the `encoder/` and `decoder/`
#                   checkpoint sub-directories
#   OUTPUT_DIR      directory whose `encoder/` and `decoder/` sub-directories
#                   are passed to trtllm-build as --output_dir
#
# Exits non-zero if an argument is missing or if either build fails.

# Fail fast: stop on the first failing command, on unset variables, and on
# failures inside pipelines, so the decoder is not built after a failed
# encoder build.
set -euo pipefail

INFERENCE_PRECISION=float16
MAX_BEAM_WIDTH=4
MAX_BATCH_SIZE=64
# One value drives the encoder's --max_input_len/--max_seq_len and the
# decoder's --max_encoder_input_len so the two builds cannot drift apart.
MAX_ENCODER_INPUT_LEN=3000

# Both positional arguments are required; abort with a usage message rather
# than silently building from/to "/encoder" and "/decoder".
checkpoint_dir=${1:?usage: ${0##*/} CHECKPOINT_DIR OUTPUT_DIR}
output_dir=${2:?usage: ${0##*/} CHECKPOINT_DIR OUTPUT_DIR}

# Encoder build.
trtllm-build --checkpoint_dir "${checkpoint_dir}/encoder" \
  --output_dir "${output_dir}/encoder" \
  --moe_plugin disable \
  --enable_xqa disable \
  --max_batch_size "${MAX_BATCH_SIZE}" \
  --gemm_plugin disable \
  --bert_attention_plugin "${INFERENCE_PRECISION}" \
  --max_input_len "${MAX_ENCODER_INPUT_LEN}" \
  --max_seq_len "${MAX_ENCODER_INPUT_LEN}"

# Decoder build.
trtllm-build --checkpoint_dir "${checkpoint_dir}/decoder" \
  --output_dir "${output_dir}/decoder" \
  --moe_plugin disable \
  --enable_xqa disable \
  --max_beam_width "${MAX_BEAM_WIDTH}" \
  --max_batch_size "${MAX_BATCH_SIZE}" \
  --max_seq_len 114 \
  --max_input_len 14 \
  --max_encoder_input_len "${MAX_ENCODER_INPUT_LEN}" \
  --gemm_plugin "${INFERENCE_PRECISION}" \
  --bert_attention_plugin "${INFERENCE_PRECISION}" \
  --gpt_attention_plugin "${INFERENCE_PRECISION}"