# You need to merge Qwen2-1.5B-Instruct with the fine-tuned LLM LoRA first; see merge_lora.py.
# Then you can convert the merged model before building the engine:
# python3 convert_checkpoint.py --model_dir ${model_dir} \
#                               --output_dir ${checkpoint_dir} \
#                               --dtype float16
# We have merged the two models and converted them for TRT-LLM into the checkpoint below:
# TRT-LLM checkpoint produced by convert_checkpoint.py (see header above).
checkpoint_dir=tllm_checkpoint_1gpu_fp16_qwen2_1.5B_instruct_merged
# output engine directory
engine_dir=qwen2_1.5B_instruct_fp16_merged
# max_prompt_embedding_table_size should be >= max_batch_size * speech_embedding_seq_length
# (4096 = 16 * 256 here).
trtllm-build --checkpoint_dir "${checkpoint_dir}" \
  --output_dir "${engine_dir}" \
  --max_prompt_embedding_table_size 4096 \
  --max_batch_size 16 \
  --gemm_plugin float16