yuekai
/

distill_whisper_large_v3_trtllm_triton

yuekai committed on Aug 21

Commit

f605912

•

1 Parent(s): a3eaad0

Upload folder using huggingface_hub

Files changed (10) hide show

build_whisper_fp16.sh ADDED Viewed

+#!/usr/bin/env bash
+# Runs run.py against the FP16 TensorRT-LLM Whisper engines in output_dir,
+# using the hf-internal-testing/librispeech_asr_dummy dataset.
+#
+# Both trtllm-build steps are kept commented out, exactly as in the original
+# script, so this script only runs the final run.py step. Presumably the
+# engines were already built into output_dir. TODO confirm.
+# Uncomment both blocks to rebuild the engines from checkpoint_dir.
+#
+# Environment:
+#   CUDA_VISIBLE_DEVICES - GPU index to use. Falls back to 1 when unset or empty.
+#
+# Stop at the first failing command or unset variable.
+set -eu
+#
+# Respect a GPU selection made by the caller. The previous hard-coded value 1
+# stays as the default.
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-1}"
+#
+# Build settings. These are only referenced by the commented-out build steps.
+INFERENCE_PRECISION=float16
+MAX_BEAM_WIDTH=4
+MAX_BATCH_SIZE=8
+checkpoint_dir=tllm_checkpoint
+output_dir=distill_whisper_large_v3
+#
+# Encoder engine build. Disabled.
+# trtllm-build --checkpoint_dir "${checkpoint_dir}/encoder" \
+# --output_dir "${output_dir}/encoder" \
+# --paged_kv_cache disable \
+# --moe_plugin disable \
+# --enable_xqa disable \
+# --max_batch_size "${MAX_BATCH_SIZE}" \
+# --gemm_plugin disable \
+# --bert_attention_plugin "${INFERENCE_PRECISION}" \
+# --remove_input_padding disable
+#
+# Decoder engine build. Disabled.
+# trtllm-build --checkpoint_dir "${checkpoint_dir}/decoder" \
+# --output_dir "${output_dir}/decoder" \
+# --paged_kv_cache disable \
+# --moe_plugin disable \
+# --enable_xqa disable \
+# --max_beam_width "${MAX_BEAM_WIDTH}" \
+# --max_batch_size "${MAX_BATCH_SIZE}" \
+# --max_seq_len 100 \
+# --max_input_len 14 \
+# --max_encoder_input_len 1500 \
+# --gemm_plugin "${INFERENCE_PRECISION}" \
+# --bert_attention_plugin "${INFERENCE_PRECISION}" \
+# --gpt_attention_plugin "${INFERENCE_PRECISION}" \
+# --remove_input_padding disable
+#
+# The run name is derived from output_dir so each engine set gets its own name.
+python3 run.py \
+  --engine_dir "${output_dir}" \
+  --dataset hf-internal-testing/librispeech_asr_dummy \
+  --name "librispeech_dummy_${output_dir}"

build_whisper_int8.sh ADDED Viewed

+#!/usr/bin/env bash
+# Builds the Whisper encoder and decoder TensorRT-LLM engines from the
+# weight-only checkpoint in checkpoint_dir, then runs run.py against the
+# resulting engine directory using the
+# hf-internal-testing/librispeech_asr_dummy dataset.
+#
+# The convert_checkpoint.py step is kept commented out, exactly as in the
+# original script. Presumably checkpoint_dir already holds the converted
+# weights. TODO confirm. Uncomment it to regenerate the checkpoint.
+#
+# Environment:
+#   CUDA_VISIBLE_DEVICES - GPU index to use. Falls back to 1 when unset or empty.
+#
+# Stop at the first failing command or unset variable. Without this a failed
+# encoder build would still run the decoder build, and run.py would then run
+# against missing or stale engines.
+set -eu
+#
+# Respect a GPU selection made by the caller. The previous hard-coded value 1
+# stays as the default.
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-1}"
+#
+# Build settings. The weight-only precision is part of both directory names.
+INFERENCE_PRECISION=float16
+WEIGHT_ONLY_PRECISION=int8
+MAX_BEAM_WIDTH=4
+MAX_BATCH_SIZE=8
+checkpoint_dir="tllm_checkpoint_${WEIGHT_ONLY_PRECISION}"
+output_dir="distil_whisper_large_v3_${WEIGHT_ONLY_PRECISION}"
+#
+# Checkpoint conversion. Disabled.
+# python3 convert_checkpoint.py --use_weight_only \
+# --weight_only_precision "${WEIGHT_ONLY_PRECISION}" \
+# --output_dir "${checkpoint_dir}"
+#
+# Encoder engine build.
+trtllm-build --checkpoint_dir "${checkpoint_dir}/encoder" \
+  --output_dir "${output_dir}/encoder" \
+  --paged_kv_cache disable \
+  --moe_plugin disable \
+  --enable_xqa disable \
+  --max_batch_size "${MAX_BATCH_SIZE}" \
+  --gemm_plugin "${INFERENCE_PRECISION}" \
+  --bert_attention_plugin "${INFERENCE_PRECISION}" \
+  --remove_input_padding disable
+#
+# Decoder engine build. Adds the beam width and the sequence length limits.
+trtllm-build --checkpoint_dir "${checkpoint_dir}/decoder" \
+  --output_dir "${output_dir}/decoder" \
+  --paged_kv_cache disable \
+  --moe_plugin disable \
+  --enable_xqa disable \
+  --max_beam_width "${MAX_BEAM_WIDTH}" \
+  --max_batch_size "${MAX_BATCH_SIZE}" \
+  --max_seq_len 100 \
+  --max_input_len 14 \
+  --max_encoder_input_len 1500 \
+  --gemm_plugin "${INFERENCE_PRECISION}" \
+  --bert_attention_plugin "${INFERENCE_PRECISION}" \
+  --gpt_attention_plugin "${INFERENCE_PRECISION}" \
+  --remove_input_padding disable
+#
+# The run name is derived from output_dir so each engine set gets its own name.
+python3 run.py \
+  --engine_dir "${output_dir}" \
+  --dataset hf-internal-testing/librispeech_asr_dummy \
+  --name "librispeech_dummy_${output_dir}"

tllm_checkpoint/decoder/config.json ADDED Viewed

+{
+ "architecture": "DecoderModel",
+ "dtype": "float16",
+ "logits_dtype": "float16",
+ "num_hidden_layers": 2,
+ "num_attention_heads": 20,
+ "hidden_size": 1280,
+ "norm_epsilon": 1e-05,
+ "vocab_size": 51866,
+ "hidden_act": "gelu",
+ "use_parallel_embedding": false,
+ "embedding_sharding_dim": 0,
+ "max_position_embeddings": 448,
+ "use_prompt_tuning": false,
+ "head_size": 64,
+ "has_position_embedding": true,
+ "layernorm_type": 0,
+ "has_attention_qkvo_bias": true,
+ "has_mlp_bias": true,
+ "has_model_final_layernorm": true,
+ "has_embedding_layernorm": false,
+ "has_embedding_scale": false,
+ "ffn_hidden_size": 5120,
+ "q_scaling": 1.0,
+ "layernorm_position": 0,
+ "relative_attention": false,
+ "max_distance": 0,
+ "num_buckets": 0,
+ "model_type": "whisper",
+ "rescale_before_lm_head": false,
+ "encoder_hidden_size": 1280,
+ "encoder_num_heads": 20,
+ "encoder_head_size": null,
+ "skip_cross_qkv": false,
+ "quantization": {
+ "quant_algo": null
+ }
+}

tllm_checkpoint/decoder/rank0.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a2e1b9d985e4764effd3367fb7994c5b5cc4ac1dedd63f3c829d05f30113118
+size 371665280

tllm_checkpoint/encoder/config.json ADDED Viewed

+{
+ "architecture": "WhisperEncoder",
+ "dtype": "float16",
+ "num_hidden_layers": 32,
+ "num_attention_heads": 20,
+ "hidden_size": 1280,
+ "n_mels": 128,
+ "n_audio_ctx": 1500,
+ "vocab_size": 51866,
+ "hidden_act": "gelu",
+ "num_languages": 100,
+ "quantization": {
+ "quant_algo": null
+ }
+}

tllm_checkpoint/encoder/rank0.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc9f10fdcb40284710f7488bba8c1a021c0d3099487ee886c95c537aacbc22b9
+size 1288720864

tllm_checkpoint_int8/decoder/config.json ADDED Viewed

+{
+ "architecture": "DecoderModel",
+ "dtype": "float16",
+ "logits_dtype": "float16",
+ "num_hidden_layers": 2,
+ "num_attention_heads": 20,
+ "hidden_size": 1280,
+ "norm_epsilon": 1e-05,
+ "vocab_size": 51866,
+ "hidden_act": "gelu",
+ "use_parallel_embedding": false,
+ "embedding_sharding_dim": 0,
+ "max_position_embeddings": 448,
+ "use_prompt_tuning": false,
+ "head_size": 64,
+ "has_position_embedding": true,
+ "layernorm_type": 0,
+ "has_attention_qkvo_bias": true,
+ "has_mlp_bias": true,
+ "has_model_final_layernorm": true,
+ "has_embedding_layernorm": false,
+ "has_embedding_scale": false,
+ "ffn_hidden_size": 5120,
+ "q_scaling": 1.0,
+ "layernorm_position": 0,
+ "relative_attention": false,
+ "max_distance": 0,
+ "num_buckets": 0,
+ "model_type": "whisper",
+ "rescale_before_lm_head": false,
+ "encoder_hidden_size": 1280,
+ "encoder_num_heads": 20,
+ "encoder_head_size": null,
+ "skip_cross_qkv": false,
+ "quantization": {
+ "quant_algo": "W8A16"
+ }
+}

tllm_checkpoint_int8/decoder/rank0.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:85c059ed4fa6c74025538b2c38b42bce19755dfad6390ee1930dfc5c111fed19
+size 319304296

tllm_checkpoint_int8/encoder/config.json ADDED Viewed

+{
+ "architecture": "WhisperEncoder",
+ "dtype": "float16",
+ "num_hidden_layers": 32,
+ "num_attention_heads": 20,
+ "hidden_size": 1280,
+ "n_mels": 128,
+ "n_audio_ctx": 1500,
+ "vocab_size": 51866,
+ "hidden_act": "gelu",
+ "num_languages": 100,
+ "quantization": {
+ "quant_algo": "W8A16"
+ }
+}

tllm_checkpoint_int8/encoder/rank0.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5637200b3af4798f5d775e9c150efbda0084902937aa3a2cf200f9a4dfecb35
+size 660326480