import os
import subprocess


def run_vllm_inference():
    # Set the necessary environment variable to speed up model downloads
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

    # vLLM serve command
    command = [
        "vllm", "serve", "Imran1/Qwen2.5-72B-Instruct-FP8",
        "--tensor-parallel-size", "4",
        "--dtype", "auto",
        "--api-key", "token-abc123",
        "--max-model-len", "2000",
        "--kv-cache-dtype", "auto",
    ]

    # Run the command as a subprocess; check=True raises if the server exits with an error
    subprocess.run(command, check=True)


if __name__ == "__main__":
    run_vllm_inference()
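
Once the server is up, it exposes an OpenAI-compatible API (vLLM defaults to port 8000). A minimal client sketch, assuming the default host/port and the token-abc123 key set above; the prompt text is only illustrative:

from openai import OpenAI

# Point the OpenAI client at the local vLLM server (default http://localhost:8000/v1 assumed)
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

response = client.chat.completions.create(
    model="Imran1/Qwen2.5-72B-Instruct-FP8",
    messages=[{"role": "user", "content": "Summarize FP8 quantization in one sentence."}],
    max_tokens=128,
)
print(response.choices[0].message.content)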