Damien Benveniste committed
Commit: 0306c33
Parent: 70ea3e3
Files changed (1)
  1. app.py +8 -7
app.py CHANGED
@@ -13,13 +13,14 @@ app = FastAPI()
 engine = AsyncLLMEngine.from_engine_args(
     AsyncEngineArgs(
         model='microsoft/Phi-3-mini-4k-instruct',
-        dtype="half",
-        max_num_batched_tokens=512,  # Reduce from default
-        max_num_seqs=32,  # Reduce from default
-        gpu_memory_utilization=0.8,  # Adjust based on your GPU
-        max_model_len=4096,  # Adjust based on model requirements
-        quantization='awq',  # Enable quantization if supported
-        enforce_eager=True,
+        max_num_batched_tokens=512,  # Reduced for T4
+        max_num_seqs=16,  # Reduced for T4
+        gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
+        max_model_len=4096,  # Phi-3-mini-4k context length
+        quantization='awq',  # Enable quantization if supported by the model
+        enforce_eager=True,  # Disable CUDA graphs
+        max_num_layers=None,  # This allows vLLM to determine the optimal number of layers
+        dtype='half',  # Use half precision
     )
 )
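For context, here is a minimal sketch of how an engine configured along these lines might be exposed as a streaming completion endpoint in the same app.py. The /generate route, request shape, and sampling settings are assumptions, not part of this commit; the quantization='awq' and max_num_layers arguments are omitted because whether they are accepted depends on the checkpoint and the installed vLLM version.

import uuid

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

app = FastAPI()

# Engine configuration mirroring the commit, minus the checkpoint-dependent flags.
engine = AsyncLLMEngine.from_engine_args(
    AsyncEngineArgs(
        model='microsoft/Phi-3-mini-4k-instruct',
        max_num_batched_tokens=512,
        max_num_seqs=16,
        gpu_memory_utilization=0.85,
        max_model_len=4096,
        enforce_eager=True,
        dtype='half',
    )
)

@app.post("/generate")
async def generate(prompt: str):
    # Hypothetical endpoint: each request gets a unique id for the engine's scheduler.
    request_id = str(uuid.uuid4())
    sampling_params = SamplingParams(temperature=0.7, max_tokens=256)  # assumed settings

    async def stream():
        sent = 0
        # engine.generate is an async generator yielding RequestOutput objects
        # whose .outputs[0].text is the cumulative text generated so far.
        async for request_output in engine.generate(prompt, sampling_params, request_id):
            text = request_output.outputs[0].text
            yield text[sent:]  # emit only the newly generated delta
            sent = len(text)

    return StreamingResponse(stream(), media_type="text/plain")

With enforce_eager=True and the reduced batching limits above, this trades some throughput for a smaller memory footprint, which is consistent with the commit's "Reduced for T4" comments.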