Damien Benveniste committed
Commit: 0306c33
Parent: 70ea3e3
Files changed (1)
  1. app.py +8 -7
app.py CHANGED
@@ -13,13 +13,14 @@ app = FastAPI()
 engine = AsyncLLMEngine.from_engine_args(
     AsyncEngineArgs(
         model='microsoft/Phi-3-mini-4k-instruct',
-        dtype="half",
-        max_num_batched_tokens=512,  # Reduce from default
-        max_num_seqs=32,  # Reduce from default
-        gpu_memory_utilization=0.8,  # Adjust based on your GPU
-        max_model_len=4096,  # Adjust based on model requirements
-        quantization='awq',  # Enable quantization if supported
-        enforce_eager=True,
+        max_num_batched_tokens=512,  # Reduced for T4
+        max_num_seqs=16,  # Reduced for T4
+        gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
+        max_model_len=4096,  # Phi-3-mini-4k context length
+        quantization='awq',  # Enable quantization if supported by the model
+        enforce_eager=True,  # Disable CUDA graphs
+        max_num_layers=None,  # This allows vLLM to determine the optimal number of layers
+        dtype='half',  # Use half precision
     )
 )
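For context, here is a minimal sketch of how an engine configured along these lines might be exposed as a streaming completion endpoint in the same app.py. The /generate route, request shape, and sampling settings are assumptions, not part of this commit; the quantization='awq' and max_num_layers arguments are omitted because whether they are accepted depends on the checkpoint and the installed vLLM version.

import uuid

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

app = FastAPI()

# Engine configuration mirroring the commit, minus the checkpoint-dependent flags.
engine = AsyncLLMEngine.from_engine_args(
    AsyncEngineArgs(
        model='microsoft/Phi-3-mini-4k-instruct',
        max_num_batched_tokens=512,
        max_num_seqs=16,
        gpu_memory_utilization=0.85,
        max_model_len=4096,
        enforce_eager=True,
        dtype='half',
    )
)

@app.post("/generate")
async def generate(prompt: str):
    # Hypothetical endpoint: each request gets a unique id for the engine's scheduler.
    request_id = str(uuid.uuid4())
    sampling_params = SamplingParams(temperature=0.7, max_tokens=256)  # assumed settings

    async def stream():
        sent = 0
        # engine.generate is an async generator yielding RequestOutput objects
        # whose .outputs[0].text is the cumulative text generated so far.
        async for request_output in engine.generate(prompt, sampling_params, request_id):
            text = request_output.outputs[0].text
            yield text[sent:]  # emit only the newly generated delta
            sent = len(text)

    return StreamingResponse(stream(), media_type="text/plain")

With enforce_eager=True and the reduced batching limits above, this trades some throughput for a smaller memory footprint, which is consistent with the commit's "Reduced for T4" comments.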