Damien Benveniste committed on
Commit
96c4e4e
·
1 Parent(s): 0306c33
Files changed (1) hide show
  1. app.py +0 -1
app.py CHANGED
@@ -19,7 +19,6 @@ engine = AsyncLLMEngine.from_engine_args(
19
  max_model_len=4096, # Phi-3-mini-4k context length
20
  quantization='awq', # Enable quantization if supported by the model
21
  enforce_eager=True, # Disable CUDA graphs
22
- max_num_layers=None, # This allows vLLM to determine the optimal number of layers
23
  dtype='half', # Use half precision
24
  )
25
  )
 
19
  max_model_len=4096, # Phi-3-mini-4k context length
20
  quantization='awq', # Enable quantization if supported by the model
21
  enforce_eager=True, # Disable CUDA graphs
 
22
  dtype='half', # Use half precision
23
  )
24
  )