Damien Benveniste committed on
Commit
a959d74
·
1 Parent(s): 96c4e4e
Files changed (1) hide show
  1. app.py +1 -1
app.py CHANGED
@@ -18,7 +18,7 @@ engine = AsyncLLMEngine.from_engine_args(
18
  gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
19
  max_model_len=4096, # Phi-3-mini-4k context length
20
  quantization='awq', # Enable quantization if supported by the model
21
- enforce_eager=True, # Disable CUDA graphs
22
  dtype='half', # Use half precision
23
  )
24
  )
 
18
  gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
19
  max_model_len=4096, # Phi-3-mini-4k context length
20
  quantization='awq', # Enable quantization if supported by the model
21
+ enforce_eager=True, # Disable CUDA graph
22
  dtype='half', # Use half precision
23
  )
24
  )