Damien Benveniste committed on
Commit
70ea3e3
·
1 Parent(s): 14549f3
Files changed (1) hide show
  1. app.py +6 -1
app.py CHANGED
@@ -14,7 +14,12 @@ engine = AsyncLLMEngine.from_engine_args(
14
  AsyncEngineArgs(
15
  model='microsoft/Phi-3-mini-4k-instruct',
16
  dtype="half",
17
- gpu_memory_utilization=0.99,
 
 
 
 
 
18
  )
19
  )
20
 
 
14
  AsyncEngineArgs(
15
  model='microsoft/Phi-3-mini-4k-instruct',
16
  dtype="half",
17
+ max_num_batched_tokens=512, # Reduce from default
18
+ max_num_seqs=32, # Reduce from default
19
+ gpu_memory_utilization=0.8, # Adjust based on your GPU
20
+ max_model_len=4096, # Adjust based on model requirements
21
+ quantization='awq', # Enable quantization if supported
22
+ enforce_eager=True,
23
  )
24
  )
25