Commit 0ef012d by yusufs · 1 Parent(s): 586265c

fix(remove-params): Removing max_model_len

Files changed (1)
  1. main.py +12 -3
main.py CHANGED
@@ -12,30 +12,39 @@ app = FastAPI()
 
 # Initialize the LLM engine
 # Replace 'your-model-path' with the actual path or name of your model
+# example:
+# https://huggingface.co/spaces/damienbenveniste/deploy_vLLM/blob/b210a934d4ff7b68254d42fa28736d74649e610d/app.py#L17-L20
 
 engine_llama_3_2: LLM = LLM(
     model='meta-llama/Llama-3.2-3B-Instruct',
     revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
+    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L1062-L1065
     max_num_batched_tokens=512,  # Reduced for T4
     max_num_seqs=16,  # Reduced for T4
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
+    tensor_parallel_size=2,
     # Llama-3.2-3B-Instruct's max context length is 131072 tokens, but we reduce it to 32k.
     # 32k tokens is about 3/4 as many words (~24k), and an average page is about 500 (0.5k) words,
     # so that is roughly 24k / 0.5k = 48 pages.
     # Using the full context length would be slower, and the T4 does not have enough memory for it.
-    max_model_len=32768,
+    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L85-L86
+    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L98-L102
+    # max_model_len=32768,
     enforce_eager=True,  # Disable CUDA graph
     dtype='auto',  # Use 'half' if you want half precision
 )
 
-
+# ValueError: max_num_batched_tokens (512) is smaller than max_model_len (32768).
+# This effectively limits the maximum sequence length to max_num_batched_tokens and makes vLLM reject longer sequences.
+# Please increase max_num_batched_tokens or decrease max_model_len.
 engine_sailor_chat: LLM = LLM(
     model='sail/Sailor-4B-Chat',
     revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
     max_num_batched_tokens=512,  # Reduced for T4
     max_num_seqs=16,  # Reduced for T4
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
+    tensor_parallel_size=2,
+    # max_model_len=32768,
     enforce_eager=True,  # Disable CUDA graph
     dtype='auto',  # Use 'half' if you want half precision
 )
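
The new comments quote the error that motivated this change: vLLM rejects a configuration where max_num_batched_tokens is smaller than max_model_len (see the vllm/config.py links in the diff). The quoted message also names the alternative this commit did not take, raising max_num_batched_tokens instead of dropping max_model_len. Below is a minimal sketch of that alternative for the same Llama engine, not part of this commit; the 32768 batched-token budget is an illustrative assumption and may well not fit in T4 memory.

from vllm import LLM

# Hypothetical alternative, not part of this commit: keep the explicit 32k
# context window and raise max_num_batched_tokens so it is no longer smaller
# than max_model_len, which is what the quoted ValueError asks for.
engine_llama_3_2: LLM = LLM(
    model='meta-llama/Llama-3.2-3B-Instruct',
    revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
    max_model_len=32768,           # explicit 32k context window
    max_num_batched_tokens=32768,  # >= max_model_len, so the quoted check passes
    max_num_seqs=16,
    gpu_memory_utilization=0.85,
    enforce_eager=True,            # Disable CUDA graph
    dtype='auto',
)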