fix(remove-params): Removing max_model_len
main.py CHANGED
@@ -12,30 +12,39 @@ app = FastAPI()

# Initialize the LLM engine
# Replace 'your-model-path' with the actual path or name of your model
+# example:
+# https://huggingface.co/spaces/damienbenveniste/deploy_vLLM/blob/b210a934d4ff7b68254d42fa28736d74649e610d/app.py#L17-L20

engine_llama_3_2: LLM = LLM(
    model='meta-llama/Llama-3.2-3B-Instruct',
    revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
+    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L1062-L1065
    max_num_batched_tokens=512, # Reduced for T4
    max_num_seqs=16, # Reduced for T4
    gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
+    tensor_parallel_size=2,
    # Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
    # 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
    # so that's basically 24k / .5k = 24 x 2 =~48 pages.
    # Because when we use maximum token length, it will be slower and the memory is not enough for T4.
-
+    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L85-L86
+    # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L98-L102
+    # max_model_len=32768,
    enforce_eager=True, # Disable CUDA graph
    dtype='auto', # Use 'half' if you want half precision
)

-
+# ValueError: max_num_batched_tokens (512) is smaller than max_model_len (32768).
+# This effectively limits the maximum sequence length to max_num_batched_tokens and makes vLLM reject longer sequences.
+# Please increase max_num_batched_tokens or decrease max_model_len.
engine_sailor_chat: LLM = LLM(
    model='sail/Sailor-4B-Chat',
    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
    max_num_batched_tokens=512, # Reduced for T4
    max_num_seqs=16, # Reduced for T4
    gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
-
+    tensor_parallel_size=2,
+    # max_model_len=32768,
    enforce_eager=True, # Disable CUDA graph
    dtype='auto', # Use 'half' if you want half precision
)
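
For orientation, a minimal sketch of how an engine configured this way is typically queried through vLLM's offline generate API; the prompt and sampling values below are illustrative placeholders, not code from this Space.

from vllm import SamplingParams

# engine_llama_3_2 is the LLM instance built in main.py above.
# Placeholder sampling settings; the Space's real values are not shown in this diff.
sampling = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=256)

# generate() accepts a list of prompts and returns one RequestOutput per prompt.
outputs = engine_llama_3_2.generate(["Summarize vLLM in one sentence."], sampling)
print(outputs[0].outputs[0].text)

With max_model_len left out, vLLM falls back to the context length declared in the model's own config, so the 32k reduction discussed in the comments now survives only as documentation.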