vllm
#33 by zhiminy - opened

Files changed:

- app.py +2 -4
- backend-cli.py +1 -3
- src/backend/tasks/arena_hard/task.py +1 -1
- src/display/about.py +2 -2
- src/display/utils.py +33 -39
- src/leaderboard/read_evals.py +10 -11
app.py CHANGED

```diff
@@ -11,7 +11,6 @@ import time
 from apscheduler.schedulers.background import BackgroundScheduler
 
 from huggingface_hub import snapshot_download
-from pytz import utc
 
 from src.display.about import (
     CITATION_BUTTON_LABEL,
@@ -160,7 +159,6 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.inference_framework.name].isin(size_query)]
 
     # numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
     # params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -259,7 +257,7 @@ with demo:
                     for c in fields(AutoEvalColumn)
                     if c.displayed_by_default and not c.hidden and not c.never_hidden
                 ],
-                label="
+                label="Select columns to show",
                 elem_id="column-select",
                 interactive=True,
             )
@@ -479,7 +477,7 @@ with demo:
         show_copy_button=True,
     )
 
-scheduler = BackgroundScheduler(timezone=utc)
+scheduler = BackgroundScheduler()
 
 scheduler.add_job(restart_space, "interval", hours=6)
 
```
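A note on the last hunk: with `from pytz import utc` gone, `BackgroundScheduler()` falls back to its default (local) timezone, which is all the restart job needs. A minimal runnable sketch of the pattern, assuming only that `restart_space` is a no-argument callable as in the app:

```python
# Minimal sketch of the scheduling pattern above; the restart_space body here
# is a stand-in (the real one restarts the Hugging Face Space).
from apscheduler.schedulers.background import BackgroundScheduler

def restart_space():
    print("restarting space...")

scheduler = BackgroundScheduler()  # no timezone argument: local timezone is used
scheduler.add_job(restart_space, "interval", hours=6)  # fire every 6 hours
scheduler.start()  # jobs only run after start() is called
```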
backend-cli.py CHANGED

```diff
@@ -458,7 +458,6 @@ def get_args():
     parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
                         help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
     parser.add_argument("--debug_repo", action="store_true", help="Use debug repo")
-    parser.add_argument("--model_type", type=str, default="chat", help="Model type")
     return parser.parse_args()
 
 
@@ -489,8 +488,7 @@ if __name__ == "__main__":
             json_filepath="",
             precision=precision,  # Use precision from arguments
             inference_framework=args.inference_framework,  # Use inference framework from arguments
-            gpu_type=args.gpu_type
-            model_type=args.model_type,
+            gpu_type=args.gpu_type
         )
         curr_gpu_type = get_gpu_details()
         if eval_request.gpu_type != curr_gpu_type:
```
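Worth spelling out why both old lines had to go: `gpu_type=args.gpu_type` was missing its trailing comma, so the old call was a SyntaxError rather than just an unused field. A tiny reproduction, with a hypothetical stand-in for the real constructor:

```python
# Reproduction of the bug removed above: adjacent keyword arguments with no
# comma between them do not parse. EvalRequest here is a hypothetical stand-in.
def EvalRequest(**kwargs):
    return kwargs

# EvalRequest(gpu_type="NVIDIA-A100-PCIe-80GB"  model_type="chat")  # SyntaxError
EvalRequest(gpu_type="NVIDIA-A100-PCIe-80GB")  # the fixed call keeps gpu_type only
```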
src/backend/tasks/arena_hard/task.py CHANGED

```diff
@@ -72,7 +72,7 @@ class ArenaHard(ConfigurableTask):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
         # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
-        self.generation_kwargs = {"until": ["</s>", "<|im_end|>"], "
+        self.generation_kwargs = {"until": ["</s>", "<|im_end|>"], "max_length": 4096}
         # self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
         # self.generation_kwargs_sampling = {
         #     "temperature": 0.99,
```
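The `"until"` strings are stop sequences: the completion is cut at the first occurrence of any of them, while `"max_length"` caps generated length, now 4096 (the earlier commented-out kwargs used 512). A sketch of the assumed stop-sequence semantics, not code from this repo:

```python
# Sketch of "until" stop-sequence semantics as lm-eval-style harnesses apply
# them (an assumption, not this repo's code): truncate at the earliest stop.
def truncate_at_stops(text: str, until: list[str]) -> str:
    for stop in until:
        idx = text.find(stop)
        if idx != -1:
            text = text[:idx]  # keep only what precedes the stop string
    return text

print(truncate_at_stops("The answer is 42.</s> trailing junk", ["</s>", "<|im_end|>"]))
# -> "The answer is 42."
```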
src/display/about.py CHANGED

```diff
@@ -19,8 +19,8 @@ Columns and Metrics:
 - E2E(s): Average End to End generation time in seconds.
 - PRE(s): Prefilling Time of input prompt in seconds.
 - T/s: Tokens throughout per second.
--
--
+- MBU(%): Model Bandwidth Utilization.
+- MFU(%): Model FLOPs Utilization.
 - Precision: The precison of used model.
 
 """
```
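For readers new to the two added columns: MFU and MBU presumably follow the standard definitions, achieved FLOPs over peak FLOPs and achieved memory traffic over peak bandwidth. A back-of-envelope sketch with illustrative numbers for the default NVIDIA-A100-PCIe-80GB; this is not the repo's implementation:

```python
# Back-of-envelope MFU/MBU under the common definitions (an assumption, not
# this repo's code). A100-PCIe-80GB peaks: ~312 TFLOPS BF16, ~1935 GB/s HBM.
def mfu_percent(tokens_per_s: float, params_b: float, peak_tflops: float = 312.0) -> float:
    achieved_tflops = tokens_per_s * 2 * params_b * 1e9 / 1e12  # ~2 FLOPs/param/token
    return 100 * achieved_tflops / peak_tflops

def mbu_percent(tokens_per_s: float, weights_gb: float, peak_gb_s: float = 1935.0) -> float:
    return 100 * tokens_per_s * weights_gb / peak_gb_s  # weights re-read once per token

print(f"MFU: {mfu_percent(50, params_b=7):.2f}%")    # 7B model decoding at 50 tok/s
print(f"MBU: {mbu_percent(50, weights_gb=14):.1f}%") # fp16 weights of a 7B model ~= 14 GB
```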
src/display/utils.py CHANGED

```diff
@@ -18,8 +18,8 @@ GPU_Power = 'Power(W)'
 GPU_Mem = 'Mem(G)'
 GPU_Name = "GPU"
 GPU_Util = 'Util(%)'
-MFU = '
-MBU = '
+MFU = 'MFU(%)'
+MBU = 'MBU(%)'
 BATCH_SIZE = 'bs'
 PRECISION = "Precision"
 system_metrics_to_name_map = {
@@ -106,7 +106,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 # # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
 
 # Inference framework
-auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True
+auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True)])
 
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
@@ -126,15 +126,15 @@ for task in Tasks:
 
 
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False
-
-
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True
-
-
-
-
-
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 
@@ -160,10 +160,10 @@ class ModelDetails:
 
 
 class ModelType(Enum):
-
-
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
     chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
-
+    merges = ModelDetails(name="base merges and moerges", symbol="🤝")
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
@@ -171,24 +171,22 @@ class ModelType(Enum):
 
     @staticmethod
     def from_str(type):
-
-
-
-
+        if "fine-tuned" in type or "🔶" in type:
+            return ModelType.FT
+        if "pretrained" in type or "🟢" in type:
+            return ModelType.PT
         if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🦙", "⭕", "💬"]]):
             return ModelType.chat
-
-
+        if "merge" in type or "🤝" in type:
+            return ModelType.merges
         return ModelType.Unknown
 
 
 class InferenceFramework(Enum):
     # "moe-infinity", hf-chat
-
+    MoE_Infinity = ModelDetails("moe-infinity")
     HF_Chat = ModelDetails("hf-chat")
     VLLM = ModelDetails("vllm_moe")
-    TRTLLM = ModelDetails("tensorrt_llm")
-    VLLM_FIX = ModelDetails("vllm_moe_fixbs")
     Unknown = ModelDetails("?")
 
     def to_str(self):
@@ -196,16 +194,12 @@ class InferenceFramework(Enum):
 
     @staticmethod
     def from_str(inference_framework: str):
-
-
-        if inference_framework in ["tensorrt_llm"]:
-            return InferenceFramework.TRTLLM
+        if inference_framework in ["moe-infinity"]:
+            return InferenceFramework.MoE_Infinity
         if inference_framework in ["hf-chat"]:
             return InferenceFramework.HF_Chat
         if inference_framework in ["vllm_moe"]:
             return InferenceFramework.VLLM
-        if inference_framework in ["vllm_moe_fixbs"]:
-            return InferenceFramework.VLLM_FIX
         return InferenceFramework.Unknown
 
 class GPUType(Enum):
@@ -231,28 +225,28 @@ class WeightType(Enum):
 
 
 class Precision(Enum):
-
-
+    float32 = ModelDetails("float32")
+    float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     qt_8bit = ModelDetails("8bit")
     qt_4bit = ModelDetails("4bit")
-
+    qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")
 
     @staticmethod
     def from_str(precision: str):
-
-
-
-
+        if precision in ["torch.float32", "float32"]:
+            return Precision.float32
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
         if precision in ["8bit"]:
            return Precision.qt_8bit
         if precision in ["4bit"]:
            return Precision.qt_4bit
-
-
+        if precision in ["GPTQ", "None"]:
+            return Precision.qt_GPTQ
         return Precision.Unknown
 
 
```
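A quick smoke test for the parsers added above (hypothetical snippet; it assumes the enums import cleanly from the module):

```python
# Hypothetical smoke test for the new from_str branches.
from src.display.utils import InferenceFramework, ModelType, Precision

assert Precision.from_str("torch.float16") is Precision.float16
assert Precision.from_str("GPTQ") is Precision.qt_GPTQ  # "None" maps here too
assert InferenceFramework.from_str("moe-infinity") is InferenceFramework.MoE_Infinity
assert ModelType.from_str("pretrained") is ModelType.PT
assert ModelType.from_str("💬") is ModelType.chat
```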
src/leaderboard/read_evals.py CHANGED

```diff
@@ -140,7 +140,6 @@ class EvalResult:
             revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture,
-            model_type=ModelType.from_str(config.get("model_type", "")),
             inference_framework=inference_framework,
         )
 
@@ -175,22 +174,22 @@ class EvalResult:
 
         # breakpoint()
         # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-
+
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
-
+            AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-
-
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
-
-            #
-
-
-
-
+            AutoEvalColumn.revision.name: self.revision,
+            # AutoEvalColumn.average.name: average,
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
             AutoEvalColumn.inference_framework.name: self.inference_framework,
         }
 
```
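Net effect of the `to_dict` changes: each result row now carries the Hub metadata columns declared in src/display/utils.py. An illustrative row, with hypothetical values:

```python
# Illustrative shape of data_dict after this change (values are hypothetical);
# the keys are the AutoEvalColumn display names from src/display/utils.py.
row = {
    "eval_name": "my-org_my-model_bfloat16",      # save name, not a column
    "Precision": "bfloat16",
    "Type": "chat models (RLHF, DPO, IFT, ...)",  # added: model_type.value.name
    "Weight type": "Original",                    # added (hypothetical value)
    "Architecture": "MixtralForCausalLM",         # added (hypothetical value)
    "Model sha": "7121e4b1",                      # added: revision (hypothetical)
    "Hub License": "apache-2.0",                  # added (hypothetical)
    "Hub ❤️": 1234,                               # added: likes (hypothetical)
    "#Params (B)": 46.7,                          # added: num_params (hypothetical)
    "Available on the hub": True,                 # added: still_on_hub
}
```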