Spaces:
Running
Running
File size: 4,237 Bytes
ab5f5f1 76b423c ab5f5f1 76b423c ab5f5f1 76b423c a1f6c2e 76b423c a1f6c2e 76b423c ab5f5f1 76b423c ab5f5f1 76b423c 7ecfa5a 76b423c ab5f5f1 7ecfa5a 76b423c ab5f5f1 76b423c 7ecfa5a ab5f5f1 76b423c ab5f5f1 76b423c ab5f5f1 76b423c 5345cba 76b423c ab5f5f1 76b423c 0232cf1 ab5f5f1 76b423c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import os
import pandas as pd
from .utils import process_kernels, process_quantizations
# Maps raw dataframe column names (keys) to the display labels (values) used in
# the final table. processed_llm_perf_df selects exactly these keys, in this
# order, and then renames them, so the dict order is the output column order.
# NOTE(review): the symbol suffixes in several labels (and in the
# "Average ..." key) look like mis-encoded emoji in this copy of the file --
# verify against the real data headers before touching them; the key must
# match the dataframe column name exactly.
COLUMNS_MAPPING = {
"config.name": "Experiment π§ͺ",
"config.backend.model": "Model π€",
# primary measurements
"report.prefill.latency.p50": "Prefill (s)",
"report.per_token.latency.p50": "Per Token (s)",
"report.decode.throughput.value": "Decode (tokens/s)",
"report.decode.efficiency.value": "Energy (tokens/kWh)",
"report.decode.memory.max_allocated": "Memory (MB)",
# deployment settings
"config.backend.name": "Backend π",
"config.backend.torch_dtype": "Precision π₯",
"quantization": "Quantization ποΈ",
"attention": "Attention ποΈ",
"kernel": "Kernel βοΈ",
# additional information
"architecture": "Architecture ποΈ",
"prefill+decode": "End-to-End (s)",
"Average β¬οΈ": "Open LLM Score (%)",
"#Params (B)": "Params (B)",
}
# Display labels (i.e. post-rename names) the final table is sorted by, in
# priority order.
SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
# Subset names interpolated into the per-machine CSV file name
# "perf-df-{subset}-{machine}.csv" when loading raw results.
SUBSETS = ["unquantized", "awq", "bnb", "gptq"]
# Sort direction for each entry of SORTING_COLUMNS (same order, same length).
# NOTE(review): as written, decode throughput sorts ascending and prefill
# latency descending, which looks inverted if "best first" is the intent --
# confirm before changing.
SORTING_ASCENDING = [False, True, False]
def get_raw_llm_perf_df(machine: str = "1xA10"):
    """Load the raw benchmark results for one machine and join model metadata.

    One CSV per entry of ``SUBSETS`` is read from the
    ``optimum-benchmark/llm-perf-leaderboard`` dataset. Subsets that cannot be
    read are reported on stdout and skipped. The loaded subsets are
    concatenated and merged with ``llm-df.csv`` on the model name.

    Args:
        machine: Machine identifier used in the per-subset CSV file names.

    Returns:
        A DataFrame with the columns of ``llm-df.csv`` and of the performance
        CSVs, containing only rows whose ``Model`` matches a benchmarked
        ``config.backend.model`` (``pd.merge`` default inner join).

    Raises:
        ValueError: If none of the subsets could be loaded for ``machine``.
    """
    dfs = []
    for subset in SUBSETS:
        subset_url = (
            "hf://datasets/optimum-benchmark/llm-perf-leaderboard/"
            f"perf-df-{subset}-{machine}.csv"
        )
        try:
            dfs.append(pd.read_csv(subset_url))
        except Exception as error:
            # Deliberately best-effort: a missing subset must not prevent the
            # others from loading. The cause is printed because the failure
            # is not necessarily a missing file.
            print(
                f"Subset {subset} for machine {machine} not found ({error!r})"
            )

    if not dfs:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No performance data could be loaded for machine {machine!r} "
            f"(tried subsets: {', '.join(SUBSETS)})"
        )

    perf_df = pd.concat(dfs)
    llm_df = pd.read_csv(
        "hf://datasets/optimum-benchmark/llm-perf-leaderboard/llm-df.csv"
    )
    llm_perf_df = pd.merge(
        llm_df, perf_df, left_on="Model", right_on="config.backend.model"
    )
    return llm_perf_df
def processed_llm_perf_df(llm_perf_df):
    """Turn the raw merged benchmark frame into the sorted leaderboard table.

    The caller's DataFrame is not modified: rows are filtered and derived
    columns are added on a private copy.

    Args:
        llm_perf_df: Raw merged frame. It must contain the
            ``config.scenario.*``, ``config.backend.*``, ``report.*`` and
            ``Architecture`` columns read below, plus every key of
            ``COLUMNS_MAPPING`` that is not derived in this function.

    Returns:
        A new DataFrame restricted to the ``COLUMNS_MAPPING`` columns, renamed
        to their display labels and sorted by ``SORTING_COLUMNS`` using the
        directions in ``SORTING_ASCENDING``.

    Raises:
        AssertionError: If the benchmark scenario (batch size, sequence
            length, min/max new tokens) is not identical across all rows.
        KeyError: If a required column is missing.
    """
    # All rows must come from the same benchmark scenario, otherwise the
    # numbers in the table are not comparable.
    for scenario_column in (
        "config.scenario.input_shapes.batch_size",
        "config.scenario.input_shapes.sequence_length",
        "config.scenario.generate_kwargs.max_new_tokens",
        "config.scenario.generate_kwargs.min_new_tokens",
    ):
        assert llm_perf_df[scenario_column].nunique() == 1, (
            f"expected exactly one distinct value in {scenario_column!r}"
        )

    # Drop rows without a decode latency. The explicit copy keeps the column
    # assignments below from touching (or warning about) the caller's frame.
    frame = llm_perf_df.dropna(subset=["report.decode.latency.p50"]).copy()

    frame["config.name"] = frame["config.name"].str.replace(
        "flash_attention_2", "fa2", regex=False
    )
    # End-to-end latency is computed from the unrounded values; rounding
    # happens afterwards.
    frame["prefill+decode"] = (
        frame["report.prefill.latency.p50"]
        + frame["report.decode.latency.p50"]
    )
    # The architecture is taken as-is from the existing "Architecture" column.
    frame["architecture"] = frame["Architecture"]
    frame["attention"] = (
        frame["config.backend.attn_implementation"]
        .str.replace("flash_attention_2", "FAv2", regex=False)
        .str.replace("eager", "Eager", regex=False)
        .str.replace("sdpa", "SDPA", regex=False)
    )
    frame["quantization"] = frame.apply(process_quantizations, axis=1)
    frame["kernel"] = frame.apply(process_kernels, axis=1)

    # Round numerical columns to three decimals.
    frame = frame.round(
        {
            "report.prefill.latency.p50": 3,
            "report.decode.latency.p50": 3,
            "report.decode.throughput.value": 3,
            "report.decode.efficiency.value": 3,
            "report.decode.memory.max_allocated": 3,
            "Average β¬οΈ": 3,
            "prefill+decode": 3,
            "#Params (B)": 3,
        }
    )

    # Keep only the mapped columns (in mapping order), rename them to their
    # display labels, then sort. Sorting happens after the rename because
    # SORTING_COLUMNS holds display labels.
    return (
        frame[list(COLUMNS_MAPPING)]
        .rename(columns=COLUMNS_MAPPING)
        .sort_values(by=SORTING_COLUMNS, ascending=SORTING_ASCENDING)
    )
def get_llm_perf_df(machine: str = "1xA10"):
    """Return the leaderboard table for ``machine``, using a local CSV cache.

    If ``llm-perf-leaderboard-{machine}.csv`` exists in the current working
    directory it is read and returned. Otherwise the table is built from the
    raw data, written to that file (without the index) and the freshly built
    in-memory frame is returned.

    Args:
        machine: Machine identifier; part of the cache file name and passed on
            to the raw-data loader.

    Returns:
        The processed leaderboard DataFrame.
    """
    cache_file = f"llm-perf-leaderboard-{machine}.csv"

    if not os.path.exists(cache_file):
        # Cache miss: build the table, persist it, and hand back the
        # in-memory result rather than re-reading the file.
        fresh_df = processed_llm_perf_df(get_raw_llm_perf_df(machine))
        fresh_df.to_csv(cache_file, index=False)
        return fresh_df

    return pd.read_csv(cache_file)
|